def get_output_genome_directory_list(option_map, config_map, genome_directory_list):
    """Create the per-genome output directories and return their paths.

    A parent directory named ``sigma_alignments_output`` is created under the
    working directory when it does not exist yet.  One child directory is
    then created per entry of *genome_directory_list*, named after the base
    name of that entry.

    Args:
        option_map: Mapping that holds the working directory under the
            ``working_directory_str`` key.  The value is concatenated with
            the parent directory name as-is, so it presumably already ends
            with a path separator -- TODO confirm against the caller.
        config_map: Not read by this function; kept so that existing callers
            do not have to change.
        genome_directory_list: Iterable of genome directory paths.

    Returns:
        List of child output directory paths, in the same order as
        *genome_directory_list*.

    Raises:
        OSError: If one of the directories cannot be created.
    """
    working_directory = option_map[working_directory_str]
    parent_genome_directory = working_directory + "sigma_alignments_output"

    # os.mkdir instead of spawning an external `mkdir`: no child process, no
    # unread stdout pipe, and no dependency on a `mkdir` binary being on PATH.
    if not os.path.isdir(parent_genome_directory):
        os.mkdir(parent_genome_directory)

    output_genome_directory_list = []
    for genome_directory in genome_directory_list:
        genome_name = os.path.basename(genome_directory)
        # NOTE(review): utils.set_directory_path is assumed to return the
        # directory in a form a name can be appended to directly -- confirm.
        child_genome_directory = utils.set_directory_path(parent_genome_directory) + genome_name
        output_genome_directory_list.append(child_genome_directory)

        if not os.path.isdir(child_genome_directory):
            os.mkdir(child_genome_directory)

    return output_genome_directory_list
def get_genome_index_base(genome_directory_list):
    """Return the bowtie2 index base path of each genome directory that has index files.

    The index base of a genome directory is the directory path (passed
    through ``utils.set_directory_path``) followed by the directory's own
    base name.  A directory contributes its index base only when at least one
    of its entries is accepted by ``utils.check_bowtie_index_format``.

    Args:
        genome_directory_list: Iterable of genome directory paths.

    Returns:
        List of index base paths, in input order.  When no directory
        qualifies, an error is written to stderr and ``utils.die`` is called
        (presumably terminating the program -- confirm in utils).
    """
    index_bases = []

    for directory in genome_directory_list:
        directory_name = os.path.basename(directory)
        candidate_base = utils.set_directory_path(directory) + directory_name

        # Count every entry that looks like a bowtie index file.
        index_file_total = sum(
            1
            for entry in os.listdir(directory)
            if utils.check_bowtie_index_format(entry)
        )
        if index_file_total > 0:
            index_bases.append(candidate_base)

    # Nothing usable was found in any of the genome directories.
    if not index_bases:
        sys.stderr.write("\n** Cannot open bowtie2 index files. Check the Reference_Genome_Directory in config file.\n")
        utils.die("** Program exit!")

    return index_bases
def get_bamlog_path(config_path, genome_output_directory):
    """Return the path of the alignment log for one genome output directory.

    The log file sits inside *genome_output_directory* and is named
    ``<base name of the directory>.align.log``.

    Args:
        config_path: Not used to build the path; kept so that existing
            callers do not have to change.
        genome_output_directory: Genome output directory.  Its base name is
            taken with ``os.path.basename``, so a trailing separator would
            yield an empty genome name.

    Returns:
        The log file path as a string.
    """
    genome_name = os.path.basename(genome_output_directory)
    # NOTE(review): utils.set_directory_path is assumed to return the
    # directory in a form a file name can be appended to directly -- confirm.
    return utils.set_directory_path(genome_output_directory) + genome_name + ".align.log"
def get_filtered_bamout_path(config_path, genome_output_directory):
    """Return the path of the filtered BAM file for one genome output directory.

    The file sits inside *genome_output_directory* and is named
    ``<base name of the directory>.filtered.bam``.

    Args:
        config_path: Not used to build the path; kept so that existing
            callers do not have to change.
        genome_output_directory: Genome output directory.  Its base name is
            taken with ``os.path.basename``, so a trailing separator would
            yield an empty genome name.

    Returns:
        The filtered BAM file path as a string.
    """
    genome_name = os.path.basename(genome_output_directory)
    # NOTE(review): utils.set_directory_path is assumed to return the
    # directory in a form a file name can be appended to directly -- confirm.
    return utils.set_directory_path(genome_output_directory) + genome_name + ".filtered.bam"
def search_genome_fasta_path(config_map):
    """Locate the genome directories and the fasta files they contain.

    Every sub-directory of the configured reference genome directory is
    treated as one genome.  Inside each of them, the entries accepted by
    ``utils.check_fasta_format`` are collected.

    Args:
        config_map: Mapping that holds the reference genome directory under
            ``ConfigString.reference_genome_directory_str``.

    Returns:
        A ``(genome_directory_list, genome_fasta_path_list)`` tuple.  The two
        lists run in parallel: ``genome_fasta_path_list[i]`` is the list of
        fasta paths found in ``genome_directory_list[i]``.  When there is no
        genome directory, or no fasta file at all, an error is written to
        stderr and ``utils.die`` is called (presumably terminating the
        program -- confirm in utils).
    """
    reference_root = utils.set_directory_path(config_map[ConfigString.reference_genome_directory_str])

    # Only sub-directories count as genomes; plain files are skipped.
    genome_names = [
        entry
        for entry in os.listdir(reference_root)
        if os.path.isdir(os.path.join(reference_root, entry))
    ]

    genome_directory_list = []
    genome_fasta_path_list = []
    for genome_name in genome_names:
        genome_directory = reference_root + genome_name
        fasta_paths = [
            utils.set_directory_path(genome_directory) + filename
            for filename in os.listdir(genome_directory)
            if utils.check_fasta_format(filename)
        ]
        genome_directory_list.append(genome_directory)
        genome_fasta_path_list.append(fasta_paths)

    fasta_total = sum(len(fasta_paths) for fasta_paths in genome_fasta_path_list)

    # No genome (organism) directory was found.
    if not genome_directory_list:
        sys.stderr.write("\n** Cannot search genome directory. Check the reference genome directory.\n")
        utils.die("** Program exit!")

    # Genome directories exist, but none of them holds a fasta file.
    if fasta_total == 0:
        sys.stderr.write("\n** Cannot find genome fasta file. Check the reference genome directory.\n")
        utils.die("** Program exit!")

    return (genome_directory_list, genome_fasta_path_list)
def get_output_genome_directory_list_wo(option_map, config_map):
    """Return the existing genome output directories that contain a BAM file.

    The parent output directory is the working directory followed by the base
    name of the configured reference genome directory.  Nothing is created
    here: the parent must already exist.  Each of its sub-directories is
    included in the result when at least one of its entries is accepted by
    ``utils.check_bam_format``.

    Args:
        option_map: Mapping that holds the working directory under the
            ``working_directory_str`` key.  The value is concatenated with
            the parent directory name as-is, so it presumably already ends
            with a path separator -- TODO confirm against the caller.
        config_map: Mapping that holds the reference genome directory under
            ``ConfigString.reference_genome_directory_str``.

    Returns:
        List of genome output directory paths.  Each directory appears at
        most once, no matter how many BAM files it holds.  When the parent
        directory does not exist, an error is written to stderr and
        ``utils.die`` is called (presumably terminating the program --
        confirm in utils).
    """
    working_directory = option_map[working_directory_str]
    reference_genome_directory = utils.set_directory_path(config_map[ConfigString.reference_genome_directory_str])

    # normpath drops any trailing separator so basename yields the last
    # path component instead of an empty string.
    parent_genome_directory_name = os.path.basename(os.path.normpath(reference_genome_directory))
    parent_genome_directory = working_directory + parent_genome_directory_name

    # The parent directory is expected to exist already.
    if not os.path.isdir(parent_genome_directory):
        sys.stderr.write("\n** Cannot open output directory. Check the Reference_Genome_Directory in config file exists in the working directory.\n")
        utils.die("** Program exit!")

    output_genome_directory_list = []
    for genome_name in os.listdir(parent_genome_directory):
        if not os.path.isdir(os.path.join(parent_genome_directory, genome_name)):
            continue

        child_genome_directory = utils.set_directory_path(parent_genome_directory) + genome_name

        # any() records the directory once even when it holds several BAM
        # files; appending per file would duplicate it in the result.
        if any(utils.check_bam_format(filename) for filename in os.listdir(child_genome_directory)):
            output_genome_directory_list.append(child_genome_directory)

    return output_genome_directory_list
def get_output_diff_sequence_path_sublist(output_genome_directory, genome_fasta_path_sublist):
    """Map each genome fasta path to its ``.SNPs.txt`` output path.

    For every fasta path, the file name is stripped of its directory and of
    its last extension, ``.SNPs.txt`` is appended, and the result is placed
    inside *output_genome_directory*.

    Args:
        output_genome_directory: Directory that receives the output files.
        genome_fasta_path_sublist: Iterable of fasta file paths belonging to
            one genome.

    Returns:
        List of output paths, in the same order as the input paths.
    """

    def _snp_report_path(fasta_path):
        # "<dir>/chr1.fasta" -> "chr1" -> "chr1.SNPs.txt"
        stem = os.path.splitext(os.path.basename(fasta_path))[0]
        report_name = stem + ".SNPs.txt"
        return utils.set_directory_path(output_genome_directory) + report_name

    return [_snp_report_path(fasta_path) for fasta_path in genome_fasta_path_sublist]