def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    # print(length_dict)  # debug leftover
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd:
        with open(longest_output_file, "w") as longest_out_fd:
            for gene in accordance_dict:
                current_transcript = None
                current_length = 0
                for transcript in accordance_dict[gene]:
                    if length_dict[transcript] > current_length:
                        current_transcript = transcript
                        current_length = length_dict[transcript]
                    all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))
                longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
                longest_list.append(current_transcript)

    longest_list.write(longest_id_file)
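# Usage sketch for add_length_to_accordance_file (file names below are hypothetical). The accordance
# file is expected to hold gene-to-transcript pairs (gene_id<TAB>transcript_id, repeated gene ids
# allowed) and the length file transcript_id<TAB>length with integer lengths:
#
#   add_length_to_accordance_file("gene_to_transcript.accordance",
#                                 "transcript_length.tsv",
#                                 "gene_to_transcript")
#
# This writes <prefix>.all.correspondence, <prefix>.longest.correspondence and <prefix>.longest.ids.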
def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix=None):
    if output_prefix:
        length_file = "%s.protein_length.tsv" % output_prefix
        longest_protein_id_file = "%s.longest_pep.ids" % output_prefix
        len_fd = open(length_file, 'w')
        len_fd.write("#gene_id\tprotein_id\tprotein_length\n")

    data_dict = OrderedDict()

    for protein_id in protein_dict:
        length = len(protein_dict[protein_id].seq)
        description_list = protein_dict[protein_id].description.split()
        #print protein_dict[protein_id]
        #print ''
        #print description_list
        for entry in description_list:
            if "gene:" in entry:
                gene_id = entry.split(":")[1]
                if output_prefix:
                    len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
                # keep only the longest peptide per gene
                if gene_id not in data_dict:
                    data_dict[gene_id] = protein_id
                else:
                    if length > len(protein_dict[data_dict[gene_id]].seq):
                        data_dict[gene_id] = protein_id

    longest_pep_ids = IdList(data_dict.values())
    if output_prefix:
        longest_pep_ids.write(longest_protein_id_file)
        len_fd.close()
    return longest_pep_ids
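# Usage sketch for get_longest_pep_per_gene_from_ensembl_pep_dict: the function expects records whose
# description contains a "gene:<gene_id>" token, as in Ensembl peptide FASTA headers. The file name is
# hypothetical; Bio.SeqIO provides the record dict:
#
#   from Bio import SeqIO
#   pep_dict = SeqIO.index("species.pep.all.fa", "fasta")
#   longest_pep_ids = get_longest_pep_per_gene_from_ensembl_pep_dict(pep_dict, output_prefix="species")
#
# With output_prefix set, species.protein_length.tsv and species.longest_pep.ids are also written.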
def extract_counts_by_max_level(input_file, output_prefix, separator="\t", verbose=True):
    output_file = "%s.divided_by_maxlvl" % output_prefix
    zero_max_lvl_list = IdList()
    zero_max_lvl_list_file = "%s.zero_max_lvl.ids" % output_prefix
    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)
        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(map(float, tmp_line[1:]))
                max_level = max(data)
                if max_level == 0:
                    zero_max_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max level for %s...Skipping..." % tmp_line[0])
                    continue
                data /= max_level
                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                out_fd.write(output_string)

    zero_max_lvl_list.write(zero_max_lvl_list_file)
def create_per_cluster_element_id_files(self, cluster_dict, output_directory):
    self.safe_mkdir(output_directory)
    for cluster_id in cluster_dict:
        cluster_element_id_list = IdList(cluster_dict[cluster_id])
        cluster_element_id_list.write("%s/%s.ids" % (output_directory, cluster_id))
def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                          comments_prefix="#", column_number=None):
    id_list = IdList()
    id_list.read(input_file, column_separator=column_separator, comments_prefix=comments_prefix,
                 column_number=column_number, header=header)
    if output_file:
        id_list.write(output_file, header=header)
    return id_list
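# Usage sketch for extract_ids_from_file (hypothetical input file; column_number is assumed to be a
# zero-based column index): pull the first column of a tab-separated table, skip "#" comment lines,
# and both return the IdList and dump it to a file:
#
#   ids = extract_ids_from_file("blast_hits.tab", output_file="queries.ids", column_number=0)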
def prepare_annotation_file_from_transcript_and_cds(self, transcript_file, cds_file, correspondence_file,
                                                    output_prefix, format="fasta",
                                                    correspondence_key_column=0, correspondence_value_column=1,
                                                    verbose=False):
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

    correspondence_dict = SynDict(filename=correspondence_file, comments_prefix="#",
                                  key_index=correspondence_key_column, value_index=correspondence_value_column)

    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()

    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)

    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)
def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):
    try:
        os.mkdir(output_dir)
    except OSError:
        pass

    id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True, awk_code="{print $2}",
                      capture_output=True)

    split_index = 1
    ids_written = 0
    ids_list = IdList()
    #ids_list = read_ids(id_fd, close_after_if_file_object=False)
    ids_list.read(id_fd, close_after_if_file_object=True)
    number_of_ids = len(ids_list)
    out_prefix = self.split_filename(hmmfile)[1] if output_prefix is None else output_prefix

    num_of_ids = int(number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file

    common_options = " -f"
    common_options += " %s" % hmmfile
    options_list = []

    while (ids_written + num_of_ids) <= number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:ids_written + num_of_ids])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)

        split_index += 1
        ids_written += num_of_ids

    if ids_written != number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)
        split_index += 1

    #print options_list
    self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
def divide_counts_by_base_level(input_file, output_prefix, base_level, separator="\t",
                                verbose=True, secondary_base_lvl=None):
    output_file = "%s.divided_by_baselvl" % output_prefix
    zero_base_lvl_list = IdList()
    zero_both_base_lvls_list = IdList()
    zero_base_lvl_list_file = "%s.zero_base_lvl.ids" % output_prefix
    zero_both_base_lvls_list_file = "%s.zero_base_lvls.ids" % output_prefix
    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)
        # data lines are split as [id, value1, value2, ...], so indices derived from the header
        # are shifted by one relative to the data array
        data_base_level_index = header_list.index(base_level) - 1
        if secondary_base_lvl:
            data_secondary_base_level_index = header_list.index(secondary_base_lvl) - 1
        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(map(float, tmp_line[1:]))
                if data[data_base_level_index] == 0:
                    zero_base_lvl_list.append(tmp_line[0])
                    if not secondary_base_lvl:
                        if verbose:
                            print("Zero base level(%s) for %s...Skipping..." % (base_level, tmp_line[0]))
                        continue
                if secondary_base_lvl:
                    if data[data_secondary_base_level_index] == 0:
                        zero_both_base_lvls_list.append(tmp_line[0])
                        if verbose:
                            print("Both base levels are zero (%s, %s) for %s...Skipping..."
                                  % (base_level, secondary_base_lvl, tmp_line[0]))
                        continue
                    data /= data[data_base_level_index] if data[data_base_level_index] != 0 \
                        else data[data_secondary_base_level_index]
                else:
                    data /= data[data_base_level_index]
                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                out_fd.write(output_string)

    zero_base_lvl_list.write(zero_base_lvl_list_file)
    zero_both_base_lvls_list.write(zero_both_base_lvls_list_file)
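# Illustrative input layout assumed by the divide_counts_* functions above and below (hypothetical
# values). The header names an id column followed by per-stage columns; since data rows are split as
# [id, value1, value2, ...], header indices are shifted by -1 when addressing the value array:
#
#   gene_id<TAB>stage1<TAB>stage2<TAB>stage3
#   gene0001<TAB>10.0<TAB>25.0<TAB>5.0
#
#   divide_counts_by_base_level("counts.tsv", "counts", "stage1")   # divides every row by its stage1 value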
def divide_counts_by_several_base_level(input_file, output_prefix, base_levels, separator="\t",
                                        verbose=True, max_ratio_to_base_lvl=0.5):
    output_file = "%s.divided_by_max_baselvl" % output_prefix
    max_ratio_to_base_lvl_file = "%s.divided_by_max_baselvl.max_%f_ratio" % (output_prefix, max_ratio_to_base_lvl)
    zero_max_base_lvl_list = IdList()
    zero_max_base_lvl_list_file = "%s.zero_base_lvls.ids" % output_prefix

    max_ratio_to_base_lvl_fd = open(max_ratio_to_base_lvl_file, "w")
    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)

        data_base_lvl_index_list = []
        base_level_list = [base_levels] if isinstance(base_levels, str) else base_levels
        for level in base_level_list:
            data_base_lvl_index_list.append(header_list.index(level) - 1)

        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            max_ratio_to_base_lvl_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(map(float, tmp_line[1:]))
                max_base_lvl = max(np.take(data, data_base_lvl_index_list))
                if max_base_lvl == 0:
                    zero_max_base_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max base level(s) for %s...Skipping..." % tmp_line[0])
                    continue
                data /= max_base_lvl
                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                if max(np.delete(data, data_base_lvl_index_list)) <= max_ratio_to_base_lvl:
                    max_ratio_to_base_lvl_fd.write(output_string)
                out_fd.write(output_string)

    zero_max_base_lvl_list.write(zero_max_base_lvl_list_file)
    max_ratio_to_base_lvl_fd.close()
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None,
                                           species_id=None):
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = \
                        [transposed_eggnog_fam_dict[extended_protein_id][0]]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(
                        transposed_eggnog_fam_dict[extended_protein_id][0])
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                    extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = \
                        eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]
        if not_found:
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
        common_protein_names_to_families_dict.write(
            filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(
            filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
            splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    #print common_names_to_eggnog_proteins_syn_dict
    #print common_protein_names_to_families_dict

    return extracted_families, common_protein_names_to_families_dict, \
        common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index, id_separator="_",
                                          output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()

    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # the selected fragment of the sequence id is used as the cluster key
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict
def extract_top_hits(hmmer_hits, top_hits_file, top_hits_ids_file=None, not_significant_ids_file=None,
                     not_found_ids_file=None):
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()

    index_file = "hmmer_hits.tmp.idx"
    hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")

    out_fd = open(top_hits_file, "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue, hmm_dict[query][0].bitscore))
                top_hits_ids.append(query)
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    out_fd.close()

    os.remove(index_file)

    if not_significant_ids_file:
        not_significant_ids.write(not_significant_ids_file)
    if not_found_ids_file:
        not_found_ids.write(not_found_ids_file)
    if top_hits_ids_file:
        top_hits_ids.write(top_hits_ids_file)
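# Usage sketch for extract_top_hits (file names are hypothetical): parse plain-text hmmscan/hmmsearch
# output and keep only the best hit per query, splitting queries into significant, not significant and
# no-hit sets:
#
#   extract_top_hits("hmmsearch.out", "top_hits.tab",
#                    top_hits_ids_file="significant.ids",
#                    not_significant_ids_file="not_significant.ids",
#                    not_found_ids_file="no_hits.ids")
#
# Note that a temporary Biopython index ("hmmer_hits.tmp.idx") is created in the working directory and
# removed at the end.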
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

annotations_dict = SeqIO.to_dict(GFF.parse(open(args.input)))

single_gene_id_list = IdList()

for record in annotations_dict:
    for feature in annotations_dict[record].features:
        #print feature.id
        if feature.type != "gene":
            continue
        for subfeature in feature.sub_features:
            if subfeature.type != "mRNA":
                continue
            exon_number = 0
            for mRNA_subfeature in subfeature.sub_features:
                if mRNA_subfeature.type == "exon":
                    exon_number += 1
            if exon_number == 1:
                single_gene_id_list.append(feature.id)

single_gene_id_list.write(out_fd, close_after_if_file_object=True)

"""
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(record_by_id_generator(sequence_dict, sequence_groups_id[group]),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
"""
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
            sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set

complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            tmp[i] = tmp[i][2]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)

complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")

assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly)
species_syn_dict.write("correctly_assembled_families_in_all_species.t", absent_symbol=".")
assemled_to_different_families.write("assembled_to_different_families_in_all_species.t", absent_symbol=".")

correctly_assembled_families_synonym = IdList(set(species_syn_dict.all_values()))
assemled_to_different_families_synonym = IdList(set(assemled_to_different_families.all_values()))

correctly_assembled_families_synonym.write("correctly_assembled_families_syn_in_all_species.ids")
assemled_to_different_families_synonym.write("assembled_to_different_families_syn_in_all_species.ids")

if args.output != "output":
    out_fd.close()
def check_gvcf_integrity(self, gvcf_file, output_prefix, reference=None, length_dict=None, parsing_mode="parse"):
    len_dict = length_dict if length_dict else self.get_lengths(record_dict=self.parse_seq_file(reference,
                                                                                                mode=parsing_mode),
                                                                out_file=None,
                                                                close_after_if_file_object=False)
    scaffold_dict = OrderedDict()

    with self.metaopen(gvcf_file, "r") as gvcf_fd:
        prev_scaffold = ""
        for line in gvcf_fd:
            #print line
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            scaffold = line_list[0]
            start = int(line_list[1])
            format = line_list[7].split(";")
            if (len(format) == 1) and (format[0][0:3] == "END"):
                end = int(format[0].split("=")[1])
            else:
                end = start + len(line_list[3]) - 1
            #print line_list
            #print scaffold, start, end, format
            if scaffold not in scaffold_dict:
                scaffold_dict[scaffold] = []
            if scaffold != prev_scaffold:
                scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])
            else:
                #print scaffold_dict[scaffold][-1][1]
                if scaffold_dict[scaffold][-1][1] + 1 >= start:
                    scaffold_dict[scaffold][-1][1] = deepcopy(max(end, scaffold_dict[scaffold][-1][1]))
                else:
                    print(scaffold_dict[scaffold])
                    print(line)
                    scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])
            prev_scaffold = scaffold

    complete_scaffolds = IdList()
    fragmented_scaffolds = IdList()
    scaffolds_with_absent_fragments = IdList()

    with open("%s.scaffold_regions" % output_prefix, "w") as scaf_reg_fd:
        for scaffold in scaffold_dict:
            if len(scaffold_dict[scaffold]) > 1:
                fragmented_scaffolds.append(scaffold)
            scaffold_length = sum(map(lambda s: s[1] - s[0] + 1, scaffold_dict[scaffold]))
            if scaffold_length != len_dict[scaffold]:
                scaffolds_with_absent_fragments.append(scaffold)
            else:
                complete_scaffolds.append(scaffold)
            scaf_reg_fd.write("%s\t%s\n" % (scaffold,
                                            ",".join(map(lambda s: "-".join(map(str, s)),
                                                         scaffold_dict[scaffold]))))

    complete_scaffolds.write("%s.complete_scaffolds" % output_prefix)
    fragmented_scaffolds.write("%s.fragmented_scaffolds" % output_prefix)
    scaffolds_with_absent_fragments.write("%s.scaffolds_with_absent_fragments" % output_prefix)
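# The integrity check above relies on the gVCF convention that non-variant stretches are emitted as
# reference blocks whose INFO field is a single END=<pos> entry, e.g. (illustrative line, trailing
# columns abbreviated):
#
#   scaffold_1  10001  .  A  <NON_REF>  .  .  END=10468  ...
#
# Adjacent or overlapping records are merged per scaffold, so a scaffold whose single merged region
# covers its full reference length is reported as complete.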
import sys
import argparse

from CustomCollections.GeneralCollections import IdList

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--fam_file", action="store", dest="fam_file", required=True,
                    help="File with families")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write ids")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file, close_after_if_file_object=True, column_number=1, id_in_column_separator=",")
id_list.write(out_fd, close_after_if_file_object=True)
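# Example invocation of the script above (the script name is hypothetical, and column_number=1 is
# assumed to be a zero-based index, i.e. the second, comma-separated column of the family file):
#
#   python extract_ids_from_fam_file.py -i families.fam -o family_members.ids
#
# With the default -o the ids are printed to stdout.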
def parallel_run(self, input_dir, output_dir, output_prefix, input_type="codon",
                 min_seq_number_for_conserved_position=None, min_seq_number_for_flank_position=None,
                 max_pos_number_for_noncons_contig_pos=None, min_block_len=None, allow_gaps="half",
                 save_postscript=True, output_type="htm", threads=None):
    if threads:
        self.threads = threads

    data_dir = "%s/data/" % output_dir
    postscript_dir = "%s/ps/" % output_dir
    results_dir = "%s/results/" % output_dir
    htm_dir = "%s/htm/" % output_dir
    for directory in output_dir, data_dir, postscript_dir, results_dir, htm_dir:
        self.safe_mkdir(directory)

    #input_files_list = map(os.path.abspath, self.make_list_of_path_to_files(input_directory))
    input_files_list = self.make_list_of_path_to_files(input_dir, return_absolute_paths=True)

    for entry in input_files_list:
        directory, prefix, extension = self.split_filename(entry)
        os.system("ln -s %s %s/%s%s" % (entry, data_dir, prefix, extension))

    data_files_list = self.make_list_of_path_to_files(data_dir, return_absolute_paths=True)

    common_options = self.parse_options(input_type=input_type,
                                        min_seq_number_for_conserved_position=min_seq_number_for_conserved_position,
                                        min_seq_number_for_flank_position=min_seq_number_for_flank_position,
                                        max_pos_number_for_noncons_contig_pos=max_pos_number_for_noncons_contig_pos,
                                        min_block_len=min_block_len,
                                        allow_gaps=allow_gaps,
                                        save_postscript=save_postscript,
                                        output_type=output_type,
                                        concatenate_blocks_from_aignments=None)

    options_list = []
    for data_file in data_files_list:
        options = " %s" % data_file
        options += " %s" % common_options
        options_list.append(options)

    self.parallel_execute(options_list=options_list)

    block_coordinates = OrderedDict()
    skipped_ids_file = "%s/%s.skipped.ids" % (output_dir, output_prefix)
    skipped_ids = IdList()
    for filename in data_files_list:
        data_dir, prefix, extension = self.split_filename(filename)
        blocks_file = "%s-gb" % filename
        htm_file = "%s-gb.htm" % filename
        postscript_file = "%s-gbPS" % filename
        if (not os.path.exists(blocks_file)) or (not os.path.exists(htm_file)):
            skipped_ids.append(prefix)
            print("Warning!!! %s skipped..." % prefix)
            continue
        block_coordinates[prefix] = self.extract_block_coordinates(htm_file)

        os.system("mv %s %s/%s.ps" % (postscript_file, postscript_dir, prefix))
        os.system("mv %s %s/%s.htm" % (htm_file, htm_dir, prefix))
        self.convert_output_to_fasta(blocks_file, "%s/%s%s" % (results_dir, prefix, extension))
        os.remove(blocks_file)

    block_coordinates_file = "%s/%s.block.coordinates" % (output_dir, output_prefix)
    skipped_ids.write(skipped_ids_file)
    with open(block_coordinates_file, "w") as block_fd:
        for entry in block_coordinates:
            coordinates_string = ";".join(map(lambda s: "%i,%i" % (s[0], s[1]), block_coordinates[entry]))
            block_fd.write("%s\t%s\n" % (entry, coordinates_string))
if args.all or args.tree:
    os.system("wget %s" % tree_options)
if args.all or args.hmm:
    os.system("wget %s" % hmm_options)

pool = Pool(args.threads)
pool.map(download_data, family_ids)
pool.close()

for fam_id in family_ids:
    if args.all or args.alignment:
        if os.path.getsize("%s%s.fasta" % (args.output_dir, fam_id)) == 0:
            absent_alignment_list.append(fam_id)
    if args.all or args.tree:
        if os.path.getsize("%s%s.nwk" % (args.output_dir, fam_id)) == 0:
            absent_tree_list.append(fam_id)
    if args.all or args.hmm:
        if os.path.getsize("%s%s.hmm" % (args.output_dir, fam_id)) == 0:
            absent_hmm_list.append(fam_id)

print(absent_alignment_list)

if absent_alignment_list:
    absent_alignment_list.write("absent_alignments.ids")
    print("%i alignments were not downloaded" % len(absent_alignment_list))

if absent_tree_list:
    absent_tree_list.write("absent_trees.ids")
    print("%i trees were not downloaded" % len(absent_tree_list))

if absent_hmm_list:
    absent_hmm_list.write("absent_hmms.ids")
    print("%i hmms were not downloaded" % len(absent_hmm_list))
def get_cds_for_proteins(self, protein_id_list, output_prefix, download_chunk_size=100, temp_dir_prefix="temp"):
    from Tools.Abstract import Tool

    transcript_temp_dir = "%s_transcripts" % temp_dir_prefix
    protein_temp_dir = "%s_proteins" % temp_dir_prefix
    number_of_ids = len(protein_id_list)
    print("Total %i ids" % number_of_ids)

    for directory in transcript_temp_dir, protein_temp_dir:
        self.safe_mkdir(directory)

    pep_file = "%s.pep.genbank" % output_prefix
    transcript_file = "%s.transcript.genbank" % output_prefix

    ranges = np.append(np.arange(0, number_of_ids, download_chunk_size), [number_of_ids])

    print("Downloading proteins...")
    for i in range(0, len(ranges) - 1):
        print("Downloading chunk %i" % i)
        pep_tmp_file = "%s/%s_%i" % (protein_temp_dir, pep_file, i)
        self.efetch("protein", protein_id_list[ranges[i]:ranges[i+1]], pep_tmp_file, rettype="gb", retmode="text")
    os.system("cat %s/* > %s" % (protein_temp_dir, pep_file))

    peptide_dict = SeqIO.index_db("tmp.idx", pep_file, format="genbank")
    downloaded_protein_ids = IdList(peptide_dict.keys())

    print("%i proteins were downloaded" % len(downloaded_protein_ids))
    not_downloaded_proteins_ids = Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="only_a")
    print("%i proteins were not downloaded" % len(not_downloaded_proteins_ids))
    not_downloaded_proteins_ids.write("%s.not_downloaded.ids" % output_prefix)
    downloaded_protein_ids.write("%s.downloaded.ids" % output_prefix)
    print(Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="count"))

    pep_without_transcripts = IdList()
    pep_with_several_CDS_features = IdList()
    pep_to_transcript_accordance = SynDict()
    transcript_ids = IdList()

    print("Extracting transcript ids corresponding to proteins...")
    for pep_id in peptide_dict:
        for feature in peptide_dict[pep_id].features:
            if feature.type == "CDS":
                try:
                    transcript_id = feature.qualifiers["coded_by"][0].split(":")[0]
                    if pep_id not in pep_to_transcript_accordance:
                        pep_to_transcript_accordance[pep_id] = [transcript_id]
                    else:
                        pep_to_transcript_accordance[pep_id].append(transcript_id)
                        print("Genbank record for %s contains several CDS features" % pep_id)
                        pep_with_several_CDS_features.append(pep_id)
                    if transcript_id in transcript_ids:
                        print("Repeated transcript id: %s" % transcript_id)
                        continue
                    transcript_ids.append(transcript_id)
                except KeyError:
                    print("Transcript id for %s was not found" % pep_id)
                    pep_without_transcripts.append(pep_id)

    pep_with_several_CDS_features.write("%s.pep_with_several_CDS.ids" % output_prefix)
    pep_without_transcripts.write("%s.pep_without_transcripts.ids" % output_prefix)
    transcript_ids.write("%s.transcripts.ids" % output_prefix)
    number_of_transcripts = len(transcript_ids)
    print("%i transcripts were found" % number_of_transcripts)

    pep_to_transcript_accordance.write("%s.pep_to_transcript.accordance" % output_prefix, splited_values=True)

    transcript_ranges = np.append(np.arange(0, number_of_transcripts, download_chunk_size),
                                  [number_of_transcripts])

    print("Downloading transcripts...")
    for i in range(0, len(transcript_ranges) - 1):
        print("Downloading chunk %i" % i)
        transcript_tmp_file = "%s/%s_%i" % (transcript_temp_dir, transcript_file, i)
        self.efetch("nuccore", transcript_ids[transcript_ranges[i]:transcript_ranges[i+1]], transcript_tmp_file,
                    rettype="gb", retmode="text")
    os.system("cat %s/* > %s" % (transcript_temp_dir, transcript_file))

    transcript_dict = SeqIO.index_db("tmp_1.idx", transcript_file, format="genbank")

    cds_records_list = []
    for transcript_id in transcript_dict:
        for feature in transcript_dict[transcript_id].features:
            if feature.type == "CDS":
                #print feature
                feature_seq = feature.extract(transcript_dict[transcript_id].seq)
                feature_id = transcript_id
                # the case of several CDS features per transcript is not handled here
                if "protein_id" in feature.qualifiers:
                    description = "protein=%s" % feature.qualifiers["protein_id"][0]
                else:
                    description = ""
                    print("Corresponding protein id was not found for %s" % transcript_id)
                cds_records_list.append(SeqRecord(seq=feature_seq, id=feature_id, description=description))

    SeqIO.write(cds_records_list, "%s.cds" % output_prefix, format="fasta")

    stat_string = "Input protein ids\t%i\n" % number_of_ids
    stat_string += "Downloaded proteins\t%i\n" % len(downloaded_protein_ids)
    stat_string += "Downloaded transcripts\t%i\n" % len(transcript_dict)

    print(stat_string)
    with open("%s.stats" % output_prefix, "w") as stat_fd:
        stat_fd.write(stat_string)

    for filename in "tmp.idx", "tmp_1.idx":
        os.remove(filename)