def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """
    Build a transcript_id -> set(protein_id) accordance table from a GTF file.

    Tested on gtf files from Ensembl release 70.

    :param gtf_file: path to the input GTF file
    :param output_file: path the accordance table is written to (one transcript
        per line, protein ids as split values)
    :param comment_symbol: prefix marking comment lines to skip
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            # BUGFIX: skip blank lines explicitly - `line[0]` raised IndexError on them
            if (not line.strip()) or line.startswith(comment_symbol):
                continue
            # attributes live in the last tab-separated field, ";"-separated
            attribute_entries = line.strip().split("\t")[-1].split(";")
            protein_id = None
            transcript_id = None
            for entry in attribute_entries:
                tmp_entry = entry.split()
                if len(tmp_entry) != 2:
                    continue
                if tmp_entry[0] == "transcript_id":
                    transcript_id = tmp_entry[1][1:-1]  # remove quotes
                elif tmp_entry[0] == "protein_id":
                    protein_id = tmp_entry[1][1:-1]  # remove quotes
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)
def get_codon_alignment_from_files(self, protein_aln_file, nucleotide_seq_file, codon_alignment_file,
                                   cds2protein_accordance_file=None, alignment_format="fasta",
                                   nucleotide_sequence_format="fasta", cds_index_file=None,
                                   retain_cds_index=False):
    """
    Build a codon alignment file from a protein alignment and its CDS sequences.

    :param protein_aln_file: protein alignment to back-translate
    :param nucleotide_seq_file: file with the CDS (nucleotide) sequences
    :param codon_alignment_file: output path for the codon alignment
    :param cds2protein_accordance_file: optional two-column cds-id/protein-id table
    :param cds_index_file: optional pre-built SeqIO index; a temporary
        "nuc_tmp.idx" is created when omitted
    :param retain_cds_index: keep the temporary index file instead of deleting it
    """
    protein_alignment = AlignIO.read(protein_aln_file, format=alignment_format)
    index_path = cds_index_file if cds_index_file else "nuc_tmp.idx"
    cds_records = SeqIO.index_db(index_path, nucleotide_seq_file,
                                 format=nucleotide_sequence_format)

    accordance = None
    if cds2protein_accordance_file:
        # read the table inverted: protein id (column 1) -> cds id (column 0)
        accordance = SynDict()
        accordance.read(cds2protein_accordance_file, key_index=1, value_index=0)

    self.get_codon_alignment(protein_alignment, cds_records, codon_alignment_file,
                             protein2cds_accordance_dict=accordance)

    # drop the temporary index unless the caller supplied one or asked to keep it
    if (not cds_index_file) and (not retain_cds_index):
        os.remove("nuc_tmp.idx")
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file, output_dir="./",
                                             seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    """
    Write one fasta file per selected cluster with the cluster's member sequences.

    :param clusters_id_file: optional file with cluster ids to extract; when falsy,
        every cluster in cluster_file is processed
    :param cluster_file: cluster table (id -> comma-separated member list)
    :param seq_file: file(s)/dir(s) with the member sequences
    :param output_dir: directory for the per-cluster files
    :param out_prefix: common name for the output files; forces one directory per
        cluster, otherwise files would overwrite each other
    :param create_dir_for_each_cluster: create a subdirectory per cluster
    :param skip_cluster_if_no_sequence_for_element: skip clusters with members
        missing from seq_file instead of failing during extraction
    :return: number of skipped clusters

    Fixes over the previous revision: an id from clusters_id_file that is absent
    from cluster_file no longer raises KeyError inside the skip check, and the
    Python-2-only print statements were replaced with print() calls.
    """
    from Routines import SequenceRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()
    self.safe_mkdir(output_dir)
    out_dir = self.check_path(output_dir)
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", self.make_list_of_path_to_files(seq_file),
                                  format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        # BUGFIX: check membership before indexing cluster_dict[fam_id]
        if fam_id not in cluster_dict:
            print("%s was not found" % fam_id)
            number_of_skipped_clusters += 1
            continue
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id],
                                                                     protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(%s)"
                      % (fam_id, ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue
        if create_directory_for_each_cluster:
            fam_dir = "%s%s/" % (out_dir, fam_id)
            self.safe_mkdir(fam_dir)
            out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
        else:
            out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)
        SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id],
                                                            verbose=True),
                    out_file, format=seq_format)
    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements"
          % (number_of_skipped_clusters, len(cluster_dict)))
    return number_of_skipped_clusters
def add_len_to_simple_output(top_hits_simple, len_file, out_file):
    """
    Annotate a simple top-hits table with query lengths and hit coverage.

    Output columns: query id, query length, column 3 of the input, hit start,
    hit end, and (end - start + 1) / query length.
    """
    len_dict = SynDict()
    len_dict.read(len_file)
    with open(top_hits_simple, "r") as in_fd:
        with open(out_file, "w") as out_fd:
            for line in in_fd:
                fields = line.strip().split("\t")
                query_id = fields[0]
                hit_start = float(fields[1])
                hit_end = float(fields[2])
                # fraction of the query covered by the hit interval (inclusive)
                coverage = (hit_end - hit_start + 1) / float(len_dict[query_id])
                out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n"
                             % (query_id, len_dict[query_id], fields[3],
                                fields[1], fields[2], coverage))
def count_unique_positions_per_sequence_from_file(self, alignment_file, output_prefix, format="fasta",
                                                  gap_symbol="-", return_mode="absolute", verbose=True):
    """
    Count alignment positions unique to each sequence and write the counts.

    Writes "<output_prefix>.absolute_counts" and "<output_prefix>.percent_counts";
    the percent value is relative to the sequence's ungapped length.

    :param return_mode: "absolute" returns raw counts, anything else returns percents
    :return: SynDict of counts chosen by return_mode
    """
    alignment = AlignIO.read(alignment_file, format=format)
    number_of_sequences = len(alignment)
    alignment_length = len(alignment[0])
    presence_matrix = self.get_position_presence_matrix(alignment, gap_symbol=gap_symbol,
                                                        verbose=verbose)
    absolute_counts = SynDict()
    percent_counts = SynDict()
    for row in range(0, number_of_sequences):
        seq_id = alignment[row].id
        # entries flagged 1 or -1 in the presence matrix mark positions
        # unique to this sequence
        unique_positions = 0
        for column in range(0, alignment_length):
            if presence_matrix[row, column] == 1 or presence_matrix[row, column] == -1:
                unique_positions += 1
        ungapped_length = alignment_length - str(alignment[row].seq).count(gap_symbol)
        absolute_counts[seq_id] = unique_positions
        percent_counts[seq_id] = 100 * float(unique_positions) / ungapped_length

    absolute_counts.write("%s.absolute_counts" % output_prefix)
    percent_counts.write("%s.percent_counts" % output_prefix)
    return absolute_counts if return_mode == "absolute" else percent_counts
def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
    """
    Rewrite the region (first) column of a GFF file using a synonyms table.

    Comment lines and lines whose region has no synonym pass through unchanged.
    """
    syn_dict = SynDict()
    syn_dict.read(synonyms_file, comments_prefix="#")
    with open(input_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                fields = line.split("\t")
                if fields[0] not in syn_dict:
                    out_fd.write(line)
                    continue
                fields[0] = syn_dict[fields[0]]
                out_fd.write("\t".join(fields))
def combine_count_files(count_file_list, output_file, sample_name_list=None):
    """
    Merge several per-sample count files into a single two-level table.

    :param count_file_list: list of two-column count files
    :param sample_name_list: optional sample labels, parallel to count_file_list;
        the file names are used when omitted
    :raises ValueError: if the two lists have different lengths
    """
    if sample_name_list is not None and len(count_file_list) != len(sample_name_list):
        raise ValueError("Several files doesn't have corresponding sample name")

    labels = sample_name_list if sample_name_list else count_file_list
    count_table = TwoLvlDict()
    for sample, filename in zip(labels, count_file_list):
        # "__" prefixed rows (e.g. htseq-count summary lines) are treated as comments
        count_table[sample] = SynDict(filename=filename, header=False, separator="\t",
                                      allow_repeats_of_key=False, split_values=False,
                                      values_separator=",", key_index=0, value_index=1,
                                      close_after_if_file_object=False, expression=None,
                                      comments_prefix="__")
    count_table.write(output_file)
def merge_clusters(clusters_dict, label_species=False, separator_for_labeling="_",
                   species_label_first=True):
    """
    Merge per-species cluster dictionaries into a single cluster dictionary.

    :param clusters_dict: mapping species -> {cluster id -> list of sequence names}
    :param label_species: if truthy, attach the species label to every sequence name.
        BUGFIX: the default used to be the *string* "False", which is truthy, so
        calling with defaults silently labeled every sequence; it is now the
        boolean False.
    :param separator_for_labeling: string between label and sequence name
    :param species_label_first: put the label before (True) or after the name
    :return: SynDict cluster id -> merged list of (optionally labeled) names
    """
    if species_label_first:
        label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
    else:
        label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)
    expression = label_sequence if label_species else (lambda label, name: name)

    merged_clusters = SynDict()
    for species in clusters_dict:
        for cluster in clusters_dict[species]:
            if cluster not in merged_clusters:
                merged_clusters[cluster] = []
            for sequence_name in clusters_dict[species][cluster]:
                merged_clusters[cluster].append(expression(species, sequence_name))
    return merged_clusters
def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta",
                               output_format="fasta"):
    """
    Split a mixed protein collection into one file per species.

    Record ids are expected to start with a taxon prefix before the first ".";
    the prefix names the output file ("<taxon>.pep") and is stripped from the
    ids written there.
    """
    sources = [dir_with_proteins] if isinstance(dir_with_proteins, str) else dir_with_proteins
    input_files = self.make_list_of_path_to_files(sources)
    out_dir = self.check_path(output_dir)
    self.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    # group record ids by their taxon prefix
    per_taxon_ids = SynDict()
    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        if taxa_id not in per_taxon_ids:
            per_taxon_ids[taxa_id] = []
        per_taxon_ids[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        # yield copies of the taxon's records with the prefix removed from the id
        for record_id in per_taxon_ids[taxa_id]:
            record = deepcopy(record_dict[record_id])
            record.id = ".".join(record_id.split(".")[1:])
            yield record

    for taxa_id in per_taxon_ids:
        target_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), target_file,
                    format=output_format)
def replace_label(cluster_dict, syn_dict=None, old_separator="@", old_label_position="first",
                  new_separator="@", new_label_position="first"):
    """
    Re-attach (and optionally translate) the label part of every cluster element.

    Each element is split into label and id using old_separator/old_label_position,
    the label is mapped through syn_dict when given, and the pair is rejoined
    using new_separator/new_label_position.

    :return: SynDict with the same cluster ids and relabeled element lists
    """
    relabeled = SynDict()
    for cluster in cluster_dict:
        relabeled[cluster] = []
        for element in cluster_dict[cluster]:
            parts = element.split(old_separator)
            if old_label_position == "first":
                label, element_id = parts[0], old_separator.join(parts[1:])
            else:
                label, element_id = parts[-1], old_separator.join(parts[:-1])
            new_label = syn_dict[label] if syn_dict else label
            if new_label_position == "first":
                relabeled[cluster].append("%s%s%s" % (new_label, new_separator, element_id))
            else:
                relabeled[cluster].append("%s%s%s" % (element_id, new_separator, new_label))
    return relabeled
def extract_single_copy_clusters_from_files(self, list_of_cluster_files, output_file,
                                            label_elements=False, separator="@",
                                            label_position="first",
                                            function_to_convert_filename_to_label=None):
    """
    Read several cluster files and write the clusters that are single-copy in all of them.

    :param list_of_cluster_files: cluster files, one per group/species
    :param output_file: path the single-copy clusters are written to
    :param label_elements/separator/label_position: forwarded to
        extract_single_copy_clusters for element labeling
    :param function_to_convert_filename_to_label: optional callback producing the
        group label from a file name; the basename is used otherwise
    :return: SynDict of single-copy clusters
    """
    dict_of_cluster_dicts = OrderedDict()
    for filename in list_of_cluster_files:
        if function_to_convert_filename_to_label:
            label = function_to_convert_filename_to_label(filename)
        else:
            label = self.split_filename(filename)[1]  # basename as label
        group_clusters = SynDict()
        group_clusters.read(filename, split_values=True, comments_prefix="#")
        dict_of_cluster_dicts[label] = group_clusters

    sc_clusters_dict = self.extract_single_copy_clusters(dict_of_cluster_dicts,
                                                         label_elements=label_elements,
                                                         separator=separator,
                                                         label_position=label_position)
    sc_clusters_dict.write(output_file, splited_values=True)
    return sc_clusters_dict
def label_cluster_elements_from_file(self, input_file, label, output_file, separator="@",
                                     label_position="first"):
    """
    Read clusters from input_file, attach `label` to every element, write and return them.
    """
    clusters = SynDict()
    clusters.read(input_file, split_values=True, comments_prefix="#")
    labeled = self.label_cluster_elements(clusters, label, separator=separator,
                                          label_position=label_position)
    labeled.write(output_file, splited_values=True)
    return labeled
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file,
                                              mode="w"):
    """
    Extract clusters containing ids listed in element_file and write them.

    mode "w": keep only the listed elements of each matching cluster.
    mode "a": keep every element of each matching cluster.
    """
    clusters = SynDict()
    clusters.read(cluster_file, split_values=True, comments_prefix="#")
    wanted_ids = IdList()
    wanted_ids.read(element_file, comments_prefix="#")
    extracted = self.extract_clusters_by_element_ids(clusters, wanted_ids, mode=mode)
    extracted.write(output_file, splited_values=True)
def prepare_annotation_file_from_transcript_and_cds(
        self, transcript_file, cds_file, correspondence_file, output_prefix, format="fasta",
        correspondence_key_column=0, correspondence_value_column=1, verbose=False):
    """
    Locate each CDS inside its transcript and write an annotation table.

    For every transcript with a CDS partner in the correspondence table, finds
    the CDS start within the transcript (case-insensitive exact match) and writes
    "<transcript_id>\t+\t<1-based start>\t<cds length>" to
    "<output_prefix>.annotation". Transcripts without a correspondence entry go to
    "<output_prefix>.no_cds.id"; transcripts whose CDS could not be located go to
    "<output_prefix>.not_found_cds.id".
    """
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)
    # transcript id -> cds id, read from the chosen columns of the table
    correspondence_dict = SynDict(filename=correspondence_file, comments_prefix="#",
                                  key_index=correspondence_key_column,
                                  value_index=correspondence_value_column)
    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()
    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix
    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            # uppercase both sides so the search is case-insensitive
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                # CDS sequence is not a substring of the transcript
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            # start + 1: convert 0-based find() offset to 1-based annotation coordinate
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)
    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)
def get_families_from_top_hits(top_hits_file, fam_file):
    """
    Group query ids by their top-hit target (column 1 -> list of column 0)
    and write the resulting families to fam_file.

    :return: the family SynDict
    """
    families = SynDict()
    families.read(top_hits_file, key_index=1, value_index=0,
                  allow_repeats_of_key=True, comments_prefix="#")
    families.write(fam_file, splited_values=True)
    return families
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    """
    Collect, per key in column 3 of a (whitespace-separated) domtblout file,
    all hit names from column 0. Writes the table when output_file is truthy.

    :return: the hits SynDict
    """
    per_query_hits = SynDict()
    per_query_hits.read(domtblout_file, header=False, separator=None,
                        key_index=3, value_index=0,
                        allow_repeats_of_key=True, comments_prefix="#")
    if output_file:
        per_query_hits.write(output_file, splited_values=True)
    return per_query_hits
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    """
    Write one peptide fasta file per selected family.

    :param families_id_file: optional file with family ids; when falsy, every
        family of fam_file is processed
    :param fam_file: family table (id -> comma-separated member list)
    :param pep_file: peptide sequence file(s)
    :param out_prefix: common file name; forces a directory per family so the
        outputs do not overwrite each other
    """
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    one_dir_per_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id not in fam_dict:
            print("%s was not found" % fam_id)
            continue
        if one_dir_per_family:
            fam_dir = "%s%s/" % (out_dir, fam_id)
            FileRoutines.safe_mkdir(fam_dir)
            out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
        else:
            out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
        SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id],
                                                            verbose=True),
                    out_file, format=pep_format)
    os.remove("tmp.idx")
def replace_label_from_file(self, input_file, output_file, syn_file_or_dict, old_separator="@",
                            old_label_position="first", new_separator="@",
                            new_label_position="first"):
    """
    Relabel cluster elements read from input_file and write the result.

    :param syn_file_or_dict: either a path to a synonyms file or an in-memory
        mapping used to translate labels
    :return: the relabeled SynDict
    """
    if isinstance(syn_file_or_dict, str):
        syn_dict = SynDict(filename=syn_file_or_dict, split_values=False)
    else:
        syn_dict = SynDict(syn_file_or_dict)
    cluster_dict = SynDict(filename=input_file, split_values=True)
    relabeled = self.replace_label(cluster_dict, syn_dict=syn_dict,
                                   old_separator=old_separator,
                                   old_label_position=old_label_position,
                                   new_separator=new_separator,
                                   new_label_position=new_label_position)
    relabeled.write(output_file, splited_values=True)
    return relabeled
def label_cluster_elements(cluster_dict, label, separator="@", label_position="first"):
    """
    Return a copy of cluster_dict with `label` attached to every element.

    :param cluster_dict: mapping cluster id -> list of element names
    :param label: label to attach to each element
    :param separator: string placed between label and element name
    :param label_position: "first" (label<sep>element) or "last" (element<sep>label)
    :raises ValueError: for any other label_position. BUGFIX: previously an invalid
        value surfaced later as a confusing NameError on label_function.
    :return: labeled SynDict
    """
    if label_position == "first":
        label_function = lambda s: "%s%s%s" % (label, separator, s)
    elif label_position == "last":
        label_function = lambda s: "%s%s%s" % (s, separator, label)
    else:
        raise ValueError("Unknown label position: %s. Use 'first' or 'last'" % label_position)

    labeled_cluster_dict = SynDict()
    for cluster in cluster_dict:
        labeled_cluster_dict[cluster] = [label_function(element)
                                         for element in cluster_dict[cluster]]
    return labeled_cluster_dict
def replace_column_value_by_syn(input_file, syn_file, out_file, column=0, comment_prefix=None,
                                separator="\t", syn_header=False, syn_separator="\t",
                                syn_key_index=0, syn_value_index=1, syn_comment_prefix=None):
    """
    Replace the values of one column of a table by their synonyms.

    Comment lines (when comment_prefix is set) pass through unchanged. Counts of
    replaced/not replaced values are reported on stderr.

    BUGFIX: lines too short to contain the requested column previously triggered
    the warning and then crashed with IndexError on line_list[column]; they are
    now written through unchanged after the warning.
    """
    syn_dict = SynDict(filename=syn_file, header=syn_header, separator=syn_separator,
                       key_index=syn_key_index, value_index=syn_value_index,
                       comments_prefix=syn_comment_prefix)
    if comment_prefix:
        comment_prefix_len = len(comment_prefix)
    line_number = 0
    replaced = 0
    not_replaced = 0
    with open(input_file, "r") as in_fd:
        with open(out_file, "w") as out_fd:
            for line in in_fd:
                line_number += 1
                if comment_prefix:
                    if line[0:comment_prefix_len] == comment_prefix:
                        out_fd.write(line)
                        continue
                line_list = line.strip("\n").split(separator)
                if len(line_list) < column + 1:
                    sys.stderr.write("WARNING!!! Line %i doesn't have column %i\n"
                                     % (line_number, column))
                    # pass the short line through untouched instead of crashing
                    out_fd.write(line)
                    continue
                if line_list[column] in syn_dict:
                    replaced += 1
                    line_list[column] = syn_dict[line_list[column]]
                else:
                    not_replaced += 1
                out_fd.write(separator.join(line_list))
                out_fd.write("\n")
    sys.stderr.write("Replaced: %i\nNot replaced: %i\n" % (replaced, not_replaced))
def extract_hits_from_tbl_output(blast_hits, output_file):
    """
    Group tabular blast hits by query id (column 0 -> list of column 1),
    write the grouped table and return it.
    """
    hits = SynDict()
    hits.read(blast_hits, key_index=0, value_index=1, separator="\t",
              allow_repeats_of_key=True)
    hits.write(output_file, separator="\t", splited_values=True, values_separator=",")
    return hits
def read_cluster_files_from_dir(self, dir_with_cluster_files):
    """
    Read every cluster file in a directory.

    :return: OrderedDict keyed by file basename, each value a SynDict of clusters
        (sorted by file name so the order is deterministic)
    """
    clusters_dict = OrderedDict()
    base_path = self.check_path(dir_with_cluster_files)
    for filename in sorted(os.listdir(dir_with_cluster_files)):
        filepath = "%s%s" % (base_path, filename)
        basename = self.split_filename(filepath)[1]
        clusters_dict[basename] = SynDict()
        clusters_dict[basename].read(filepath, header=False, separator="\t",
                                     allow_repeats_of_key=False, split_values=True,
                                     values_separator=",", key_index=0, value_index=1,
                                     comments_prefix="#")
    return clusters_dict
def get_sequence_names(clusters_dict, write_ids=False, out_prefix=None, white_list_ids=None):
    """
    Collect, for each species, the set of sequence names over its clusters.

    :param white_list_ids: optional collection of cluster ids; clusters outside
        it are ignored
    :param write_ids: also write one ".ids" file per species
    :return: SynDict species -> IdSet of sequence names
    """
    sequence_names_dict = SynDict()
    for species in clusters_dict:
        sequence_names_dict[species] = IdSet()
    for species in clusters_dict:
        for cluster_id in clusters_dict[species]:
            if white_list_ids and (cluster_id not in white_list_ids):
                continue
            sequence_names_dict[species] = \
                sequence_names_dict[species] | IdSet(clusters_dict[species][cluster_id])
    if write_ids:
        for species in clusters_dict:
            out_file = "%s_%s.ids" % (out_prefix, species) if out_prefix \
                else "%s.ids" % species
            sequence_names_dict[species].write(out_file)
    return sequence_names_dict
def get_species_from_eggnog_tsv(self, eggnog_tsv, output_prefix, email=None):
    """
    Extract species labels from an eggnog tsv file and write species lists.

    Reads clusters from columns 1 (key) and 5 (members) of the tsv, takes the
    part before the first "." of each member as the species id, and writes
    "<output_prefix>.species" plus a lowercased, space-to-underscore variant
    "<output_prefix>.replaced_spaces.species". With an email, ids are resolved
    to taxonomy names via NCBIRoutines first (also writing
    "<output_prefix>.species.taxonomy").
    """
    cluster_dict = SynDict(filename=eggnog_tsv, key_index=1, value_index=5, split_values=True)
    species_ids = self.extract_labels_from_cluster_elements(cluster_dict, separator=".",
                                                            label_position="first")
    print("Input species ids: %s" % " ".join(species_ids))
    if not email:
        # NOTE(review): species_ids is used directly and must support .write()
        # and per-key list access below - presumably a SynDict/IdSet-like
        # object returned by extract_labels_from_cluster_elements; confirm.
        species = species_ids
    else:
        # resolve numeric taxonomy ids to names through NCBI
        species = NCBIRoutines.get_taxonomy(species_ids, "%s.species.taxonomy" % output_prefix,
                                            email, input_type="id")
    species.write("%s.species" % output_prefix, splited_values=True)
    # normalize names in place: lowercase and replace spaces with underscores
    for species_id in species:
        for i in range(0, len(species[species_id])):
            species[species_id][i] = species[species_id][i].lower().replace(" ", "_")
    species.write("%s.replaced_spaces.species" % output_prefix, splited_values=True)
def add_length_to_fam_file(fam_file, len_file, out_file, close_after_if_file_object=False):
    """
    Append per-member sequence lengths to a family file.

    Writes one line per family: "<family>\t<member1,...>\t<len1,...>". Members
    missing from the length table get the placeholder "None".

    :param out_file: output path or an already-open writable file object
    :param close_after_if_file_object: close out_file at the end even when it
        was passed as a file object

    Fixes over the previous revision:
      * the output path was opened with mode "r", so writing always failed;
      * `isinstance(out_file, file)` relied on the Python-2-only `file` builtin
        (replaced by duck-typing on .write);
      * missing members put None into ",".join(...), raising TypeError;
      * a file opened here is now always closed (no descriptor leak).
    """
    fam_dict = SynDict()
    fam_dict.read(fam_file, split_values=True, comments_prefix="#")
    len_dict = SynDict()
    len_dict.read(len_file, comments_prefix="#")

    opened_here = not hasattr(out_file, "write")
    out_fd = open(out_file, "w") if opened_here else out_file
    try:
        for family in fam_dict:
            len_list = [str(len_dict[member]) if member in len_dict else "None"
                        for member in fam_dict[family]]
            out_fd.write("%s\t%s\t%s\n"
                         % (family, ",".join(fam_dict[family]), ",".join(len_list)))
    finally:
        if opened_here or close_after_if_file_object:
            out_fd.close()
def extract_single_copy_clusters(dict_of_cluster_dicts, label_elements=False, separator="@",
                                 label_position="first"):
    """
    Keep only clusters present in EVERY group with exactly one element.

    :param dict_of_cluster_dicts: mapping group label -> {cluster id -> element list}
    :param label_elements: if True, attach the group label to each extracted element
    :param separator: separator used when labeling
    :param label_position: "first" or "last"
    :raises ValueError: invalid label_position while label_elements is set
    :return: SynDict cluster id -> list with one element per group

    Fixes over the previous revision:
      * a cluster with an EMPTY element list in some group slipped past the
        `len > 1` test and crashed with IndexError on `[0]`; single-copy now
        explicitly requires len == 1;
      * an invalid label_position raises ValueError up front instead of a late
        NameError.
    """
    if label_position == "first":
        label_function = lambda s, label: "%s%s%s" % (label, separator, s)
    elif label_position == "last":
        label_function = lambda s, label: "%s%s%s" % (s, separator, label)
    elif label_elements:
        raise ValueError("Unknown label position: %s. Use 'first' or 'last'" % label_position)

    # union of cluster ids over all groups
    clusters_set = set()
    for group in dict_of_cluster_dicts:
        clusters_set = clusters_set | set(dict_of_cluster_dicts[group].keys())

    sc_clusters_dict = SynDict()
    for cluster in clusters_set:
        if all((cluster in dict_of_cluster_dicts[group])
               and (len(dict_of_cluster_dicts[group][cluster]) == 1)
               for group in dict_of_cluster_dicts):
            sc_clusters_dict[cluster] = []
            for group in dict_of_cluster_dicts:
                element = dict_of_cluster_dicts[group][cluster][0]
                sc_clusters_dict[cluster].append(
                    label_function(element, group) if label_elements else element)
    return sc_clusters_dict
def extract_clusters_by_element_ids(cluster_dict, element_id_list, mode="w"):
    """
    Extract clusters containing elements from element_id_list.

    mode "w": keep only the matching elements of each cluster (clusters with no
    match are dropped).
    mode "a": keep the WHOLE cluster if at least one element matches.

    :return: SynDict of the extracted clusters
    """
    extracted_clusters = SynDict()
    for cluster in cluster_dict:
        if mode == "w":
            kept = [element for element in cluster_dict[cluster]
                    if element in element_id_list]
            if kept:
                extracted_clusters[cluster] = kept
        elif mode == "a":
            if any(element in element_id_list for element in cluster_dict[cluster]):
                extracted_clusters[cluster] = cluster_dict[cluster]
    return extracted_clusters
def replace_augustus_ids_by_syn(augustus_gff, output_gff, genes_syn_file, transcripts_syn_file,
                                cds_syn_file=None):
    """
    Rewrite gene/transcript/CDS ids in an AUGUSTUS gff using synonym tables.

    Parses the file gene block by gene block ("# start gene" ... "# end gene"),
    replaces the ID/Parent attributes of gene, transcript, CDS, start_codon and
    stop_codon features, and validates that each feature's Parent matches the
    enclosing gene/transcript.

    :param cds_syn_file: optional CDS synonym table; when omitted, CDS ids are
        derived as "<transcript synonym>.cds"
    :raises ValueError: on a Parent/enclosing-id mismatch
    NOTE(review): uses the Python-2-only file-iterator method in_fd.next();
    raises StopIteration on a truncated final gene block - confirm intended.
    """
    genes_syn_dict = SynDict()
    genes_syn_dict.read(genes_syn_file, comments_prefix="#")
    transcripts_syn_dict = SynDict()
    transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
    cds_syn_dict = SynDict()
    if cds_syn_file:
        cds_syn_dict.read(cds_syn_file, comments_prefix="#")
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                # anything shorter than "# start gene" or not starting a gene
                # block is copied through unchanged
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                augustus_gene_id = tmp.split(" ")[-1]
                gene_syn_id = genes_syn_dict[augustus_gene_id]
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = in_fd.next().strip()
                while True:
                    # inner loop: feature (non-comment) lines of the gene block
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        # keep all columns except the attribute field, which is rebuilt
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    # without an explicit CDS table, derive the id from the
                                    # transcript synonym (strip the trailing ".cds", 4 chars)
                                    cds_syn_id = cds_syn_dict[augustus_cds_id] if cds_syn_dict \
                                        else "%s.cds" % transcripts_syn_dict[augustus_cds_id[:-4]]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not same!")
                            edited_str += "\tParent=%s\n" % (transcript_syn_id)
                        else:
                            # NOTE(review): unlike replace_augustus_ids, no "\n" is
                            # appended here, so unhandled feature types are written
                            # without a trailing newline - likely a bug, confirm.
                            edited_str = tmp
                        out_fd.write(edited_str)
                        tmp = in_fd.next().strip()
                    # comment lines inside the block are copied until "# end gene"
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = in_fd.next().strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)
def replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None,
                         number_of_digits_in_id=8):
    """
    Renumber gene/transcript/CDS ids in an AUGUSTUS gff with zero-padded,
    species-prefixed ids and record the old->new mappings.

    Writes "<output_prefix>.renamed.gff" plus three synonym tables
    (".gene.syn", ".transcript.syn", ".cds.syn"). New ids follow
    "<species_prefix>{G|T|C}<zero-padded counter>".

    :raises ValueError: when a feature's Parent does not match the enclosing
        gene/transcript id
    NOTE(review): uses the Python-2-only file-iterator method in_fd.next();
    raises StopIteration on a truncated final gene block - confirm intended.
    """
    output_gff = "%s.renamed.gff" % output_prefix
    genes_syn_file = "%s.gene.syn" % output_prefix
    transcripts_syn_file = "%s.transcript.syn" % output_prefix
    cds_syn_file = "%s.cds.syn" % output_prefix
    genes_syn_dict = SynDict()
    transcripts_syn_dict = SynDict()
    cds_syn_dict = SynDict()
    gene_counter = 0
    # e.g. species_prefix "Mmu", 8 digits -> "MmuG%08i"
    gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
    transcripts_counter = 0
    transcript_id_template = "%sT%%0%ii" % (species_prefix, number_of_digits_in_id)
    cds_counter = 0
    cds_id_template = "%sC%%0%ii" % (species_prefix, number_of_digits_in_id)
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                # anything shorter than "# start gene" or not starting a gene
                # block is copied through unchanged
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                augustus_gene_id = tmp.split(" ")[-1]
                gene_counter += 1
                gene_syn_id = gene_id_template % gene_counter
                genes_syn_dict[augustus_gene_id] = gene_syn_id
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = in_fd.next().strip()
                while True:
                    # inner loop: feature (non-comment) lines of the gene block
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        # keep all columns except the attribute field, which is rebuilt
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    # assign a new transcript id on first sight
                                    if augustus_transcript_id not in transcripts_syn_dict:
                                        transcripts_counter += 1
                                        transcripts_syn_dict[augustus_transcript_id] = \
                                            transcript_id_template % transcripts_counter
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    # assign a new CDS id on first sight
                                    if augustus_cds_id not in cds_syn_dict:
                                        cds_counter += 1
                                        cds_syn_dict[augustus_cds_id] = cds_id_template % cds_counter
                                    cds_syn_id = cds_syn_dict[augustus_cds_id]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not same!")
                            edited_str += "\tParent=%s\n" % transcript_syn_id
                        else:
                            # unhandled feature types are copied through unchanged
                            edited_str = tmp + "\n"
                        out_fd.write(edited_str)
                        tmp = in_fd.next().strip()
                    # comment lines inside the block are copied until "# end gene"
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = in_fd.next().strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)
    genes_syn_dict.write(genes_syn_file)
    transcripts_syn_dict.write(transcripts_syn_file)
    cds_syn_dict.write(cds_syn_file)
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """
    Rename cluster elements through a synonyms table and write the result.

    Elements without a synonym keep their original name; their clusters are
    dropped entirely when remove_clusters_with_not_renamed_elements is set.

    :param elements_with_absent_synonyms_file: optional path to dump, per cluster,
        the elements that had no synonym
    :return: SynDict cluster -> elements with absent synonyms
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index,
                  separator=syn_file_column_separator)
    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",",
                       comments_prefix="#")

    output_clusters_dict = SynDict()
    absent_elements_dict = SynDict()
    for cluster in clusters_dict:
        renamed_elements = []
        missing_elements = []
        for element in clusters_dict[cluster]:
            if element in syn_dict:
                renamed_elements.append(syn_dict[element])
            else:
                missing_elements.append(element)
                renamed_elements.append(element)  # keep the original name
        if missing_elements:
            absent_elements_dict[cluster] = missing_elements
        if (not remove_clusters_with_not_renamed_elements) or (not missing_elements):
            output_clusters_dict[cluster] = renamed_elements

    output_clusters_dict.write(output_clusters_file, splited_values=True)
    if elements_with_absent_synonyms_file:
        absent_elements_dict.write(elements_with_absent_synonyms_file, splited_values=True)
    return absent_elements_dict