# Imports used by the functions below. The RouToolPa module paths are assumed
# from that package's layout; adjust them if these snippets live in other modules.
import os
from copy import deepcopy
from collections import OrderedDict

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from RouToolPa.Collections.General import SynDict, IdList, IdSet, TwoLvlDict
from RouToolPa.Routines import AnnotationsRoutines, FileRoutines


def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=[]):
    feature_count_dict = SynDict()

    if feature_type_list:
        def check_feature_type(feature_type):
            return feature_type in feature_type_list
    else:
        def check_feature_type(feature_type):
            return True

    with open(gff_file, "r") as gff_fd:
        for line in gff_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            if check_feature_type(line_list[2]):
                if line_list[0] in feature_count_dict:
                    feature_count_dict[line_list[0]] += 1
                else:
                    feature_count_dict[line_list[0]] = 1

    if out_file:
        feature_count_dict.write(out_file)

    return feature_count_dict

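# Usage sketch for count_per_scaffold_feature_number(); the file name and
# feature types below are illustrative, not taken from the codebase.
per_scaffold_counts = count_per_scaffold_feature_number("annotation.gff",
                                                        out_file="per_scaffold.counts",
                                                        feature_type_list=["gene", "mRNA"])
for scaffold in per_scaffold_counts:
    print("%s\t%i" % (scaffold, per_scaffold_counts[scaffold]))
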
def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd, open(longest_output_file, "w") as longest_out_fd:
        for gene in accordance_dict:
            current_transcript = None
            current_length = 0
            for transcript in accordance_dict[gene]:
                if length_dict[transcript] > current_length:
                    current_transcript = transcript
                    current_length = length_dict[transcript]
                all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))
            longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
            longest_list.append(current_transcript)

    longest_list.write(longest_id_file)

def correct_regions_from_gff(self, reference, variants_vcf, gff_file, output_prefix=None,
                             feature_type_list=["CDS"], unification_key="Parent",
                             vcf_with_masking=None, override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):
    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file,
                                                        output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)
    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    self.correct_reference(reference,
                           raw_regions,
                           variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # A fragment whose length differs from the annotated interval
                    # by a non-multiple of three carries a frameshifting indel.
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1)
                            - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)

def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """
    Tested on gtf files from Ensembl release 70
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            if line[0] == comment_symbol:
                continue
            tmp_list = line.strip().split("\t")
            tmp_list = tmp_list[-1].split(";")
            protein_id = None
            transcript_id = None
            for entry in tmp_list:
                tmp_entry = entry.split()
                if len(tmp_entry) != 2:
                    continue
                if tmp_entry[0] == "transcript_id":
                    transcript_id = tmp_entry[1][1:-1]  # remove quotes
                elif tmp_entry[0] == "protein_id":
                    protein_id = tmp_entry[1][1:-1]
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)

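# Illustration of the attribute-parsing convention used above on a made-up
# Ensembl-style GTF line (identifiers are invented):
example_line = 'chr1\thavana\tCDS\t100\t200\t.\t+\t0\t' \
               'gene_id "ENSG01"; transcript_id "ENST02"; protein_id "ENSP03";'
for entry in example_line.strip().split("\t")[-1].split(";"):
    fields = entry.split()
    if len(fields) == 2 and fields[0] in ("transcript_id", "protein_id"):
        print(fields[0], fields[1][1:-1])  # the slice strips the surrounding quotes
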
def prepare_data_for_target_alignment(self, query_fasta, target_fasta, correspondence_file, out_dir,
                                      correspondence_query_column=0, correspondence_target_column=1):
    query_dict = self.parse_seq_file(query_fasta, "parse")
    target_dict = self.parse_seq_file(target_fasta, "parse")

    self.safe_mkdir(out_dir)

    correspondence_dict = SynDict(filename=correspondence_file,
                                  allow_repeats_of_key=True,
                                  key_index=correspondence_query_column,
                                  value_index=correspondence_target_column)

    for query_id in correspondence_dict:
        query_outfile = "%s/%s.query.fasta" % (out_dir, query_id)
        target_outfile = "%s/%s.target.fasta" % (out_dir, query_id)
        SeqIO.write(self.record_by_id_generator(query_dict, [query_id]),
                    query_outfile, format="fasta")
        SeqIO.write(self.record_by_id_generator(target_dict, correspondence_dict[query_id]),
                    target_outfile, format="fasta")

    queries_with_targets_set = set(correspondence_dict.keys())
    queries_set = set(query_dict.keys())

    return queries_with_targets_set, queries_set - queries_with_targets_set

def count_gaps(self):
    gaps_dict = SynDict()
    seq_length_dict = SynDict()
    for row in range(0, self.number_of_sequences):
        sequence_id = self.alignment[row].id
        gaps_dict[sequence_id] = str(self.alignment[row].seq).count(self.gap_symbol)
        seq_length_dict[sequence_id] = self.length - gaps_dict[sequence_id]

    return gaps_dict, seq_length_dict

def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for line in trf_fd:
            if line[0] == "#":
                continue
            description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
            len_dict[description_dict["ID"]] = description_dict["Period"]
    len_dict.write(len_file)

def syn2fam(syn_file, fam_file, key_column=0, value_column=1, separator="\t"):
    syn_dict = SynDict(filename=syn_file,
                       allow_repeats_of_key=True,
                       key_index=key_column,
                       value_index=value_column,
                       separator=separator,
                       split_values=True)
    syn_dict.write(fam_file, splited_values=True)

def get_feature_dict(self, input_gff, output_prefix=None, feature_type_list=["CDS"], unification_key="Parent"):
    feature_dict = SynDict()
    for line_list in self.file_line_as_list_generator(input_gff, comments_prefix="#", separator="\t"):
        annotation_dict = self.parse_gff_annotation_string_to_dict(line_list[self.GFF_ATTRIBUTE_COLUMN])
        if line_list[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
            continue
        if unification_key not in annotation_dict:
            continue
        if annotation_dict[unification_key][0] not in feature_dict:
            feature_dict[annotation_dict[unification_key][0]] = []
        feature_dict[annotation_dict[unification_key][0]].append([line_list[self.GFF_SCAFFOLD_COLUMN],
                                                                  line_list[self.GFF_START_COLUMN],
                                                                  line_list[self.GFF_END_COLUMN],
                                                                  line_list[self.GFF_STRAND_COLUMN]])
    if output_prefix:
        feature_dict.write("%s.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True,
                           values_only=True)
        feature_dict.write("%s.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True,
                           values_only=True)
    return feature_dict

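# Usage sketch for get_feature_dict(); the path is hypothetical and `annotations`
# stands for an instance of the routines class that owns the method. Collects
# CDS segments grouped by their Parent and writes the four side-car files.
cds_dict = annotations.get_feature_dict("genome.gff", output_prefix="genome.cds")
# each value is a list of [scaffold, start, end, strand] entries, e.g.
# cds_dict["mRNA1"] -> [["scaf1", "100", "200", "+"], ["scaf1", "300", "450", "+"]]
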
def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file, output_file):
    GO_terms_dict = SynDict(filename=emapper_annotation_file,
                            key_index=0,
                            value_index=5,
                            split_values=True,
                            values_separator=",",
                            comments_prefix="#",
                            separator="\t")
    GO_terms_dict.header = "#protein_id\tGO_terms"
    GO_terms_dict.write(output_file, header=True, splited_values=True)
    return GO_terms_dict

def add_len_to_simple_output(top_hits_simple, len_file, out_file):
    len_dict = SynDict()
    len_dict.read(len_file)
    with open(top_hits_simple, "r") as in_fd, open(out_file, "w") as out_fd:
        for line in in_fd:
            tmp_list = line.strip().split("\t")
            out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n" % (tmp_list[0],
                                                       len_dict[tmp_list[0]],
                                                       tmp_list[3],
                                                       tmp_list[1],
                                                       tmp_list[2],
                                                       (float(tmp_list[2]) - float(tmp_list[1]) + 1) / float(len_dict[tmp_list[0]])))

def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
    syn_dict = SynDict()
    syn_dict.read(synonyms_file, comments_prefix="#")
    with open(input_gff, "r") as in_fd, open(output_gff, "w") as out_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
            else:
                line_list = line.split("\t")
                if line_list[0] in syn_dict:
                    line_list[0] = syn_dict[line_list[0]]
                    out_fd.write("\t".join(line_list))
                else:
                    out_fd.write(line)

def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w",
                                              cluster_column=0, element_column=1, column_separator="\t",
                                              element_separator=",", id_column=None):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all of its elements
    """
    cluster_dict = SynDict(filename=cluster_file,
                           split_values=True,
                           comments_prefix="#",
                           key_index=cluster_column,
                           value_index=element_column,
                           separator=column_separator,
                           values_separator=element_separator)
    element_id_list = IdList(filename=element_file, comments_prefix="#", column_number=id_column)

    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)

def extract_predicted_gene_names_from_emapper_annotation_file(emapper_annotation_file, output_file):
    extract_predicted_gene_names_dict = SynDict(filename=emapper_annotation_file,
                                                key_index=0,
                                                value_index=4,
                                                split_values=True,
                                                values_separator=",",
                                                comments_prefix="#",
                                                separator="\t")
    extract_predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
    extract_predicted_gene_names_dict.write(output_file, header=True, splited_values=True)
    return extract_predicted_gene_names_dict

def merge_clusters(clusters_dict, label_species=False, separator_for_labeling="_", species_label_first=True):
    # Note: the default must be the boolean False; the original string "False" is truthy
    # and would have turned labeling on by default.
    if species_label_first:
        label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
    else:
        label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

    if label_species:
        expression = label_sequence
    else:
        expression = lambda label, name: name

    merged_clusters = SynDict()
    for species in clusters_dict:
        for cluster in clusters_dict[species]:
            if cluster not in merged_clusters:
                merged_clusters[cluster] = []
            for sequence_name in clusters_dict[species][cluster]:
                merged_clusters[cluster].append(expression(species, sequence_name))

    return merged_clusters

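# Toy input for merge_clusters() (species and gene names invented) showing the
# effect of the labeling switch:
clusters_by_species = {"cat": {"FAM0001": ["g1", "g2"]},
                       "dog": {"FAM0001": ["g7"]}}
merged = merge_clusters(clusters_by_species, label_species=True)
# merged["FAM0001"] -> ["cat_g1", "cat_g2", "dog_g7"]
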
def extract_single_copy_clusters_from_files(self, list_of_cluster_files, output_file, label_elements=False,
                                            separator="@", label_position="first",
                                            function_to_convert_filename_to_label=None):
    dict_of_cluster_dicts = OrderedDict()
    for filename in list_of_cluster_files:
        if function_to_convert_filename_to_label:
            label = function_to_convert_filename_to_label(filename)
        else:
            label = self.split_filename(filename)[1]  # use basename as label

        dict_of_cluster_dicts[label] = SynDict()
        dict_of_cluster_dicts[label].read(filename, split_values=True, comments_prefix="#")

    sc_clusters_dict = self.extract_single_copy_clusters(dict_of_cluster_dicts,
                                                         label_elements=label_elements,
                                                         separator=separator,
                                                         label_position=label_position)
    sc_clusters_dict.write(output_file, splited_values=True)

    return sc_clusters_dict

def replace_label(cluster_dict, syn_dict=None, old_separator="@", old_label_position="first",
                  new_separator="@", new_label_position="first"):
    new_cluster_dict = SynDict()
    for cluster in cluster_dict:
        new_cluster_dict[cluster] = []
        for element in cluster_dict[cluster]:
            tmp = element.split(old_separator)
            if old_label_position == "first":
                label = tmp[0]
                element_id = old_separator.join(tmp[1:])
            else:
                label = tmp[-1]
                element_id = old_separator.join(tmp[:-1])

            if new_label_position == "first":
                new_cluster_dict[cluster].append("%s%s%s" % (syn_dict[label] if syn_dict else label,
                                                             new_separator, element_id))
            else:
                new_cluster_dict[cluster].append("%s%s%s" % (element_id, new_separator,
                                                             syn_dict[label] if syn_dict else label))

    return new_cluster_dict

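# Toy relabeling example (names invented): move the label from the front of
# each element to the back while translating it through a synonym map.
relabeled = replace_label({"fam1": ["cat@geneA", "cat@geneB"]},
                          syn_dict={"cat": "felis_catus"},
                          old_label_position="first",
                          new_label_position="last")
# relabeled["fam1"] -> ["geneA@felis_catus", "geneB@felis_catus"]
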
def add_add_new_column_by_key_column(self, table_file, syn_dict_file, key_column, output_file,
                                     new_column_name=None, separator="\t", absent_value="."):
    column_syn_dict = SynDict(filename=syn_dict_file, allow_repeats_of_key=True, values_separator="@")

    with open(table_file, "r") as in_fd, open(output_file, "w") as out_fd:
        if new_column_name:
            header_line = in_fd.readline().strip() + "\t%s\n" % new_column_name
            out_fd.write(header_line)
        for line in in_fd:
            line_list = line.strip().split(separator)
            line_list.append(absent_value if line_list[key_column] not in column_syn_dict
                             else "|".join(column_syn_dict[line_list[key_column]]))
            out_fd.write(separator.join(line_list) + "\n")

def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    syn_dict = SynDict(filename=syn_file)

    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
            self.metaopen(output_gff, "w") as out_fd, \
            self.metaopen(skipped_gff, "w") as skipped_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
                continue  # comment lines must not fall through to the renaming logic
            gff_list = line.split("\t")
            if gff_list[0] in syn_dict:
                gff_list[0] = syn_dict[gff_list[0]]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(gff_list[0])

    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))

    skipped_id_list.write(skipped_id_file)

def combine_count_files(count_file_list, output_file, sample_name_list=None):
    if sample_name_list is not None:
        if len(count_file_list) != len(sample_name_list):
            raise ValueError("The numbers of count files and sample names differ")

    samples = zip(sample_name_list if sample_name_list else count_file_list,
                  count_file_list)

    count_table = TwoLvlDict()
    for sample, filename in samples:
        count_table[sample] = SynDict(filename=filename,
                                      header=False,
                                      separator="\t",
                                      allow_repeats_of_key=False,
                                      split_values=False,
                                      values_separator=",",
                                      key_index=0,
                                      value_index=1,
                                      close_after_if_file_object=False,
                                      expression=None,
                                      comments_prefix="__")
    count_table.write(output_file)

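# Usage sketch for combine_count_files() (file and sample names invented):
# merges two-column count files into a single table keyed by sample.
combine_count_files(["s1.counts", "s2.counts"],
                    "all_samples.counts",
                    sample_name_list=["sample1", "sample2"])
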
def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
    input_files = self.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str)
                                                  else dir_with_proteins)

    out_dir = self.check_path(output_dir)
    self.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    syn_dict = SynDict()
    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        if taxa_id not in syn_dict:
            syn_dict[taxa_id] = []
        syn_dict[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        for record_id in syn_dict[taxa_id]:
            record = deepcopy(record_dict[record_id])
            record.id = ".".join(record_id.split(".")[1:])  # strip the taxon prefix
            yield record

    for taxa_id in syn_dict:
        out_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)

    os.remove("temp.idx")  # drop the temporary SeqIO index, mirroring the cleanup done elsewhere in this module

def convert_emapper_annotation_file_to_fam(emapper_annotation_file, output_fam, eggnogdb_prefix=None,
                                           species_name=None, label_separator="@",
                                           diamond_mode=False, database=None):
    fam_dict = SynDict()

    if diamond_mode and (database is not None):
        def extract_fam_from_line(line_list):
            # Column 10 holds comma-separated "OG@database" pairs; reversing each
            # pair maps the database name to its OG id.
            db_dict = dict(map(lambda s: s.split("@")[::-1], line_list[9].split(",")))
            return db_dict[database] if database in db_dict else "unknown"
    elif diamond_mode:
        raise ValueError("ERROR!!! A database name (veNOG or other) is required in diamond mode!")
    else:
        def extract_fam_from_line(line_list):
            return line_list[10].split("|")[0]

    with open(emapper_annotation_file, "r") as annotations_fd:
        for line in annotations_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            fam_id = extract_fam_from_line(line_list)
            if eggnogdb_prefix is not None:
                fam_id = eggnogdb_prefix + fam_id
            gene_id = "%s%s%s" % (species_name, label_separator, line_list[0]) if species_name else line_list[0]
            if fam_id in fam_dict:
                fam_dict[fam_id].append(gene_id)
            else:
                fam_dict[fam_id] = [gene_id]

    fam_dict.write(filename=output_fam, splited_values=True)

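# Usage sketch (file names, prefix and species label are illustrative): build
# a family file from a classic (non-diamond) emapper annotation table.
convert_emapper_annotation_file_to_fam("proteins.emapper.annotations",
                                       "proteins.fam",
                                       eggnogdb_prefix="ENOG41",
                                       species_name="cat",
                                       diamond_mode=False)
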
def create_gvf_files_from_species_gene_fam_and_gene_GO_fam(self, species_gene_fam_file, gene_GO_fam_file,
                                                           output_directory):
    species_gene_dict = SynDict(filename=species_gene_fam_file, split_values=True)
    gene_GO_dict = SynDict(filename=gene_GO_fam_file, split_values=True)

    self.safe_mkdir(output_directory)

    for species in species_gene_dict:
        with open("%s/%s.gvf" % (output_directory, species), "w") as out_fd:
            for gene in species_gene_dict[species]:
                if gene not in gene_GO_dict:
                    print("WARNING: gene %s of species %s is absent in the gene_GO file" % (gene, species))
                    continue
                out_fd.write("%s\t%s\n" % (gene, "\t".join(gene_GO_dict[gene])))

def combine_syn_dicts(list_of_syn_dict):
    combined_dict = SynDict()
    for syn_dict in list_of_syn_dict:
        for key in syn_dict:
            if key in combined_dict:
                combined_dict[key] += syn_dict[key]
            else:
                combined_dict[key] = syn_dict[key]
    return combined_dict

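# Toy example for combine_syn_dicts(): values of shared keys are concatenated
# because the values here are lists (keys and ids invented).
d1, d2 = SynDict(), SynDict()
d1["geneA"] = ["tr1"]
d2["geneA"] = ["tr2"]
d2["geneB"] = ["tr3"]
combined = combine_syn_dicts([d1, d2])
# combined["geneA"] -> ["tr1", "tr2"]; combined["geneB"] -> ["tr3"]
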
def prepare_annotation_file_from_transcript_and_cds(self, transcript_file, cds_file, correspondence_file,
                                                    output_prefix, format="fasta",
                                                    correspondence_key_column=0, correspondence_value_column=1,
                                                    verbose=False):
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

    correspondence_dict = SynDict(filename=correspondence_file,
                                  comments_prefix="#",
                                  key_index=correspondence_key_column,
                                  value_index=correspondence_value_column)

    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()

    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)

    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)

def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    from RouToolPa.Routines import SequenceRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")

def extract_clusters_by_size_from_file(self, cluster_file, min_cluster_size=None, max_cluster_size=None,
                                       white_list_ids=None, out_file=None):
    cluster_dict = SynDict(filename=cluster_file, split_values=True)
    return self.extract_clusters_by_size(cluster_dict,
                                         min_cluster_size=min_cluster_size,
                                         max_cluster_size=max_cluster_size,
                                         white_list_ids=white_list_ids,
                                         out_file=out_file)

def remove_elements_by_ids_from_files(self, input_file, output_file, black_list_file, mode="full"):
    cluster_dict = SynDict(filename=input_file, split_values=True)
    black_list = IdList(filename=black_list_file)
    filtered_dict = self.remove_elements_by_ids(cluster_dict, black_list, mode=mode)
    filtered_dict.write(output_file, splited_values=True)

def count_column_values_from_file(self, input_file, column_number, output_file=None,
                                  separator="\t", comments_prefix="#", verbose=False):
    column_value_dict = SynDict()

    for line_list in self.file_line_as_list_generator(input_file,
                                                      separator=separator,
                                                      comments_prefix=comments_prefix):
        if line_list[column_number] in column_value_dict:
            column_value_dict[line_list[column_number]] += 1
        else:
            column_value_dict[line_list[column_number]] = 1

    if output_file:
        column_value_dict.write(output_file)

    return column_value_dict

def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index, id_separator="_",
                                          output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()

    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # key on the chosen fragment of the split id (the original indexed
            # seq_id_list here by mistake, not the split id)
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict

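# Usage sketch (ids invented; `routines` stands for an instance of the owning
# class): with id_separator="_" and id_element_index=1, "sp_gene1_t1" and
# "sp_gene1_t2" group under "gene1", while ids too short to split are skipped.
clusters = routines.cluster_sequence_names_by_id_fragment(["sp_gene1_t1", "sp_gene1_t2",
                                                           "sp_gene2_t1", "malformed"],
                                                          1, output_prefix="toy")
# clusters -> {"gene1": ["sp_gene1_t1", "sp_gene1_t2"], "gene2": ["sp_gene2_t1"]}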