def count_column_values_from_file(self, input_file, column_number, output_file=None, separator="\t",
                                  comments_prefix="#", verbose=False):
    """
    Count occurrences of each distinct value in one column of a delimited file.

    @param input_file: path of the input table.
    @param column_number: 0-based index of the column to count.
    @param output_file: optional path; if set, the counts are written there.
    @param separator: column separator passed to the line generator.
    @param comments_prefix: lines starting with this prefix are skipped.
    @param verbose: if True, print the number of distinct values found.
    @return: SynDict mapping column value -> occurrence count.
    """
    column_value_dict = SynDict()
    for line_list in self.file_line_as_list_generator(input_file, separator=separator,
                                                      comments_prefix=comments_prefix):
        value = line_list[column_number]
        if value in column_value_dict:
            column_value_dict[value] += 1
        else:
            column_value_dict[value] = 1

    if output_file:
        column_value_dict.write(output_file)
    if verbose:
        # BUGFIX: original referenced the undefined name 'column_value_set' here,
        # raising NameError whenever verbose=True.
        print("#Column %i (0-based) contains %i different values" % (column_number,
                                                                     len(column_value_dict)))
    return column_value_dict
def convert_emapper_annotation_file_to_fam(emapper_annotation_file, output_fam, eggnogdb_prefix=None,
                                           species_name=None, label_separator="."):
    """
    Build a family file (family id -> member gene ids) from an eggNOG-mapper annotation table.

    The family id is taken from column 10 (text before the first '|'), optionally prefixed
    with eggnogdb_prefix; gene ids come from column 0, optionally labeled with species_name.
    """
    fam_dict = SynDict()
    with open(emapper_annotation_file, "r") as annotations_fd:
        for annotation_line in annotations_fd:
            # Header/comment lines start with '#'.
            if annotation_line[0] == "#":
                continue
            fields = annotation_line.split("\t")
            family_id = fields[10].split("|")[0]
            if eggnogdb_prefix is not None:
                family_id = eggnogdb_prefix + family_id
            if species_name:
                gene_id = "%s%s%s" % (species_name, label_separator, fields[0])
            else:
                gene_id = fields[0]
            if family_id in fam_dict:
                fam_dict[family_id].append(gene_id)
            else:
                fam_dict[family_id] = [gene_id]
    fam_dict.write(filename=output_fam, splited_values=True)
def correct_regions_from_gff(self, reference, variants_vcf, gff_file, output_prefix=None,
                             feature_type_list=["CDS"], unification_key="Parent",
                             #raw_seq_per_line=False,
                             vcf_with_masking=None, override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):
    """
    Apply VCF variants to the regions of selected GFF features and write the corrected,
    strand-oriented sequences as FASTA.

    Regions whose corrected fragment length differs from the annotated length by a
    non-multiple of 3 are recorded as putative frameshifts and written to
    "<output_prefix>.frameshifts.region.ids".

    @param reference: reference sequence file to correct.
    @param variants_vcf: VCF with variants to apply.
    @param gff_file: GFF with the features delimiting the regions.
    @param output_prefix: prefix of all output files; also forwarded to get_feature_dict.
    @param feature_type_list: GFF feature types to extract (default ["CDS"]).
    @param unification_key: attribute used to group features (default "Parent").
    @param vcf_with_masking: optional VCF used for masking (passed through).
    @param override_vcf_by_mask: passed through to correct_reference.
    @param use_ambiguous_nuccleotides: passed through to correct_reference.
    """
    # Group features (scaffold, start, end, strand) by unification key; this also
    # writes "<output_prefix>.coordinates_only.list" used as the interval list below.
    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file,
                                                        output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)
    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    # Produce one corrected sequence fragment per interval, one per line, in the
    # same order as the intervals in region_file.
    self.correct_reference(reference, raw_regions, variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        # Reads fragments sequentially; relies on feature_dict iteration order matching
        # the interval order written by get_feature_dict.
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # Length delta not divisible by 3 => putative frameshift in fragment i.
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1) - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                # Orient by the strand of the first fragment (index 3 = strand).
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)
def get_families_from_top_hits(top_hits_file, fam_file):
    """Group gene ids (column 0) by family id (column 1) from a top-hits table and save as a fam file."""
    family_hits = SynDict()
    family_hits.read(top_hits_file, comments_prefix="#", allow_repeats_of_key=True,
                     key_index=1, value_index=0)
    family_hits.write(fam_file, splited_values=True)
    return family_hits
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    """Collect, per domain name (column 3), all hit ids (column 0) from an HMMER domtblout table."""
    domain_hits = SynDict()
    domain_hits.read(domtblout_file, comments_prefix="#", header=False, separator=None,
                     allow_repeats_of_key=True, key_index=3, value_index=0)
    if output_file:
        domain_hits.write(output_file, splited_values=True)
    return domain_hits
def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
    """
    Map transcript ids to their protein ids from the GTF attribute column.

    Tested on gtf files from Ensembl release 70.
    """
    accordance_dict = SynDict()
    with open(gtf_file, "r") as gtf_fd:
        for line in gtf_fd:
            if line[0] == comment_symbol:
                continue
            # Attributes live in the last tab-separated field as ';'-separated pairs.
            attribute_entries = line.strip().split("\t")[-1].split(";")
            protein_id = None
            transcript_id = None
            for attribute in attribute_entries:
                parts = attribute.split()
                if len(parts) != 2:
                    continue
                key, value = parts
                if key == "transcript_id":
                    transcript_id = value[1:-1]  # strip surrounding quotes
                elif key == "protein_id":
                    protein_id = value[1:-1]
            if (transcript_id is not None) and (protein_id is not None):
                if transcript_id in accordance_dict:
                    accordance_dict[transcript_id].add(protein_id)
                else:
                    accordance_dict[transcript_id] = {protein_id}
    accordance_dict.write(output_file, splited_values=True)
def count_unique_positions_per_sequence_from_file(self, alignment_file, output_prefix, format="fasta",
                                                  gap_symbol="-", return_mode="absolute", verbose=True):
    """
    Count, for every sequence of an alignment, the positions unique to it.

    A position is "unique" when the presence matrix holds 1 or -1 for that cell.
    Absolute counts and percentages (relative to the ungapped sequence length) are
    written to "<output_prefix>.absolute_counts" / "<output_prefix>.percent_counts".
    Returns the absolute or percent dictionary depending on return_mode.
    """
    alignment = AlignIO.read(alignment_file, format=format)
    presence_matrix = self.get_position_presence_matrix(alignment, gap_symbol=gap_symbol,
                                                        verbose=verbose)
    sequence_count = len(alignment)
    column_count = len(alignment[0])

    absolute_counts = SynDict()
    percent_counts = SynDict()
    for row in range(0, sequence_count):
        seq_id = alignment[row].id
        unique_positions = 0
        for column in range(0, column_count):
            cell = presence_matrix[row, column]
            if (cell == 1) or (cell == -1):
                unique_positions += 1
        ungapped_length = column_count - str(alignment[row].seq).count(gap_symbol)
        absolute_counts[seq_id] = unique_positions
        percent_counts[seq_id] = 100 * float(unique_positions) / ungapped_length

    absolute_counts.write("%s.absolute_counts" % output_prefix)
    percent_counts.write("%s.percent_counts" % output_prefix)
    return absolute_counts if return_mode == "absolute" else percent_counts
def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=None):
    """
    Count GFF features per scaffold.

    @param gff_file: input GFF path.
    @param out_file: optional path; if set, counts are written there.
    @param feature_type_list: iterable of feature types (GFF column 3) to count;
                              None/empty counts every feature.
    @return: SynDict mapping scaffold id -> feature count.
    """
    # FIX: mutable default argument ([]) replaced with a None sentinel.
    # Behavior is unchanged: both None and [] are falsy, selecting the count-all branch.
    feature_count_dict = SynDict()

    if feature_type_list:
        def check_feature_type(feature_type):
            return feature_type in feature_type_list
    else:
        def check_feature_type(feature_type):
            return True

    with open(gff_file, "r") as gff_fd:
        for line in gff_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            if check_feature_type(line_list[2]):
                if line_list[0] in feature_count_dict:
                    feature_count_dict[line_list[0]] += 1
                else:
                    feature_count_dict[line_list[0]] = 1

    if out_file:
        feature_count_dict.write(out_file)
    return feature_count_dict
def find_leaves_with_positive_selection(self, write=True):
    """Return leaves whose omega ("W") value exceeds 1; optionally write them to a file."""
    omega_per_leaf = self.get_leaf_values(write=False)["W"]
    positively_selected = SynDict()
    for leaf_name in omega_per_leaf:
        omega = omega_per_leaf[leaf_name]
        if omega > 1:
            positively_selected[leaf_name] = omega
    if write:
        positively_selected.write("leaves_with_positive_selection.t")
    return positively_selected
def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    """Write a table of repeat ID -> monomer period ("Period" attribute) from a TRF gff."""
    monomer_len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for gff_line in trf_fd:
            if gff_line[0] == "#":
                continue
            description = AnnotationsRoutines.get_description_dict_from_gff_string(gff_line)
            monomer_len_dict[description["ID"]] = description["Period"]
    monomer_len_dict.write(len_file)
def extract_hits_from_tbl_output(blast_hits, output_file):
    """Group subject ids (column 1) by query id (column 0) from a tabular hits file."""
    hit_dict = SynDict()
    hit_dict.read(blast_hits, separator="\t", allow_repeats_of_key=True,
                  key_index=0, value_index=1)
    hit_dict.write(output_file, separator="\t", values_separator=",", splited_values=True)
    return hit_dict
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """
    Rename cluster members using a synonym table and write the renamed clusters.

    Members missing from the synonym table keep their original name and are collected in
    the returned dictionary (cluster id -> [members]); if
    remove_clusters_with_not_renamed_elements is set, clusters containing such members
    are omitted from the output file.
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index, separator=syn_file_column_separator)

    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",", comments_prefix="#")

    output_clusters_dict = SynDict()
    absent_elements_dict = SynDict()

    for cluster_id in clusters_dict:
        renamed_members = []
        fully_renamed = True
        for member in clusters_dict[cluster_id]:
            if member in syn_dict:
                renamed_members.append(syn_dict[member])
            else:
                # Record the member lacking a synonym, keyed by its cluster.
                if cluster_id in absent_elements_dict:
                    absent_elements_dict[cluster_id].append(member)
                else:
                    absent_elements_dict[cluster_id] = [member]
                fully_renamed = False
                renamed_members.append(member)
        if fully_renamed or not remove_clusters_with_not_renamed_elements:
            output_clusters_dict[cluster_id] = renamed_members

    output_clusters_dict.write(output_clusters_file, splited_values=True)
    if elements_with_absent_synonyms_file:
        absent_elements_dict.write(elements_with_absent_synonyms_file, splited_values=True)
    return absent_elements_dict
def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file, output_file):
    """Extract the GO-term column (index 5) per protein id from an eggNOG-mapper table."""
    go_terms = SynDict(filename=emapper_annotation_file, comments_prefix="#", separator="\t",
                       key_index=0, value_index=5, split_values=True, values_separator=",")
    go_terms.header = "#protein_id\tGO_terms"
    go_terms.write(output_file, header=True, splited_values=True)
    return go_terms
def extract_predicted_gene_names_from_emapper_annotation_file(emapper_annotation_file, output_file):
    """Extract the predicted-gene-name column (index 4) per protein id from an eggNOG-mapper table."""
    predicted_names = SynDict(filename=emapper_annotation_file, comments_prefix="#", separator="\t",
                              key_index=0, value_index=4, split_values=True, values_separator=",")
    predicted_names.header = "#protein_id\tpredicted_gene_name"
    predicted_names.write(output_file, header=True, splited_values=True)
    return predicted_names
def count_miRNA_reads(self, alignment_file, gff_file, output_prefix, annotation_file_type="GTF",
                      min_read_fraction_overlap=1.0, feature_type_to_use=None, attribute_type_to_use=None,
                      sample_name=None, stranded=1):
    """
    Count miRNA reads per feature, adjusting for multimapped reads.

    Runs the counter twice — once excluding and once including multimapped reads — and
    computes an adjusted count per feature:
        adjusted = ceil(unique + (with_multi - unique) / number_of_features_sharing_the_reads)
    i.e. multimapped reads are split evenly among the features they map to.
    Writes three files: "<prefix>.no_multimapped_reads.count",
    "<prefix>.with_multimapped_reads.count" and "<prefix>.all_adjusted_reads.count".

    @param alignment_file: input alignment (BAM/SAM) passed to self.count.
    @param gff_file: annotation file passed to self.count.
    @param output_prefix: prefix of all output count files.
    @param sample_name: sample label for the output header; if None, taken from column 7
                        of the counter output header.
    """
    no_multimapped_read_counts = "%s.no_multimapped_reads.count" % output_prefix
    with_multimapped_read_counts = "%s.with_multimapped_reads.count" % output_prefix
    all_adjusted_read_counts = "%s.all_adjusted_reads.count" % output_prefix

    # First pass: unique (non-multimapped) reads only.
    self.count(alignment_file, gff_file, no_multimapped_read_counts,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)
    # Second pass: multimapped reads included.
    self.count(alignment_file, gff_file, with_multimapped_read_counts,
               count_multimapped_reads=True,
               annotation_file_type=annotation_file_type,
               min_read_fraction_overlap=min_read_fraction_overlap,
               feature_type_to_use=feature_type_to_use,
               attribute_type_to_use=attribute_type_to_use,
               stranded=stranded)

    # Column 6 holds the read count in the counter output (featureCounts-style table).
    no_multimapped_read_count_dict = SynDict(filename=no_multimapped_read_counts, comments_prefix="#",
                                             key_index=0, value_index=6, expression=int, header=True)
    with_multimapped_read_count_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#",
                                               key_index=0, value_index=6, expression=int, header=True)
    # Column 1 lists the ';'-separated features sharing each read set; its length is the
    # divisor used to spread multimapped reads.
    similar_feature_number_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#",
                                          header=True, key_index=0, value_index=1,
                                          expression=lambda s: len(s.split(";")))

    # Fall back to the sample column (index 6) of the counter header if no name was given.
    sample_nameeee = sample_name if sample_name else similar_feature_number_dict.header.split()[6]
    all_adjusted_read_count_dict = SynDict()
    all_adjusted_read_count_dict.header = ".\t%s" % sample_nameeee
    #print no_multimapped_read_count_dict
    #print with_multimapped_read_count_dict
    #print similar_feature_number_dict
    for feature_id in no_multimapped_read_count_dict:
        # Unique reads plus an even share of the extra (multimapped) reads, rounded up.
        all_adjusted_read_count_dict[feature_id] = int(ceil(float(no_multimapped_read_count_dict[feature_id]) + \
                                                            (float(with_multimapped_read_count_dict[feature_id]) - float(no_multimapped_read_count_dict[feature_id])) / float(similar_feature_number_dict[feature_id])))
    all_adjusted_read_count_dict.write(all_adjusted_read_counts, header=True)
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index, id_separator="_",
                                          output_prefix=None):
    """
    Group sequence ids by the fragment found at id_element_index after splitting each id
    on id_separator.

    Ids too short to contain that fragment are collected separately. With output_prefix
    set, clusters go to "<prefix>.seqid.clusters" and skipped ids to
    "<prefix>.seqid.skipped.ids".

    @param seq_id_list: iterable of sequence ids.
    @param id_element_index: 0-based index of the fragment used as the cluster key.
    @param id_separator: separator used to split each id (default "_").
    @param output_prefix: optional prefix for the two output files.
    @return: SynDict mapping fragment -> list of sequence ids.
    """
    cluster_dict = SynDict()
    skipped_id_list = IdList()
    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # BUGFIX: original indexed seq_id_list (the whole input list) instead of
            # seq_id_splited, so every sequence was keyed by an unrelated sequence id.
            key = seq_id_splited[id_element_index]
            if key in cluster_dict:
                cluster_dict[key].append(seq_id)
            else:
                cluster_dict[key] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)
    return cluster_dict
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """
    Rename cluster members via a synonym table and write the result.

    Members without a synonym keep their original name; when
    remove_clusters_with_not_renamed_elements is set, a cluster containing any such
    member is dropped from the output.
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index, separator=syn_file_column_separator)

    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",", comments_prefix="#")

    output_clusters_dict = SynDict()
    for cluster_id in clusters_dict:
        renamed_members = []
        keep_cluster = True
        for member in clusters_dict[cluster_id]:
            if member in syn_dict:
                renamed_members.append(syn_dict[member])
            else:
                renamed_members.append(member)
                if remove_clusters_with_not_renamed_elements:
                    # An unrenamable member disqualifies the whole cluster.
                    keep_cluster = False
                    break
        if keep_cluster:
            output_clusters_dict[cluster_id] = renamed_members

    output_clusters_dict.write(output_clusters_file, splited_values=True)
def get_feature_dict(self, input_gff, output_prefix=None, feature_type_list=["CDS"], unification_key="Parent"):
    """
    Collect [scaffold, start, end, strand] entries of selected GFF features, grouped by
    the value of `unification_key` in the attribute column.

    With output_prefix set, writes four companion files: tab and GATK-interval formats,
    each with and without the grouping key.
    """
    feature_dict = SynDict()
    for fields in self.file_line_as_list_generator(input_gff, comments_prefix="#", separator="\t"):
        attributes = self.parse_gff_annotation_string_to_dict(fields[self.GFF_ATTRIBUTE_COLUMN])
        if fields[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
            continue
        if unification_key not in attributes:
            continue
        group_id = attributes[unification_key][0]
        if group_id not in feature_dict:
            feature_dict[group_id] = []
        feature_dict[group_id].append([fields[self.GFF_SCAFFOLD_COLUMN],
                                       fields[self.GFF_START_COLUMN],
                                       fields[self.GFF_END_COLUMN],
                                       fields[self.GFF_STRAND_COLUMN]])

    if output_prefix:
        feature_dict.write("%s.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.tab" % output_prefix,
                           value_expression=self.feature_list_entry_to_tab_str,
                           line_per_value=True, values_only=True)
        feature_dict.write("%s.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True)
        feature_dict.write("%s.coordinates_only.list" % output_prefix,
                           value_expression=self.feature_list_entry_to_gatk_interval_str,
                           line_per_value=True, values_only=True)
    return feature_dict
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None, species_id=None):
    """
    Extract eggNOG families containing any protein listed in protein_syn_dict.

    @param eggnog_fam_dict: SynDict of family id -> list of member protein ids.
    @param protein_syn_dict: SynDict of common protein name -> list of protein ids.
    @param output_prefix: optional prefix; when set, four files are written (extracted
                          families, two correspondence tables, and not-found names).
    @param species_id: optional species prefix; member ids are then looked up as
                       "<species_id>.<protein_id>" (eggNOG member-id convention).
    @return: tuple of (extracted_families, common_protein_names_to_families_dict,
             common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names).
    """
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    # Invert family -> proteins into protein -> families for direct membership lookup.
    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                # Only the first family of the protein (index 0) is recorded.
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                    extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]
        if not_found:
            # None of the synonyms of this common name matched any eggNOG member.
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
        common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix, splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    #print common_names_to_eggnog_proteins_syn_dict
    #print common_protein_names_to_families_dict
    return extracted_families, common_protein_names_to_families_dict, \
           common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
# Pick one random reference gene per family listed in the input node-family table.
out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam, separator="\t", split_values=True, values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t")

reference_random_genes = SynDict()
for family_id in node_family_ids:
    if family_id in reference_families:
        reference_random_genes[family_id] = choice(reference_families[family_id])
    else:
        # '.' marks families absent from the reference set.
        reference_random_genes[family_id] = "."

reference_random_genes.write("%s_reference_random_genes.t" % args.output_prefix)

for family_id in reference_random_genes:
    gene_id = reference_random_genes[family_id]
    if gene_id != ".":
        out_fd.write("%s\n" % gene_id)
type=int, help="Format of input trees") parser.add_argument("-o", "--output_file", action="store", dest="output_file", default="stdout", help="Output file with leaves of trees. Default: stdout") args = parser.parse_args() out_fd = sys.stdout if args.output_file == "stdout" else open( args.output_file, "w") tree_files_list = os.listdir(args.tree_dir) names_dict = SynDict() for tree_file in tree_files_list: tree_name = split_filename(tree_file)[1] with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd: tree = Tree(tree_fd.readline().strip(), format=args.tree_format) leaves_list = [] for node in tree.traverse(): if node.is_leaf(): leaves_list.append(node.name) names_dict[tree_name] = leaves_list names_dict.write(args.outp_fd, splited_values=True) if args.output_file != "stdout": out_fd.close()
def add_flanks_to_gff_record(self, input_gff, output_prefix, left_flank_len, right_flank_len, fasta_file,
                             coords_description_entry="core_seq_coords", id_description_entry="ID"):
    """
    Extend every GFF record by left/right flanks (swapped for '-' strand records),
    clipping at scaffold boundaries.

    The original coordinates are preserved in the attribute column as
    "<coords_description_entry>=start,end" and the core-sequence position within the
    extended record as "<coords_description_entry>_relative=start,end". Records whose
    flanks had to be clipped are listed in "<output_prefix>.short_flanks.dat" as
    "id -> left_len,right_len".

    @param input_gff: input GFF path.
    @param output_prefix: prefix for "<prefix>.gff" and "<prefix>.short_flanks.dat".
    @param left_flank_len: requested flank length upstream of the feature.
    @param right_flank_len: requested flank length downstream of the feature.
    @param fasta_file: sequence file used to get scaffold lengths for clipping.
    @param coords_description_entry: attribute name used for the coordinate annotations.
    @param id_description_entry: attribute holding the record id (default "ID").
    """
    sequence_length_dict = self.get_lengths_from_seq_file(fasta_file)
    shorter_flanks_dict = SynDict()
    output_gff = "%s.gff" % output_prefix
    short_flanks_file = "%s.short_flanks.dat" % output_prefix

    with open(input_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                line_list = line.strip().split("\t")
                start = int(line_list[3])
                end = int(line_list[4])
                record_id = OrderedDict(map(lambda s: s.split("="), line_list[8].split(";")))[id_description_entry]
                # Keep the original (core) coordinates in the attributes.
                line_list[8] += ";%s=%i,%i" % (coords_description_entry, start, end)
                if line_list[6] == "-":
                    # On the '-' strand the right flank precedes the feature in
                    # scaffold coordinates and the left flank follows it.
                    if start - right_flank_len > 0:
                        line_list[3] = str(start - right_flank_len)
                        right_flank_length = right_flank_len
                    else:
                        right_flank_length = start - 1
                        line_list[3] = "1"
                    if end + left_flank_len <= sequence_length_dict[line_list[0]]:
                        line_list[4] = str(end + left_flank_len)
                        left_flank_length = left_flank_len
                    else:
                        left_flank_length = sequence_length_dict[line_list[0]] - end
                        # BUGFIX: str() was missing here, so "\t".join(line_list) raised
                        # TypeError whenever a '-' strand record was clipped at the
                        # scaffold end (the '+' branch already converted).
                        line_list[4] = str(sequence_length_dict[line_list[0]])
                else:
                    if start - left_flank_len > 0:
                        line_list[3] = str(start - left_flank_len)
                        left_flank_length = left_flank_len
                    else:
                        left_flank_length = start - 1
                        line_list[3] = "1"
                    if end + right_flank_len <= sequence_length_dict[line_list[0]]:
                        line_list[4] = str(end + right_flank_len)
                        right_flank_length = right_flank_len
                    else:
                        right_flank_length = sequence_length_dict[line_list[0]] - end
                        line_list[4] = str(sequence_length_dict[line_list[0]])

                if (left_flank_length < left_flank_len) or (right_flank_length < right_flank_len):
                    print("%s: Short flank" % record_id)
                    shorter_flanks_dict[record_id] = "%i,%i" % (left_flank_length, right_flank_length)

                # Position of the core sequence inside the extended record (1-based).
                line_list[8] += ";%s_relative=%i,%i\n" % (coords_description_entry,
                                                          1 + (right_flank_length if line_list[6] == "-" else left_flank_length),
                                                          end - start + 1 + (right_flank_length if line_list[6] == "-" else left_flank_length))
                out_fd.write("\t".join(line_list))

    shorter_flanks_dict.write(short_flanks_file)
"--input", action="store", dest="input", required=True, type=make_list_of_path_to_files_from_comma_sep_string, help="Comma-separated list of fam files or directories with them") parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file") args = parser.parse_args() out_fd = sys.stdout if args.input == "stdout" else open(args.output, "w") family_dict = SynDict() for filename in args.input: fam_dict = SynDict() fam_dict.read(filename, split_values=True) for family in fam_dict: if family not in family_dict: family_dict[family] = fam_dict[family] else: family_dict[family] += fam_dict[family] family_dict.write(args.output, splited_values=True) if args.output != "stdout": out_fd.close()
def draw_length_histogram(sequence_dict, output_prefix, number_of_bins=None, width_of_bins=None,
                          min_length=1, max_length=None, extensions=("png", "svg"),
                          legend_location='best'):
    """
    Draw a histogram of sequence lengths and save it in the requested formats.

    Also writes the per-record lengths to "<output_prefix>.len". Lengths outside
    [min_length, max_length] are excluded from the plot. Bins are chosen from
    number_of_bins, or built with width_of_bins, or default to 30.

    @param sequence_dict: mapping of record id -> SeqRecord-like object with a .seq.
    @param output_prefix: prefix of the ".len" table and the image files.
    @param number_of_bins: explicit bin count (takes precedence over width_of_bins).
    @param width_of_bins: bin width used to build an explicit bin edge array.
    @param min_length: lower bound of plotted lengths (default 1).
    @param max_length: upper bound of plotted lengths; None uses the observed maximum.
    @param extensions: image formats to save (default png and svg).
    @param legend_location: matplotlib legend location.
    """
    length_dict = SynDict()
    for record in sequence_dict:
        length_dict[record] = len(sequence_dict[record].seq)
    # Persist the raw lengths alongside the plots.
    length_dict.write("%s.len" % output_prefix)

    lengths = length_dict.values()
    max_len = max(lengths)
    min_len = min(lengths)
    median = np.median(lengths)
    mean = np.mean(lengths)

    if max_length is None:
        maximum_length = max_len
    else:
        maximum_length = max_length

    # Keep only lengths inside [min_length, maximum_length]; the branches avoid
    # re-filtering when a bound is not restrictive.
    filtered = []
    if (maximum_length < max_len) and (min_length > 1):
        for entry in lengths:
            if min_length <= entry <= maximum_length:
                filtered.append(entry)
    elif min_length > 1:
        for entry in lengths:
            if min_length <= entry:
                filtered.append(entry)
    elif maximum_length < max_len:
        for entry in lengths:
            if entry <= maximum_length:
                filtered.append(entry)
    else:
        filtered = lengths

    plt.figure(1, figsize=(6, 6))
    plt.subplot(1, 1, 1)

    if number_of_bins:
        bins = number_of_bins
    elif width_of_bins:
        # Explicit edges; the first edge is shifted so min_length itself is included.
        bins = np.arange(min_length - 1, maximum_length, width_of_bins, dtype=np.int32)
        bins[0] += 1
        bins = np.append(bins, [maximum_length])
    else:
        bins = 30

    plt.hist(filtered, bins=bins)
    plt.xlim(xmin=min_length, xmax=maximum_length)
    plt.xlabel("Length")
    plt.ylabel("N")
    plt.title("Distribution of sequence lengths")
    plt.legend(("Min: %i\nMax: %i\nMean: %i\nMedian: %i" % (min_len, max_len, mean, median), ),
               loc=legend_location)
    for ext in extensions:
        plt.savefig("%s.%s" % (output_prefix, ext))
    # NOTE(review): removes "temp.idx" — presumably an index created earlier by the
    # caller (e.g. SeqIO.index_db); confirm this cleanup really belongs here.
    os.remove("temp.idx")
def replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):
    """
    Rewrite an AUGUSTUS GFF, replacing gene/transcript/CDS ids with sequential
    species-prefixed ids ("<prefix>G...", "<prefix>T...", "<prefix>C...").

    Writes the renamed GFF to "<output_prefix>.renamed.gff" and three synonym tables
    (old id -> new id) for genes, transcripts and CDS.

    NOTE(review): uses the Python-2-only file method in_fd.next(); under Python 3 this
    would need next(in_fd).

    @param augustus_gff: AUGUSTUS output GFF path.
    @param output_prefix: prefix of the renamed GFF and the synonym files.
    @param species_prefix: string prepended to every generated id.
    @param number_of_digits_in_id: zero-padded width of the numeric id part.
    @raise ValueError: if a feature's Parent does not match the enclosing gene/transcript.
    """
    output_gff = "%s.renamed.gff" % output_prefix
    genes_syn_file = "%s.gene.syn" % output_prefix
    transcripts_syn_file = "%s.transcript.syn" % output_prefix
    cds_syn_file = "%s.cds.syn" % output_prefix
    genes_syn_dict = SynDict()
    transcripts_syn_dict = SynDict()
    cds_syn_dict = SynDict()
    gene_counter = 0
    gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
    transcripts_counter = 0
    transcript_id_template = "%sT%%0%ii" % (species_prefix, number_of_digits_in_id)
    cds_counter = 0
    cds_id_template = "%sC%%0%ii" % (species_prefix, number_of_digits_in_id)
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                # Pass through anything that cannot be a "# start gene" marker.
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                # Entered a gene block: assign the next sequential gene id.
                augustus_gene_id = tmp.split(" ")[-1]
                gene_counter += 1
                gene_syn_id = gene_id_template % gene_counter
                genes_syn_dict[augustus_gene_id] = gene_syn_id
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = in_fd.next().strip()
                while True:
                    # Inner loop 1: feature lines (non-comment) of the gene block.
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    if augustus_transcript_id not in transcripts_syn_dict:
                                        transcripts_counter += 1
                                        transcripts_syn_dict[augustus_transcript_id] = transcript_id_template % transcripts_counter
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    if augustus_cds_id not in cds_syn_dict:
                                        cds_counter += 1
                                        cds_syn_dict[augustus_cds_id] = cds_id_template % cds_counter
                                    cds_syn_id = cds_syn_dict[augustus_cds_id]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not same!")
                            edited_str += "\tParent=%s\n" % transcript_syn_id
                        else:
                            # Unknown feature type: copy the line unchanged.
                            edited_str = tmp + "\n"
                        out_fd.write(edited_str)
                        tmp = in_fd.next().strip()
                    # Inner loop 2: comment lines until the next feature or end marker.
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = in_fd.next().strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)
    genes_syn_dict.write(genes_syn_file)
    transcripts_syn_dict.write(transcripts_syn_file)
    cds_syn_dict.write(cds_syn_file)
help="Output file with collapsed strings") parser.add_argument("-c", "--column_separator", action="store", dest="column_separator", default="\t", help="Column separator. Default: '\\t'") parser.add_argument("-v", "--value_separator", action="store", dest="value_separator", default=",", help="Value separator. Default: ','") parser.add_argument("-k", "--key_column", action="store", dest="key_column", default=0, type=int, help="Column to be used as key(0-based). Default: 0") parser.add_argument("-a", "--value_column", action="store", dest="value_column", default=1, type=int, help="Column to be used as value(0-based). Default: 1") parser.add_argument("-m", "--comments_prefix", action="store", dest="comments_prefix", default="#", help="Prefix of strings(comments) to be ignored. Default: #") parser.add_argument("-r", "--remove_value_repeats", action="store_true", dest="remove_value_repeats", help="Remove repeats of values") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") syn_dict = SynDict() syn_dict.read(args.input, header=False, separator=args.column_separator, allow_repeats_of_key=True, split_values=True, values_separator=args.value_separator, key_index=args.key_column, value_index=args.value_column, comments_prefix=args.comments_prefix) if args.remove_value_repeats: collapsed_dict = syn_dict.remove_value_repeats() collapsed_dict.write(out_fd, splited_values=True, values_separator=args.value_separator, close_after_if_file_object=True) else: syn_dict.write(out_fd, splited_values=True, values_separator=args.value_separator, close_after_if_file_object=True) #out_fd.close()
action="store", dest="output", default="stdout", help="Output file") parser.add_argument("-k", "--family_column", action="store", dest="fam_col", default=1, type=int, help="Family column position(0-based). Default: 1") parser.add_argument("-a", "--genes_column", action="store", dest="gen_col", default=0, type=int, help="Genes column position(0-based). Default: 0") args = parser.parse_args() hit_dict = SynDict() hit_dict.read(args.input, header=args.header, allow_repeats_of_key=True, key_index=args.fam_col, value_index=args.gen_col) hit_dict.write(args.output, splited_values=True)
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True, help="File with ids of families to extract") parser.add_argument("-o", "--output", action="store", dest="output", default="stdout", help="File to write extracted families. Default - stdout") parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print not found ids. Default - no") args = parser.parse_args() out_file = sys.stdout if args.output == "stdout" else open(args.output, "w") fam_dict = SynDict() fam_dict.read(args.input) id_set = IdSet() id_set.read(args.id_file) extracted_dict = SynDict() for id_entry in id_set: if id_entry in fam_dict: extracted_dict[id_entry] = fam_dict[id_entry] else: if args.verbose: print("%s was not found" % id_entry) extracted_dict.write(out_file, close_after_if_file_object=True)
def get_cds_for_proteins(self, protein_id_list, output_prefix, download_chunk_size=100, temp_dir_prefix="temp"):
    """
    Download GenBank records for a list of protein ids, find the transcripts coding for
    them (via the CDS "coded_by" qualifier), download those transcripts and write the
    extracted CDS sequences to "<output_prefix>.cds" (FASTA).

    Also writes several bookkeeping files: downloaded / not-downloaded protein ids,
    proteins with several CDS features or without a transcript, the
    protein -> transcript accordance table, transcript ids, and a ".stats" summary.

    NOTE(review): Python-2 print statements throughout; the "Downloaded proteins" stat
    line uses number_of_transcripts rather than the downloaded-protein count — looks
    like a copy/paste slip, confirm before relying on the stats file.

    @param protein_id_list: protein accession ids to fetch.
    @param output_prefix: prefix of all output files.
    @param download_chunk_size: ids per efetch request (default 100).
    @param temp_dir_prefix: prefix of the two temporary download directories.
    """
    from Tools.Abstract import Tool

    transcript_temp_dir = "%s_transcripts" % temp_dir_prefix
    protein_temp_dir = "%s_proteins" % temp_dir_prefix
    number_of_ids = len(protein_id_list)
    print "Total %i ids" % number_of_ids
    for directory in transcript_temp_dir, protein_temp_dir:
        self.save_mkdir(directory)

    pep_file = "%s.pep.genbank" % output_prefix
    transcript_file = "%s.trascript.genbank" % output_prefix

    # Chunk boundaries: [0, chunk, 2*chunk, ..., number_of_ids].
    ranges = np.append(np.arange(0, number_of_ids, download_chunk_size), [number_of_ids])
    print "Downloading proteins..."
    for i in range(0, len(ranges)-1):
        print "Downloading chunk %i" % i
        pep_tmp_file = "%s/%s_%i" % (protein_temp_dir, pep_file, i)
        self.efetch("protein", protein_id_list[ranges[i]:ranges[i+1]], pep_tmp_file, rettype="gb", retmode="text")
    # Concatenate all chunks into a single GenBank file.
    os.system("cat %s/* > %s" % (protein_temp_dir, pep_file))

    peptide_dict = SeqIO.index_db("tmp.idx", pep_file, format="genbank")
    downloaded_protein_ids = IdList(peptide_dict.keys())
    print "%i proteins were downloaded" % len(downloaded_protein_ids)
    not_downloaded_proteins_ids = Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="only_a")
    print "%i proteins were not downloaded" % len(not_downloaded_proteins_ids)
    not_downloaded_proteins_ids.write("%s.not_downloaded.ids" % output_prefix)
    downloaded_protein_ids.write("%s.downloaded.ids" % output_prefix)
    print Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="count")

    pep_without_transcripts = IdList()
    pep_with_several_CDS_features = IdList()
    pep_to_transcript_accordance = SynDict()
    transcript_ids = IdList()

    print "Extracting transcript ids corresponding to proteins..."
    for pep_id in peptide_dict:
        for feature in peptide_dict[pep_id].features:
            if feature.type == "CDS":
                try:
                    # "coded_by" looks like "<transcript_accession>:<coords>".
                    transcript_id = feature.qualifiers["coded_by"][0].split(":")[0]
                    if pep_id not in pep_to_transcript_accordance:
                        pep_to_transcript_accordance[pep_id] = [transcript_id]
                    else:
                        # A second CDS feature for the same protein record.
                        pep_to_transcript_accordance[pep_id].append(transcript_id)
                        print("Genbank record for %s contains several CDS features" % pep_id)
                        pep_with_several_CDS_features.append(pep_id)
                    if transcript_id in transcript_ids:
                        print "Repeated transcript id: %s" % transcript_id
                        continue
                    transcript_ids.append(transcript_id)
                except:
                    # No usable "coded_by" qualifier on this CDS.
                    print "Transcript id for %s was not found" % pep_id
                    pep_without_transcripts.append(pep_id)

    pep_with_several_CDS_features.write("%s.pep_with_several_CDS.ids" % output_prefix)
    pep_without_transcripts.write("%s.pep_without_transcripts.ids" % output_prefix)
    transcript_ids.write("%s.transcripts.ids" % output_prefix)
    number_of_transcripts = len(transcript_ids)
    print "%i transcripts were found" % number_of_transcripts
    pep_to_transcript_accordance.write("%s.pep_to_transcript.accordance" % output_prefix, splited_values=True)

    transcript_ranges = np.append(np.arange(0, number_of_transcripts, download_chunk_size), [number_of_transcripts])
    print "Downloading transcripts..."
    for i in range(0, len(transcript_ranges)-1):
        print "Downloading chunk %i" % i
        transcript_tmp_file = "%s/%s_%i" % (transcript_temp_dir, transcript_file, i)
        self.efetch("nuccore", transcript_ids[transcript_ranges[i]:transcript_ranges[i+1]], transcript_tmp_file, rettype="gb", retmode="text")
    os.system("cat %s/* > %s" % (transcript_temp_dir, transcript_file))

    transcript_dict = SeqIO.index_db("tmp_1.idx", transcript_file, format="genbank")
    cds_records_list = []
    for transcript_id in transcript_dict:
        for feature in transcript_dict[transcript_id].features:
            CDS_counter = 1
            if feature.type == "CDS":
                feature_seq = feature.extract(transcript_dict[transcript_id].seq)
                feature_id = transcript_id
                # case with several CDS per transcripts is was not taken into account
                if "protein_id" in feature.qualifiers:
                    description = "protein=%s" % feature.qualifiers["protein_id"][0]
                else:
                    print "Corresponding protein id was not found for %s" % transcript_id
                cds_records_list.append(SeqRecord(seq=feature_seq, id=feature_id, description=description))
    SeqIO.write(cds_records_list, "%s.cds" % output_prefix, format="fasta")

    stat_string = "Input protein ids\t %i\n" % number_of_ids
    stat_string += "Downloaded proteins\t%i\n" % number_of_transcripts
    stat_string += "Downloaded transcripts\t%i\n" % len(transcript_dict)
    print stat_string
    with open("%s.stats" % output_prefix, "w") as stat_fd:
        stat_fd.write(stat_string)
    # Drop the temporary SeqIO index databases.
    for filename in "tmp.idx", "tmp_1.idx":
        os.remove(filename)
# Collect, per sl_key, the synonym ids referenced by every species' entry.
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                # Strip the two-character species prefix before the ids.
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            # BUGFIX: was tmp[i][2] (a single character), unlike the identical loop
            # above which strips the prefix with tmp[i][2:]; the single-char form
            # discards all ids after the prefix.
            tmp[i] = tmp[i][2:]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")
assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly)
species_syn_dict.write("correctly_assembled_families_in_all_species.t", absent_symbol=".")