def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()
    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query,
                                                   hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    if args.output != "stdout":
        out_fd.close()
    os.remove(index_file)
    return not_significant_ids, not_found_ids
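# Usage sketch (illustrative, not part of the original script; assumes the
# enclosing script defined `args` via argparse with `format`, `output` and
# `top_hits_dir` attributes, and that the input file name is hypothetical):
#
#   args.format = "hmmer3-text"
#   args.output = "stdout"
#   not_significant, not_found = handle_input("query_vs_pfam.hmmer3.txt")
#   not_significant.write("not_significant.ids")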
def seq_ids(self):
    id_list = IdList()
    for record in self.records:
        id_list.append(record.id)
    return id_list
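# Usage sketch (illustrative; `collection` stands for an instance of the
# class owning this method, whose `.records` holds SeqRecord-like entries):
#
#   ids = collection.seq_ids()
#   ids.write("sequence.ids")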
def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    print(length_dict)
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd:
        with open(longest_output_file, "w") as longest_out_fd:
            for gene in accordance_dict:
                current_transcript = None
                current_length = 0
                for transcript in accordance_dict[gene]:
                    if length_dict[transcript] > current_length:
                        current_transcript = transcript
                        current_length = length_dict[transcript]
                    all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript,
                                                       length_dict[transcript]))
                longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript,
                                                       current_length))
                longest_list.append(current_transcript)
    longest_list.write(longest_id_file)
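# Usage sketch (illustrative; file names are hypothetical). The accordance
# file is expected as tab-separated "gene<TAB>transcript" pairs and the
# length file as "transcript<TAB>length":
#
#   add_length_to_accordance_file("gene_to_transcript.tab",
#                                 "transcript_lengths.tab",
#                                 "my_species")
#   # -> my_species.all.correspondence, my_species.longest.correspondence,
#   #    my_species.longest.ids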
def prepare_annotation_file_from_transcript_and_cds(self, transcript_file, cds_file,
                                                    correspondence_file, output_prefix,
                                                    format="fasta",
                                                    correspondence_key_column=0,
                                                    correspondence_value_column=1,
                                                    verbose=False):
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)
    correspondence_dict = SynDict(filename=correspondence_file, comments_prefix="#",
                                  key_index=correspondence_key_column,
                                  value_index=correspondence_value_column)

    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()

    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)

    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)
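# Usage sketch (illustrative; `routines` stands for an instance of the class
# providing this method, and the file names are hypothetical):
#
#   routines.prepare_annotation_file_from_transcript_and_cds(
#       "transcripts.fasta", "cds.fasta",
#       "transcript_to_cds.tab", "my_annotation",
#       verbose=True)
#   # -> my_annotation.annotation with lines of the form
#   #    "<transcript_id>\t+\t<CDS start, 1-based>\t<CDS length>"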
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict,
                                           output_prefix=None, species_id=None):
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                family_id = transposed_eggnog_fam_dict[extended_protein_id][0]
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = [family_id]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(family_id)
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if family_id not in extracted_families:
                    extracted_families[family_id] = eggnog_fam_dict[family_id]
        if not_found:
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix,
                                 splited_values=True)
        common_protein_names_to_families_dict.write(
            filename="%s.common_protein_names_to_families.correspondence" % output_prefix,
            splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(
            filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
            splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    #print common_names_to_eggnog_proteins_syn_dict
    #print common_protein_names_to_families_dict

    return extracted_families, common_protein_names_to_families_dict, \
           common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
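# Usage sketch (illustrative; the dictionary contents are hypothetical).
# `eggnog_fam_dict` maps family id -> member protein ids; `protein_syn_dict`
# maps a common protein name -> protein ids, which are prefixed with
# `species_id` ("<taxid>.<protein>", as in eggNOG member tables) when given;
# `routines` stands for an instance of the class providing this method:
#
#   fams = SynDict({"ENOG410XXXX": ["9606.ENSP0000001", "10090.ENSMUSP0000002"]})
#   names = SynDict({"MYC": ["ENSP0000001"]})
#   extracted, name2fam, name2prot, missing = \
#       routines.extract_eggnog_fam_by_protein_syn_dict(fams, names,
#                                                       output_prefix="test",
#                                                       species_id="9606")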
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index,
                                          id_separator="_", output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()
    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # Cluster by the selected fragment of the split id
            # (not by an element of the input list, as the original code did)
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict
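# Usage sketch (illustrative; `routines` stands for an instance of the class
# providing this method). Clusters ids by the fragment at `id_element_index`
# after splitting on `id_separator`; ids with too few fragments are skipped:
#
#   ids = ["mRNA_123_1", "mRNA_123_2", "mRNA_7_1", "badid"]
#   clusters = routines.cluster_sequence_names_by_id_fragment(ids, 1)
#   # -> {"123": ["mRNA_123_1", "mRNA_123_2"], "7": ["mRNA_7_1"]};
#   #    "badid" goes to the skipped id list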
def extract_top_hits(self, hmmer_hits, output_prefix, parsing_mode="index_db"):
    #top_hits_file, top_hits_ids_file=None, not_significant_ids_file=None, not_found_ids_file=None):
    print(hmmer_hits)
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()

    top_hits_file = "%s.top_hits" % output_prefix
    top_hits_ids_file = "%s.top_hits.ids" % output_prefix
    not_significant_ids_file = "%s.not_significant.ids" % output_prefix
    not_found_ids_file = "%s.not_found.ids" % output_prefix
    index_file = "%s.hmmer_hits.tmp.idx" % output_prefix

    #hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")
    # Pass the index file through so that the cleanup below removes a file
    # that was actually created (the original passed index_file=None)
    hmm_dict = self.parse_search_file(hmmer_hits, parsing_mode, format="hmmer3-text",
                                      index_file=index_file)
    print(len(hmm_dict))

    with open(top_hits_file, "w") as out_fd:
        out_fd.write("#query\thit\tevalue\tbitscore\n")
        for query in hmm_dict:
            if hmm_dict[query].hits:
                if hmm_dict[query][0].is_included:
                    out_fd.write("%s\t%s\t%s\t%s\n" % (query,
                                                       hmm_dict[query][0].id,
                                                       hmm_dict[query][0].evalue,
                                                       hmm_dict[query][0].bitscore))
                    top_hits_ids.append(query)
                else:
                    not_significant_ids.append(query)
            else:
                not_found_ids.append(query)

    if parsing_mode == "index_db":
        os.remove(index_file)

    for id_list, id_file in zip([not_significant_ids, not_found_ids, top_hits_ids],
                                [not_significant_ids_file, not_found_ids_file, top_hits_ids_file]):
        id_list.write(id_file)
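# Usage sketch (illustrative; `routines` stands for an instance providing
# parse_search_file(), the input is plain-text hmmscan/hmmsearch output and
# the file name is hypothetical):
#
#   routines.extract_top_hits("query_vs_pfam.hmmer3.txt", "query_vs_pfam",
#                             parsing_mode="index_db")
#   # -> query_vs_pfam.top_hits, *.top_hits.ids, *.not_significant.ids,
#   #    *.not_found.ids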
def prepare_template_for_popart(alignment_file, output_file, haplotype_fam_file=None,
                                traits_file=None, whitelist_file=None):
    from RouToolPa.Parsers.Sequence import CollectionSequence

    sequence_collection = CollectionSequence(in_file=alignment_file, parsing_mode="parse")
    sequence_collection.get_stats_and_features(count_gaps=False, sort=False)

    whitelist = IdSet(filename=whitelist_file)

    alignment_len = sequence_collection.seq_lengths["length"].unique()
    if len(alignment_len) > 1:
        raise ValueError("ERROR!!! Sequences in alignment have different lengths!")
    alignment_len = alignment_len[0]

    haplotype_selected_sequence_dict = SynDict()
    haplotypes_without_sequences_ids = IdList()

    traits_df = pd.read_csv(traits_file, sep="\t", index_col=0) if traits_file else pd.DataFrame()

    if haplotype_fam_file:
        haplotype_dict = SynDict(filename=haplotype_fam_file, split_values=True)
        for haplotype_id in haplotype_dict:
            for sequence_id in haplotype_dict[haplotype_id]:
                if sequence_id in sequence_collection.records:
                    haplotype_selected_sequence_dict[haplotype_id] = sequence_id
                    break
            else:
                haplotypes_without_sequences_ids.append(haplotype_id)
    else:
        haplotype_dict = dict([(entry, [entry]) for entry in sequence_collection.scaffolds])
        haplotype_selected_sequence_dict = dict([(entry, entry) for entry in sequence_collection.scaffolds])

    final_haplotype_set = (set(haplotype_selected_sequence_dict.keys()) & whitelist) if whitelist \
        else set(haplotype_selected_sequence_dict.keys())

    with open(output_file, "w") as out_fd:
        #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
        #                                                                                          "\n".join(haplotype_selected_sequence_dict.keys())))
        out_fd.write("#NEXUS\n\n")
        out_fd.write("BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n" % (len(final_haplotype_set),
                                                                                                                             alignment_len))
        out_fd.write("\tMATRIX\n")
        for haplotype_id in final_haplotype_set:
            out_fd.write("\t\t%s %s\n" % (haplotype_id,
                                          sequence_collection.records[haplotype_selected_sequence_dict[haplotype_id]]))
        out_fd.write("\t;\nEND;\n\n")

        if not traits_df.empty:
            traits_number = len(traits_df.columns)
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n".format(traits_number))
            out_fd.write("\tTraitLabels {0};\n".format(" ".join(traits_df.columns)))
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %s\n" % (haplotype_id,
                                              ",".join(map(str, traits_df.loc[haplotype_id])) if haplotype_id in traits_df.index
                                              else ("0," * traits_number)[:-1]))
        else:
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n")
            out_fd.write("\tTraitLabels Area;\n")
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %i\n" % (haplotype_id, len(haplotype_dict[haplotype_id])))
        out_fd.write("\t;\nEND;\n\n")
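# Usage sketch (illustrative; file names are hypothetical). Produces a NEXUS
# file with DATA and TRAITS blocks that PopART can load directly:
#
#   prepare_template_for_popart("haplotypes.aligned.fasta", "popart_input.nex",
#                               haplotype_fam_file="haplotypes.fam",
#                               traits_file="localities.tab")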
def handle_sanger_data(self, input_dir, output_prefix, outdir=None, read_subfolders=False,
                       min_mean_qual=0, min_median_qual=0, min_len=50):
    if outdir:
        self.workdir = outdir
    self.init_dirs()

    sanger_filelist = self.make_list_of_path_to_files(input_dir,
                                                      expression=self.is_sanger_file,
                                                      recursive=read_subfolders,
                                                      return_absolute_paths=True)
    stat_dict = TwoLvlDict()
    record_dict = OrderedDict()
    trimmed_record_dict = OrderedDict()
    excluded_list = IdList()
    excluded_counter = 0
    low_quality_counter = 0
    too_short_counter = 0

    merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
    merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
    merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir, output_prefix)
    merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir, output_prefix)

    for filename in sanger_filelist:
        filename_list = self.split_filename(filename)

        record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir, filename_list[1])
        record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir, filename_list[1])
        record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (self.workdir,
                                                                        filename_list[1])
        record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (self.workdir,
                                                                      filename_list[1])
        record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (self.workdir,
                                                                      filename_list[1])
        record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (self.workdir,
                                                                                    filename_list[1])

        record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
        record_dict[record.id] = record
        SeqIO.write(record, record_raw_fastq, format="fastq")
        SeqIO.write(record, record_raw_fasta, format="fasta")

        trimmed_record = SeqIO.AbiIO._abi_trim(record)

        stat_dict[record.id] = OrderedDict({
            "raw_len": len(record),
            "raw_mean_qual": np.mean(record.letter_annotations["phred_quality"]),
            "raw_median_qual": np.median(record.letter_annotations["phred_quality"]),
            "trimmed_len": len(trimmed_record),
            "trimmed_mean_qual": np.mean(trimmed_record.letter_annotations["phred_quality"]),
            "trimmed_median_qual": np.median(trimmed_record.letter_annotations["phred_quality"]),
            "retained": "-",
        })

        MatplotlibRoutines.draw_bar_plot(record.letter_annotations["phred_quality"],
                                         record_raw_qual_plot_prefix,
                                         extentions=["png"],
                                         xlabel="Position",
                                         ylabel="Phred quality",
                                         title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        if stat_dict[record.id]["trimmed_len"] >= min_len:
            if min_median_qual:
                if (stat_dict[record.id]["trimmed_median_qual"] >= min_median_qual) and \
                        (stat_dict[record.id]["trimmed_mean_qual"] >= min_mean_qual):
                    stat_dict[record.id]["retained"] = "+"
                else:
                    low_quality_counter += 1
            else:
                stat_dict[record.id]["retained"] = "+"
        else:
            too_short_counter += 1

        if stat_dict[record.id]["retained"] == "-":
            excluded_list.append(record.id)
            # Count every excluded record; the original never incremented this
            # counter, so the summary always reported "Excluded: 0"
            excluded_counter += 1
            continue

        SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
        SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

        MatplotlibRoutines.draw_bar_plot(trimmed_record.letter_annotations["phred_quality"],
                                         record_trimmed_qual_plot_prefix,
                                         extentions=["png"],
                                         xlabel="Position",
                                         ylabel="Phred quality",
                                         title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        trimmed_record_dict[record.id] = trimmed_record

    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fasta, format="fasta")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fastq,
                format="fastq")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fasta,
                format="fasta")

    excluded_list.write("%s.excluded.ids" % output_prefix)
    stat_dict.write(out_filename="%s.stats" % output_prefix)

    print("Excluded: %i" % excluded_counter)
    print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
    print("\tLow quality( median < %i or mean < %i ): %i" % (min_median_qual, min_mean_qual,
                                                             low_quality_counter))
                                                    count_dict[args.name_b][scaffold][i],
                                                    ratio))
            elif ratio < (1.0 / float(args.minimal_ratio)):
                vcf_b_more_variants_file_fd.write("%s\t%i\t%i\t%i\t%i\t%i\t%.3f\n" % (scaffold, start, stop, i,
                                                                                      count_dict[args.name_a][scaffold][i],
                                                                                      count_dict[args.name_b][scaffold][i],
                                                                                      ratio))
            elif count_dict[args.name_a][scaffold][i] == 0:
                vcf_a_no_variants_file_fd.write("%s\t%i\t%i\t%i\n" % (scaffold, start, stop, i))
            elif count_dict[args.name_b][scaffold][i] == 0:
                vcf_b_no_variants_file_fd.write("%s\t%i\t%i\t%i\n" % (scaffold, start, stop, i))
    else:
        if scaffold not in count_dict[args.name_a]:
            vcf_a_absent_scaffolds_id_list.append(scaffold)
        if scaffold not in count_dict[args.name_b]:
            vcf_b_absent_scaffolds_id_list.append(scaffold)

vcf_a_more_variants_file_fd.close()
vcf_b_more_variants_file_fd.close()
vcf_a_no_variants_file_fd.close()
vcf_b_no_variants_file_fd.close()
vcf_density_ratio_fd.close()

vcf_a_absent_scaffolds_id_list.write(vcf_a_absent_scaffolds_id_file)
vcf_b_absent_scaffolds_id_list.write(vcf_b_absent_scaffolds_id_file)
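# Output format note (a reading aid derived from the write calls above):
# lines in the *more_variants files follow
# "scaffold<TAB>start<TAB>stop<TAB>window<TAB>count_a<TAB>count_b<TAB>ratio",
# while the *no_variants files carry only the first four columns.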
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

annotations_dict = SeqIO.to_dict(GFF.parse(open(args.input)))

single_gene_id_list = IdList()

for record in annotations_dict:
    for feature in annotations_dict[record].features:
        #print feature.id
        if feature.type != "gene":
            continue
        for subfeature in feature.sub_features:
            if subfeature.type != "mRNA":
                continue
            exon_number = 0
            for mRNA_subfeature in subfeature.sub_features:
                if mRNA_subfeature.type == "exon":
                    exon_number += 1
            if exon_number == 1:
                single_gene_id_list.append(feature.id)

single_gene_id_list.write(out_fd, close_after_if_file_object=True)

"""
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(record_by_id_generator(sequence_dict, sequence_groups_id[group]),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
"""
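# Usage sketch (hypothetical invocation; the option spellings --input and
# --output are assumptions based on the `args` attributes used above):
#
#   python get_single_exon_genes.py --input annotation.gff --output single_exon_genes.ids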
def parallel_run(self, input_dir, output_dir, output_prefix, input_type="codon",
                 min_seq_number_for_conserved_position=None,
                 min_seq_number_for_flank_position=None,
                 max_pos_number_for_noncons_contig_pos=None,
                 min_block_len=None, allow_gaps="half", save_postscript=True,
                 output_type="htm", threads=None):
    if threads:
        self.threads = threads

    data_dir = "%s/data/" % output_dir
    postscript_dir = "%s/ps/" % output_dir
    results_dir = "%s/results/" % output_dir
    htm_dir = "%s/htm/" % output_dir
    for directory in output_dir, data_dir, postscript_dir, results_dir, htm_dir:
        self.safe_mkdir(directory)

    #input_files_list = map(os.path.abspath, self.make_list_of_path_to_files(input_directory))
    input_files_list = self.make_list_of_path_to_files(input_dir, return_absolute_paths=True)
    for entry in input_files_list:
        directory, prefix, extension = self.split_filename(entry)
        os.system("ln -s %s %s/%s%s" % (entry, data_dir, prefix, extension))

    data_files_list = self.make_list_of_path_to_files(data_dir, return_absolute_paths=True)

    common_options = self.parse_options(input_type=input_type,
                                        min_seq_number_for_conserved_position=min_seq_number_for_conserved_position,
                                        min_seq_number_for_flank_position=min_seq_number_for_flank_position,
                                        max_pos_number_for_noncons_contig_pos=max_pos_number_for_noncons_contig_pos,
                                        min_block_len=min_block_len,
                                        allow_gaps=allow_gaps,
                                        save_postscript=save_postscript,
                                        output_type=output_type,
                                        concatenate_blocks_from_aignments=None)

    options_list = []
    for data_file in data_files_list:
        options = " %s" % data_file
        options += " %s" % common_options
        options_list.append(options)

    self.parallel_execute(options_list=options_list)

    block_coordinates = OrderedDict()
    skipped_ids_file = "%s/%s.skipped.ids" % (output_dir, output_prefix)
    skipped_ids = IdList()

    for filename in data_files_list:
        data_dir, prefix, extension = self.split_filename(filename)
        blocks_file = "%s-gb" % filename
        htm_file = "%s-gb.htm" % filename
        postscript_file = "%s-gbPS" % filename
        if (not os.path.exists(blocks_file)) or (not os.path.exists(htm_file)):
            skipped_ids.append(prefix)
            print("Warning!!! %s skipped..." % prefix)
            continue
        block_coordinates[prefix] = self.extract_block_coordinates(htm_file)

        os.system("mv %s %s/%s.ps" % (postscript_file, postscript_dir, prefix))
        os.system("mv %s %s/%s.htm" % (htm_file, htm_dir, prefix))
        self.convert_output_to_fasta(blocks_file, "%s/%s%s" % (results_dir, prefix, extension))
        os.remove(blocks_file)

    block_coordinates_file = "%s/%s.block.coordinates" % (output_dir, output_prefix)
    skipped_ids.write(skipped_ids_file)
    with open(block_coordinates_file, "w") as block_fd:
        for entry in block_coordinates:
            coordinates_string = ";".join(map(lambda s: "%i,%i" % (s[0], s[1]),
                                              block_coordinates[entry]))
            block_fd.write("%s\t%s\n" % (entry, coordinates_string))
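# Usage sketch (illustrative; `gblocks` stands for an instance of the tool
# wrapper exposing this method, and input_dir is assumed to hold one FASTA
# alignment per file):
#
#   gblocks.parallel_run("alignments/", "gblocks_out", "my_run",
#                        input_type="codon", min_block_len=10, threads=8)
#   # -> per-alignment results in gblocks_out/results/, skipped alignments in
#   #    gblocks_out/my_run.skipped.ids, block coordinates in
#   #    gblocks_out/my_run.block.coordinates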
    if args.all or args.alignment:
        os.system("wget %s" % alignment_options)
    if args.all or args.tree:
        os.system("wget %s" % tree_options)
    if args.all or args.hmm:
        os.system("wget %s" % hmm_options)

pool = Pool(args.threads)
pool.map(download_data, family_ids)
pool.close()

for fam_id in family_ids:
    if args.all or args.alignment:
        if os.path.getsize("%s%s.fasta" % (args.output_dir, fam_id)) == 0:
            absent_alignment_list.append(fam_id)
    if args.all or args.tree:
        if os.path.getsize("%s%s.nwk" % (args.output_dir, fam_id)) == 0:
            absent_tree_list.append(fam_id)
    if args.all or args.hmm:
        if os.path.getsize("%s%s.hmm" % (args.output_dir, fam_id)) == 0:
            absent_hmm_list.append(fam_id)

if absent_alignment_list:
    absent_alignment_list.write("absent_alignments.ids")
    print("%i alignments were not downloaded" % len(absent_alignment_list))
if absent_tree_list:
    absent_tree_list.write("absent_trees.ids")
    print("%i trees were not downloaded" % len(absent_tree_list))
if absent_hmm_list:
    absent_hmm_list.write("absent_hmms.ids")
    print("%i hmms were not downloaded" % len(absent_hmm_list))
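# Usage sketch (hypothetical invocation; the option spellings are assumptions
# based on the `args` attributes referenced above: args.all, args.threads,
# args.output_dir, args.alignment, args.tree, args.hmm):
#
#   python download_family_data.py --all --threads 4 --output_dir families/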