def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()
    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query,
                                                   hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    if args.output != "stdout":
        out_fd.close()
    os.remove(index_file)
    return not_significant_ids, not_found_ids
def prepare_annotation_file_from_transcript_and_cds(self, transcript_file, cds_file,
                                                    correspondence_file, output_prefix,
                                                    format="fasta",
                                                    correspondence_key_column=0,
                                                    correspondence_value_column=1,
                                                    verbose=False):
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)
    correspondence_dict = SynDict(filename=correspondence_file, comments_prefix="#",
                                  key_index=correspondence_key_column,
                                  value_index=correspondence_value_column)

    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()

    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)

    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)
def extract_entries_by_GO_from_eggnogmapper_output(eggnogmapper_output, GO_file, output_prefix,
                                                   comments_prefix="#", separator="\t"):
    GO_list = IdList(filename=GO_file, column_number=0)
    print(len(GO_list))
    extracted_entries_file = "%s.annotations" % output_prefix
    extracted_entries = 0
    with open(eggnogmapper_output, "r") as eggnog_fd:
        with open(extracted_entries_file, "w") as out_fd:
            for line in eggnog_fd:
                if line[0] == comments_prefix:
                    out_fd.write(line)
                    continue
                line_list = line.strip().split(separator)
                entry_GO_list = line_list[5].split(",")
                for GO in entry_GO_list:
                    if GO in GO_list:
                        out_fd.write(line)
                        extracted_entries += 1
                        break
    print("Extracted %i entries" % extracted_entries)
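# Usage sketch (hypothetical file names): keep only eggNOG-mapper rows annotated with at
# least one GO term listed in "target_GO.ids"; matching rows go to "filtered.annotations".
# extract_entries_by_GO_from_eggnogmapper_output("species.emapper.annotations",
#                                                "target_GO.ids",
#                                                "filtered")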
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file,
                                              mode="w", cluster_column=0, element_column=1,
                                              column_separator="\t", element_separator=",",
                                              id_column=None):
    """
    mode:
        "w" - if elements from element_id_list are present in a cluster, extract only those elements
        "a" - if elements from element_id_list are present in a cluster, extract all its elements
    """
    cluster_dict = SynDict(filename=cluster_file, split_values=True, comments_prefix="#",
                           key_index=cluster_column, value_index=element_column,
                           separator=column_separator, values_separator=element_separator)
    element_id_list = IdList(filename=element_file, comments_prefix="#", column_number=id_column)
    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list,
                                                              mode=mode)
    extracted_clusters.write(output_file, splited_values=True)
def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix=None):
    if output_prefix:
        length_file = "%s.protein_length.tsv" % output_prefix
        longest_protein_id_file = "%s.longest_pep.ids" % output_prefix
        len_fd = open(length_file, 'w')
        len_fd.write("#gene_id\tprotein_id\tprotein_length\n")
    data_dict = OrderedDict()
    for protein_id in protein_dict:
        length = len(protein_dict[protein_id].seq)
        description_list = protein_dict[protein_id].description.split()
        for entry in description_list:
            if "gene:" in entry:
                gene_id = entry.split(":")[1]
                if output_prefix:
                    len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
                if gene_id not in data_dict:
                    data_dict[gene_id] = protein_id
                else:
                    if length > len(protein_dict[data_dict[gene_id]].seq):
                        data_dict[gene_id] = protein_id
    longest_pep_ids = IdList(data_dict.values())
    if output_prefix:
        longest_pep_ids.write(longest_protein_id_file)
        len_fd.close()
    return longest_pep_ids
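# Usage sketch (hypothetical file name; assumes Biopython's SeqIO is importable here):
# from Bio import SeqIO
# pep_dict = SeqIO.to_dict(SeqIO.parse("species.ensembl.pep.fasta", "fasta"))
# longest_ids = get_longest_pep_per_gene_from_ensembl_pep_dict(pep_dict, output_prefix="species")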
def seq_ids(self):
    id_list = IdList()
    for record in self.records:
        id_list.append(record.id)
    return id_list
def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd:
        with open(longest_output_file, "w") as longest_out_fd:
            for gene in accordance_dict:
                current_transcript = None
                current_length = 0
                for transcript in accordance_dict[gene]:
                    if length_dict[transcript] > current_length:
                        current_transcript = transcript
                        current_length = length_dict[transcript]
                    all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))
                longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
                longest_list.append(current_transcript)
    longest_list.write(longest_id_file)
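# Usage sketch (hypothetical file names): "gene_to_transcript.tsv" is a two-column
# gene-to-transcript correspondence, "transcript_lengths.tsv" maps transcript ids to lengths.
# add_length_to_accordance_file("gene_to_transcript.tsv", "transcript_lengths.tsv", "species")
# writes species.all.correspondence, species.longest.correspondence and species.longest.ids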
def extract_top_hits_from_target_gff(list_of_target_gff, top_hits_gff, secondary_hits_gff,
                                     id_white_list_file=None, max_hits_per_query=None):
    if id_white_list_file:
        white_ids = IdList()
        white_ids.read(id_white_list_file)
    top_hits_gff_fd = open(top_hits_gff, "w")
    secondary_hits_gff_fd = open(secondary_hits_gff, "w")
    targets_list = []
    hit_counter = 0
    gene_counter = 0
    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if id_white_list_file:
                        if target_name not in white_ids:
                            continue
                    if target_name not in targets_list:
                        writing_fd = top_hits_gff_fd
                        targets_list.append(target_name)
                        gene_counter += 1
                        hit_counter = 0
                    else:
                        writing_fd = secondary_hits_gff_fd
                    hit_counter += 1
                    tmp = tmp.replace("gene_id 0",
                                      "gene_id g%i_h%i" % (gene_counter, hit_counter))
                    # guard against comparison with None when no hit limit is set
                    if (max_hits_per_query is None) or (hit_counter <= max_hits_per_query):
                        writing_fd.write(tmp)
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        if max_hits_per_query:
                            if hit_counter > max_hits_per_query:
                                continue
                        writing_fd.write(tmp)
                        if tmp == "":
                            break
    top_hits_gff_fd.close()
    secondary_hits_gff_fd.close()
def extract_annotation_by_refence_id(list_of_target_gff, id_file, extracted_gff, filtered_out_gff):
    ids = IdList()
    ids.read(id_file)
    extracted_gff_fd = open(extracted_gff, "w")
    filtered_out_gff_fd = open(filtered_out_gff, "w")
    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if target_name not in ids:
                        writing_fd = filtered_out_gff_fd
                    else:
                        writing_fd = extracted_gff_fd
                    writing_fd.write(tmp)
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        writing_fd.write(tmp)
                        if tmp == "":
                            break
    extracted_gff_fd.close()
    filtered_out_gff_fd.close()
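# Usage sketch (hypothetical file names), splitting GFF dumps by reference id:
# extract_annotation_by_refence_id(["target_hits.gff"], "reference.ids",
#                                  "extracted.gff", "filtered_out.gff")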
def extract_top_hits(self, hmmer_hits, output_prefix, parsing_mode="index_db"):
    print(hmmer_hits)
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()

    top_hits_file = "%s.top_hits" % output_prefix
    top_hits_ids_file = "%s.top_hits.ids" % output_prefix
    not_significant_ids_file = "%s.not_significant.ids" % output_prefix
    not_found_ids_file = "%s.not_found.ids" % output_prefix
    index_file = "%s.hmmer_hits.tmp.idx" % output_prefix

    # pass the index file so that it can be removed below in "index_db" mode
    hmm_dict = self.parse_search_file(hmmer_hits, parsing_mode, format="hmmer3-text",
                                      index_file=index_file)
    print(len(hmm_dict))
    with open(top_hits_file, "w") as out_fd:
        out_fd.write("#query\thit\tevalue\tbitscore\n")
        for query in hmm_dict:
            if hmm_dict[query].hits:
                if hmm_dict[query][0].is_included:
                    out_fd.write("%s\t%s\t%s\t%s\n" % (query,
                                                       hmm_dict[query][0].id,
                                                       hmm_dict[query][0].evalue,
                                                       hmm_dict[query][0].bitscore))
                    top_hits_ids.append(query)
                else:
                    not_significant_ids.append(query)
            else:
                not_found_ids.append(query)
    if parsing_mode == "index_db":
        os.remove(index_file)

    for id_list, id_file in zip([not_significant_ids, not_found_ids, top_hits_ids],
                                [not_significant_ids_file, not_found_ids_file, top_hits_ids_file]):
        id_list.write(id_file)
def create_per_cluster_element_id_files(self, cluster_dict, output_directory):
    self.safe_mkdir(output_directory)
    for cluster_id in cluster_dict:
        cluster_element_id_list = IdList(cluster_dict[cluster_id])
        cluster_element_id_list.write("%s/%s.ids" % (output_directory, cluster_id))
def filter_psl_by_ids_from_file(self, psl_file, output_file,
                                white_query_id_file=None, black_query_id_file=None,
                                white_target_id_file=None, black_target_id_file=None):
    self.filter_psl_by_ids(psl_file, output_file,
                           white_query_id_list=IdList(filename=white_query_id_file) if white_query_id_file else (),
                           black_query_id_list=IdList(filename=black_query_id_file) if black_query_id_file else (),
                           white_target_id_list=IdList(filename=white_target_id_file) if white_target_id_file else (),
                           black_target_id_list=IdList(filename=black_target_id_file) if black_target_id_file else ())
def extract_emapper_annotations_by_protein_ids(emapper_annotation_file, protein_id_file,
                                               output_annotations):
    protein_ids = IdList(filename=protein_id_file)
    with open(emapper_annotation_file, "r") as ann_fd:
        with open(output_annotations, "w") as out_fd:
            for line in ann_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                if line.split("\t")[0] in protein_ids:
                    out_fd.write(line)
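# Usage sketch (hypothetical file names):
# extract_emapper_annotations_by_protein_ids("species.emapper.annotations",
#                                            "selected_proteins.ids",
#                                            "selected.emapper.annotations")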
def cluster_sequence_names_by_id_fragment_from_file(self, seq_id_file, id_element_index,
                                                    id_separator="_", output_prefix=None):
    seq_id_list = IdList(filename=seq_id_file)
    self.cluster_sequence_names_by_id_fragment(seq_id_list, id_element_index,
                                               id_separator=id_separator,
                                               output_prefix=output_prefix)
def remove_elements_by_ids_from_files(self, input_file, output_file, black_list_file, mode="full"):
    cluster_dict = SynDict(filename=input_file, split_values=True)
    black_list = IdList(filename=black_list_file)
    filtered_dict = self.remove_elements_by_ids(cluster_dict, black_list, mode=mode)
    filtered_dict.write(output_file, splited_values=True)
def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                          comments_prefix="#", column_number=None):
    id_list = IdList()
    id_list.read(input_file, column_separator=column_separator, comments_prefix=comments_prefix,
                 column_number=column_number, header=header)
    if output_file:
        id_list.write(output_file, header=header)
    return id_list
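# Usage sketch (hypothetical file names): pull the second column of a tab-separated table,
# skipping "#"-prefixed comment lines, save the ids and also get them back as an IdList.
# ids = extract_ids_from_file("table.tsv", output_file="table.ids", column_number=1)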
def extract_clusters_and_elements_by_labels_from_files(self, cluster_file, label_file, output_file,
                                                       separator="@", label_position="first"):
    cluster_dict = SynDict(filename=cluster_file, split_values=True)
    label_list = IdList(filename=label_file) if isinstance(label_file, str) else label_file
    output_dict = self.extract_clusters_and_elements_by_labels(cluster_dict, label_list,
                                                               separator=separator,
                                                               label_position=label_position)
    output_dict.write(output_file, splited_values=True)
def convert_gff_to_simple_bed(input_gff, output_bed, feature_type_list=[], scaffold_id_file=None):
    if scaffold_id_file:
        scaffolds_id_list = IdList(filename=scaffold_id_file)
    with open(input_gff, "r") as gff_fd:
        with open(output_bed, "w") as bed_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                tmp_list = line.strip().split("\t")
                if scaffold_id_file:
                    if tmp_list[0] not in scaffolds_id_list:
                        continue
                if feature_type_list:
                    if tmp_list[2] not in feature_type_list:
                        continue
                # note: GFF coordinates are written as-is (1-based start),
                # i.e. no conversion to 0-based BED starts is performed
                bed_fd.write("%s\t%s\t%s\n" % (tmp_list[0], tmp_list[3], tmp_list[4]))
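# Usage sketch (hypothetical file names): keep only gene features from selected scaffolds.
# Coordinates are passed through unconverted; see the comment inside the function.
# convert_gff_to_simple_bed("annotation.gff", "genes.bed", feature_type_list=["gene"],
#                           scaffold_id_file="selected_scaffolds.ids")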
def extract_evidence_by_ids(evidence_file, id_file, output_evidence_file, mode="transcript"):
    # possible modes: transcript, gene
    ids = IdList()
    ids.read(id_file, comments_prefix="#")

    column_id = 0 if mode == "gene" else 1

    with open(evidence_file, "r") as ev_fd:
        with open(output_evidence_file, "w") as out_fd:
            for line in ev_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                entry_id = line.split("\t")[column_id]
                if entry_id in ids:
                    out_fd.write(line)
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file,
                                            output_dir="./", pep_format="fasta",
                                            out_prefix=None, create_dir_for_each_family=False):
    from RouToolPa.Routines import SequenceRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                fam_dict[fam_id],
                                                                verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
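# Usage sketch (hypothetical file names): write one .pep file per selected family.
# Note that a temporary "tmp.idx" index is created in the current working directory.
# extract_proteins_from_selected_families("selected_families.ids", "clusters.fam",
#                                         "proteins.pep", output_dir="per_family_pep/")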
def combine_ds_dn_w_from_bootstrap_data(self, input_dir, output_dir, use_node_names_if_possible=True):
    dn_dir = "%s/dN/" % output_dir
    ds_dir = "%s/dS/" % output_dir
    w_dir = "%s/W/" % output_dir
    for directory in output_dir, dn_dir, ds_dir, w_dir:
        self.safe_mkdir(directory)

    input_files = map(lambda s: "%s/%s" % (input_dir, s), os.listdir(input_dir))

    data_dict = OrderedDict()
    for filename in input_files:
        with open(filename, "r") as in_fd:
            in_fd.readline()  # read header
            for line in in_fd:
                node_id, node_name, dn, ds, w = line.strip().split("\t")
                if use_node_names_if_possible:
                    node = node_id if node_name == "." else node_name
                else:
                    node = node_id
                if node not in data_dict:
                    data_dict[node] = OrderedDict()
                    for parameter in "dN", "dS", "W":
                        data_dict[node][parameter] = IdList()
                data_dict[node]["dN"].append(dn)
                data_dict[node]["dS"].append(ds)
                data_dict[node]["W"].append(w)

    for node in data_dict:
        for parameter in "dN", "dS", "W":
            out_file = "%s/%s/%s.tsv" % (output_dir, parameter, node)
            data_dict[node][parameter].write(out_file)
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict,
                                           output_prefix=None, species_id=None):
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                family_id = transposed_eggnog_fam_dict[extended_protein_id][0]
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = [family_id]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(family_id)
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if family_id not in extracted_families:
                    extracted_families[family_id] = eggnog_fam_dict[family_id]
        if not_found:
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix,
                                 splited_values=True)
        common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix,
                                                    splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
                                                       splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    return extracted_families, common_protein_names_to_families_dict, \
           common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index, id_separator="_",
                                          output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()
    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # key by the chosen fragment of the split id, not by the input list
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict
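# Sketch of the expected behavior (hypothetical ids): with id_element_index=0 and the
# default "_" separator, ["geneA_t1", "geneA_t2", "geneB_t1"] clusters to
# {"geneA": ["geneA_t1", "geneA_t2"], "geneB": ["geneB_t1"]}; ids with too few
# fragments go to the skipped list.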
                    default=1,
                    help="Number of simultaneous downloads")
parser.add_argument("-c", "--connections", action="store", dest="connections", type=int,
                    default=8,
                    help="Number of connections for each download")

args = parser.parse_args()

if (not args.ids) and (not args.id_file):
    raise ValueError("Neither ids nor id file was set")

loader = IdList()
id_list = loader.read(args.id_file) if args.id_file else args.ids

Axel.threads = args.threads
Axel.parallel_download_from_sra(id_list, args.connections)
"""
options_list = []
for entry_id in id_list:
    ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id)
    options_list.append("-n %i %s" % (args.connections, ftp_path))

tool = Tool(cmd="axel", max_threads=args.threads)
tool.parallel_execute(options_list)

for filename in os.listdir(os.getcwd()):
def prepare_template_for_popart(alignment_file, output_file, haplotype_fam_file=None,
                                traits_file=None, whitelist_file=None):
    from RouToolPa.Parsers.Sequence import CollectionSequence

    sequence_collection = CollectionSequence(in_file=alignment_file, parsing_mode="parse")
    sequence_collection.get_stats_and_features(count_gaps=False, sort=False)
    whitelist = IdSet(filename=whitelist_file)

    alignment_len = sequence_collection.seq_lengths["length"].unique()
    if len(alignment_len) > 1:
        raise ValueError("ERROR!!! Sequences in alignment have different lengths!")
    alignment_len = alignment_len[0]

    haplotype_selected_sequence_dict = SynDict()
    haplotypes_without_sequences_ids = IdList()

    traits_df = pd.read_csv(traits_file, sep="\t", index_col=0) if traits_file else pd.DataFrame()

    if haplotype_fam_file:
        haplotype_dict = SynDict(filename=haplotype_fam_file, split_values=True)
        for haplotype_id in haplotype_dict:
            for sequence_id in haplotype_dict[haplotype_id]:
                if sequence_id in sequence_collection.records:
                    haplotype_selected_sequence_dict[haplotype_id] = sequence_id
                    break
            else:
                haplotypes_without_sequences_ids.append(haplotype_id)
    else:
        haplotype_dict = dict([(entry, [entry]) for entry in sequence_collection.scaffolds])
        haplotype_selected_sequence_dict = dict([(entry, entry) for entry in sequence_collection.scaffolds])

    final_haplotype_set = (set(haplotype_selected_sequence_dict.keys()) & whitelist) if whitelist \
        else set(haplotype_selected_sequence_dict.keys())

    with open(output_file, "w") as out_fd:
        out_fd.write("#NEXUS\n\n")
        out_fd.write("BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n" % (len(final_haplotype_set),
                                                                                                                             alignment_len))
        out_fd.write("\tMATRIX\n")
        for haplotype_id in final_haplotype_set:
            out_fd.write("\t\t%s %s\n" % (haplotype_id,
                                          sequence_collection.records[haplotype_selected_sequence_dict[haplotype_id]]))
        out_fd.write("\t;\nEND;\n\n")

        if not traits_df.empty:
            traits_number = len(traits_df.columns)
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n".format(traits_number))
            out_fd.write("\tTraitLabels {0};\n".format(" ".join(traits_df.columns)))
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %s\n" % (haplotype_id,
                                              ",".join(map(str, traits_df.loc[haplotype_id])) if haplotype_id in traits_df.index else ("0," * traits_number)[:-1]))
        else:
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n")
            out_fd.write("\tTraitLabels Area;\n")
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %i\n" % (haplotype_id, len(haplotype_dict[haplotype_id])))
        out_fd.write("\t;\nEND;\n\n")
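# Usage sketch (hypothetical file names): build a PopART-ready NEXUS file from an alignment,
# collapsing haplotypes via a .fam file and taking trait counts from a tab-separated table.
# prepare_template_for_popart("haplotypes.aligned.fasta", "popart_input.nex",
#                             haplotype_fam_file="haplotypes.fam", traits_file="traits.tsv")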
action="store", dest="output_prefix", default="stdout", help="Prefix of output file") args = parser.parse_args() out_fd = sys.stdout if args.output_prefix == "stdout" else open( "%s_reference_random_genes.ids" % args.output_prefix, "w") reference_families = SynDict() reference_families.read(args.reference_fam, separator="\t", split_values=True, values_separator=",") node_family_ids = IdList() node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t") reference_random_genes = SynDict() for family_id in node_family_ids: if family_id not in reference_families: reference_random_genes[family_id] = "." else: reference_random_genes[family_id] = choice( reference_families[family_id]) reference_random_genes.write("%s_reference_random_genes.t" %
__author__ = 'Sergei F. Kliver'
import sys
import argparse

from RouToolPa.Collections.General import IdList

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--fam_file", action="store", dest="fam_file", required=True,
                    help="File with families")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write ids")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file, close_after_if_file_object=True, column_number=1,
             id_in_column_separator=",")
# write to the already opened descriptor so the "stdout" default actually works
id_list.write(out_fd, close_after_if_file_object=True)
def handle_sanger_data(self, input_dir, output_prefix, outdir=None, read_subfolders=False,
                       min_mean_qual=0, min_median_qual=0, min_len=50):
    if outdir:
        self.workdir = outdir
    self.init_dirs()

    sanger_filelist = self.make_list_of_path_to_files(input_dir,
                                                      expression=self.is_sanger_file,
                                                      recursive=read_subfolders,
                                                      return_absolute_paths=True)
    stat_dict = TwoLvlDict()
    record_dict = OrderedDict()
    trimmed_record_dict = OrderedDict()
    excluded_list = IdList()
    excluded_counter = 0
    low_quality_counter = 0
    too_short_counter = 0

    merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
    merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
    merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir, output_prefix)
    merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir, output_prefix)

    for filename in sanger_filelist:
        filename_list = self.split_filename(filename)

        record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir, filename_list[1])
        record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir, filename_list[1])
        record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (self.workdir, filename_list[1])
        record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (self.workdir, filename_list[1])
        record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (self.workdir, filename_list[1])
        record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (self.workdir, filename_list[1])

        record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
        record_dict[record.id] = record
        SeqIO.write(record, record_raw_fastq, format="fastq")
        SeqIO.write(record, record_raw_fasta, format="fasta")

        trimmed_record = SeqIO.AbiIO._abi_trim(record)

        stat_dict[record.id] = OrderedDict({
            "raw_len": len(record),
            "raw_mean_qual": np.mean(record.letter_annotations["phred_quality"]),
            "raw_median_qual": np.median(record.letter_annotations["phred_quality"]),
            "trimmed_len": len(trimmed_record),
            "trimmed_mean_qual": np.mean(trimmed_record.letter_annotations["phred_quality"]),
            "trimmed_median_qual": np.median(trimmed_record.letter_annotations["phred_quality"]),
            "retained": "-",
        })

        MatplotlibRoutines.draw_bar_plot(record.letter_annotations["phred_quality"],
                                         record_raw_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position",
                                         ylabel="Phred quality", title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        if stat_dict[record.id]["trimmed_len"] >= min_len:
            if min_median_qual:
                if (stat_dict[record.id]["trimmed_median_qual"] >= min_median_qual) and \
                        (stat_dict[record.id]["trimmed_mean_qual"] >= min_mean_qual):
                    stat_dict[record.id]["retained"] = "+"
                else:
                    low_quality_counter += 1
            else:
                stat_dict[record.id]["retained"] = "+"
        else:
            too_short_counter += 1

        if stat_dict[record.id]["retained"] == "-":
            # count the excluded record so that the summary printed below is correct
            excluded_counter += 1
            excluded_list.append(record.id)
            continue

        SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
        SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

        MatplotlibRoutines.draw_bar_plot(trimmed_record.letter_annotations["phred_quality"],
                                         record_trimmed_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position",
                                         ylabel="Phred quality", title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        trimmed_record_dict[record.id] = trimmed_record

    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fasta, format="fasta")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fasta, format="fasta")

    excluded_list.write("%s.excluded.ids" % output_prefix)
    stat_dict.write(out_filename="%s.stats" % output_prefix)

    print("Excluded: %i" % excluded_counter)
    print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
    print("\tLow quality( median < %i or mean < %i ): %i" % (min_median_qual, min_mean_qual,
                                                             low_quality_counter))
"-p", "--depth", action="store", dest="depth", type=int, default=2, help= "The maximum depth to perform extraction of cluster using inconsistent method. Default: 2" ) parser.add_argument( "-a", "--scaffold_white_list", action="store", dest="scaffold_white_list", default=[], type=lambda s: IdList(filename=s) if os.path.exists(s) else s.split(","), help="Comma-separated list of the only scaffolds to draw. Default: all") parser.add_argument( "-b", "--scaffold_black_list", action="store", dest="scaffold_black_list", default=[], type=lambda s: IdList(filename=s) if os.path.exists(s) else s.split(","), help= "Comma-separated list of scaffolds to skip at drawing. Default: not set") parser.add_argument( "-z", "--scaffold_ordered_list",
parser.add_argument("-f", "--value_file", action="store", dest="value_file", required=True, help="Value with values to seek for") parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True, help="Output .gff file") parser.add_argument( "-d", "--description_fields", action="store", dest="field_id_list", type=lambda s: s.split(","), required=True, help="Comma-separated list of fields in gff description to check") args = parser.parse_args() value_list = IdList(filename=args.value_file) AnnotationsRoutines.extract_gff_records_by_description_value( args.input_gff, args.output_gff, args.field_id_list, value_list, retain_comments=False)