def extract_top_hits(hmmer_hits, top_hits_file, top_hits_ids_file=None,
                     not_significant_ids_file=None, not_found_ids_file=None):
    # Report only the best (first) hit per query from a hmmer3 plain-text report
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()

    index_file = "hmmer_hits.tmp.idx"
    hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")

    out_fd = open(top_hits_file, "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")

    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
                top_hits_ids.append(query)
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)

    out_fd.close()
    os.remove(index_file)

    if not_significant_ids_file:
        not_significant_ids.write(not_significant_ids_file)
    if not_found_ids_file:
        not_found_ids.write(not_found_ids_file)
    if top_hits_ids_file:
        top_hits_ids.write(top_hits_ids_file)
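# Hypothetical usage sketch (not part of the module): write the best hit per query
# from a hmmscan/hmmsearch plain-text report to a table and dump the three id lists.
# All file names below are placeholders.
#
#   extract_top_hits("hmmscan_report.txt", "top_hits.tsv",
#                    top_hits_ids_file="top_hits.ids",
#                    not_significant_ids_file="not_significant.ids",
#                    not_found_ids_file="not_found.ids")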
def prepare_annotation_file_from_transcript_and_cds(self, transcript_file, cds_file, correspondence_file,
                                                    output_prefix, format="fasta",
                                                    correspondence_key_column=0, correspondence_value_column=1,
                                                    verbose=False):
    transcript_dict = self.parse_seq_file(transcript_file, "parse", format=format)
    cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

    correspondence_dict = SynDict(filename=correspondence_file, comments_prefix="#",
                                  key_index=correspondence_key_column,
                                  value_index=correspondence_value_column)

    no_corresponding_cds_transcript_list = IdList()
    cds_not_found_transcript_list = IdList()

    annotation_file = "%s.annotation" % output_prefix
    no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
    cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

    with open(annotation_file, "w") as annotation_fd:
        for transcript_id in transcript_dict:
            if transcript_id not in correspondence_dict:
                no_corresponding_cds_transcript_list.append(transcript_id)
                if verbose:
                    print("No cds in correspondence file for transcript %s" % transcript_id)
                continue
            cds_id = correspondence_dict[transcript_id]
            length = len(cds_dict[cds_id].seq)
            # locate the CDS inside the transcript to get the annotation start coordinate
            start = transcript_dict[transcript_id].seq.upper().find(cds_dict[cds_id].seq.upper())
            if start == -1:
                cds_not_found_transcript_list.append(transcript_id)
                if verbose:
                    print("CDS was not found for transcript %s" % transcript_id)
                continue
            annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id, start + 1, length)
            annotation_fd.write(annotation_string)

    no_corresponding_cds_transcript_list.write(no_corresponding_cds_transcript_file)
    cds_not_found_transcript_list.write(cds_not_found_transcript_file)
def divide_counts_by_base_level(input_file, output_prefix, base_level, separator="\t",
                                verbose=True, secondary_base_lvl=None):
    output_file = "%s.divided_by_baselvl" % output_prefix
    zero_base_lvl_list = IdList()
    zero_both_base_lvls_list = IdList()
    zero_base_lvl_list_file = "%s.zero_base_lvl.ids" % output_prefix
    zero_both_base_lvls_list_file = "%s.zero_base_lvls.ids" % output_prefix

    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)
        # first column holds ids, so data indices are shifted by one relative to the header
        data_base_level_index = header_list.index(base_level) - 1
        if secondary_base_lvl:
            data_secondary_base_level_index = header_list.index(secondary_base_lvl) - 1

        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(list(map(float, tmp_line[1:])))

                if data[data_base_level_index] == 0:
                    zero_base_lvl_list.append(tmp_line[0])
                    if not secondary_base_lvl:
                        if verbose:
                            print("Zero base level(%s) for %s...Skipping..." % (base_level, tmp_line[0]))
                        continue

                if secondary_base_lvl:
                    if data[data_secondary_base_level_index] == 0:
                        zero_both_base_lvls_list.append(tmp_line[0])
                        if verbose:
                            print("Both base levels are zero (%s, %s) for %s...Skipping..." % (base_level,
                                                                                               secondary_base_lvl,
                                                                                               tmp_line[0]))
                        continue
                    # fall back to the secondary base level if the primary one is zero
                    data /= data[data_base_level_index] if data[data_base_level_index] != 0 \
                        else data[data_secondary_base_level_index]
                else:
                    data /= data[data_base_level_index]

                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                out_fd.write(output_string)

    zero_base_lvl_list.write(zero_base_lvl_list_file)
    zero_both_base_lvls_list.write(zero_both_base_lvls_list_file)
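# Hypothetical usage sketch (not part of the module): for a tab-separated count table
# whose first column holds feature ids and whose header names the samples, divide every
# row by its "liver" value, falling back to "kidney" where the primary base level is zero.
# Column and file names are placeholders.
#
#   divide_counts_by_base_level("counts.tsv", "counts_norm", "liver",
#                               secondary_base_lvl="kidney")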
def divide_counts_by_max_level(input_file, output_prefix, separator="\t", verbose=True):
    output_file = "%s.divided_by_maxlvl" % output_prefix
    zero_max_lvl_list = IdList()
    zero_max_lvl_list_file = "%s.zero_max_lvl.ids" % output_prefix

    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)
        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(list(map(float, tmp_line[1:])))
                max_level = max(data)
                if max_level == 0:
                    zero_max_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max level for %s...Skipping..." % tmp_line[0])
                    continue
                data /= max_level

                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                out_fd.write(output_string)

    zero_max_lvl_list.write(zero_max_lvl_list_file)
def extract_annotation_by_refence_id(list_of_target_gff, id_file, extracted_gff, filtered_out_gff):
    ids = IdList()
    ids.read(id_file)

    extracted_gff_fd = open(extracted_gff, "w")
    filtered_out_gff_fd = open(filtered_out_gff, "w")

    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the first feature line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if target_name not in ids:
                        writing_fd = filtered_out_gff_fd
                    else:
                        writing_fd = extracted_gff_fd
                    writing_fd.write(tmp)
                    # copy the rest of the dump to the selected output file
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        writing_fd.write(tmp)
                        if tmp == "":
                            break

    extracted_gff_fd.close()
    filtered_out_gff_fd.close()
def extract_top_hits_from_target_gff(list_of_target_gff, top_hits_gff, secondary_hits_gff,
                                     id_white_list_file=None, max_hits_per_query=None):
    if id_white_list_file:
        white_ids = IdList()
        white_ids.read(id_white_list_file)

    top_hits_gff_fd = open(top_hits_gff, "w")
    secondary_hits_gff_fd = open(secondary_hits_gff, "w")

    targets_list = []
    hit_counter = 0
    gene_counter = 0

    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the first feature line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if id_white_list_file:
                        if target_name not in white_ids:
                            continue
                    if target_name not in targets_list:
                        # first dump for this target goes to the top hits file
                        writing_fd = top_hits_gff_fd
                        targets_list.append(target_name)
                        gene_counter += 1
                        hit_counter = 0
                    else:
                        writing_fd = secondary_hits_gff_fd
                    hit_counter += 1
                    tmp = tmp.replace("gene_id 0", "gene_id g%i_h%i" % (gene_counter, hit_counter))
                    if (max_hits_per_query is None) or (hit_counter <= max_hits_per_query):
                        writing_fd.write(tmp)
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        if max_hits_per_query:
                            if hit_counter > max_hits_per_query:
                                continue
                        writing_fd.write(tmp)
                        if tmp == "":
                            break

    top_hits_gff_fd.close()
    secondary_hits_gff_fd.close()
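# Hypothetical usage sketch (not part of the module): split GFF dumps delimited by
# "# --- START OF GFF DUMP ---" markers (as produced e.g. by exonerate) into top and
# secondary hits, keeping at most two hits per query and only whitelisted queries.
# File names are placeholders.
#
#   extract_top_hits_from_target_gff(["target.gff"],
#                                    "top_hits.gff", "secondary_hits.gff",
#                                    id_white_list_file="queries.ids",
#                                    max_hits_per_query=2)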
def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix=None):
    if output_prefix:
        length_file = "%s.protein_length.tsv" % output_prefix
        longest_protein_id_file = "%s.longest_pep.ids" % output_prefix
        len_fd = open(length_file, 'w')
        len_fd.write("#gene_id\tprotein_id\tprotein_length\n")

    data_dict = OrderedDict()
    for protein_id in protein_dict:
        length = len(protein_dict[protein_id].seq)
        description_list = protein_dict[protein_id].description.split()
        # extract the gene id from the Ensembl-style "gene:<id>" field of the description
        for entry in description_list:
            if "gene:" in entry:
                gene_id = entry.split(":")[1]
        if output_prefix:
            len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
        if gene_id not in data_dict:
            data_dict[gene_id] = protein_id
        else:
            if length > len(protein_dict[data_dict[gene_id]].seq):
                data_dict[gene_id] = protein_id

    longest_pep_ids = IdList(data_dict.values())
    if output_prefix:
        longest_pep_ids.write(longest_protein_id_file)
        len_fd.close()
    return longest_pep_ids
def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):
    try:
        os.mkdir(output_dir)
    except OSError:
        pass

    # collect profile names from the NAME lines of the hmm file
    id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True, awk_code="{print $2}",
                      capture_output=True)

    split_index = 1
    ids_written = 0
    ids_list = IdList()
    ids_list.read(id_fd, close_after_if_file_object=True)
    number_of_ids = len(ids_list)
    out_prefix = self.split_filename(hmmfile)[1] if output_prefix is None else output_prefix

    num_of_ids = int(number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file

    common_options = " -f"
    common_options += " %s" % hmmfile
    options_list = []

    while (ids_written + num_of_ids) <= number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:ids_written + num_of_ids])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)

        split_index += 1
        ids_written += num_of_ids

    # handle the remaining ids that did not fill a whole chunk
    if ids_written != number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)
        split_index += 1

    self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
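# Hypothetical usage sketch (not part of the module): split a profile database into
# chunks of 100 profiles each, fetching them in parallel with hmmfetch -f. Paths are
# placeholders and "routines" stands for an instance of the class this method belongs to.
#
#   routines.split_hmm("Pfam-A.hmm", "pfam_split/", 100, threads=8)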
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file, output_dir="./",
                                             seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()

    self.safe_mkdir(output_dir)
    out_dir = self.check_path(output_dir)
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", self.make_list_of_path_to_files(seq_file), format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id], protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(%s)" % (fam_id, ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue
        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                self.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id], verbose=True),
                        out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))

    return number_of_skipped_clusters
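# Hypothetical usage sketch (not part of the module): write one fasta file per selected
# cluster, pulling sequences from seq_file. Paths are placeholders and "routines" stands
# for an instance of the class this method belongs to.
#
#   routines.extract_sequences_from_selected_clusters("selected_clusters.ids",
#                                                     "clusters.fam",
#                                                     "proteins.fasta",
#                                                     output_dir="cluster_seqs/")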
def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                          comments_prefix="#", column_number=None):
    id_list = IdList()
    id_list.read(input_file, column_separator=column_separator, comments_prefix=comments_prefix,
                 column_number=column_number, header=header)
    if output_file:
        id_list.write(output_file, header=header)
    return id_list
def divide_counts_by_several_base_level(input_file, output_prefix, base_levels, separator="\t",
                                        verbose=True, max_ratio_to_base_lvl=0.5):
    output_file = "%s.divided_by_max_baselvl" % output_prefix
    max_ratio_to_base_lvl_file = "%s.divided_by_max_baselvl.max_%f_ratio" % (output_prefix,
                                                                             max_ratio_to_base_lvl)
    zero_max_base_lvl_list = IdList()
    zero_max_base_lvl_list_file = "%s.zero_base_lvls.ids" % output_prefix

    max_ratio_to_base_lvl_fd = open(max_ratio_to_base_lvl_file, "w")
    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)

        data_base_lvl_index_list = []
        # accept either a single level name or a list of level names
        base_level_list = [base_levels] if isinstance(base_levels, str) else base_levels
        for level in base_level_list:
            # first column holds ids, so data indices are shifted by one relative to the header
            data_base_lvl_index_list.append(header_list.index(level) - 1)

        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            max_ratio_to_base_lvl_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(list(map(float, tmp_line[1:])))
                max_base_lvl = max(np.take(data, data_base_lvl_index_list))
                if max_base_lvl == 0:
                    zero_max_base_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max base level(s) for %s...Skipping..." % tmp_line[0])
                    continue

                data /= max_base_lvl

                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"

                # also report rows where all non-base levels stay below the ratio threshold
                if max(np.delete(data, data_base_lvl_index_list)) <= max_ratio_to_base_lvl:
                    max_ratio_to_base_lvl_fd.write(output_string)

                out_fd.write(output_string)

    zero_max_base_lvl_list.write(zero_max_base_lvl_list_file)
    max_ratio_to_base_lvl_fd.close()
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w"):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all elements of the cluster
    """
    cluster_dict = SynDict()
    cluster_dict.read(cluster_file, split_values=True, comments_prefix="#")

    element_id_list = IdList()
    element_id_list.read(element_file, comments_prefix="#")

    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)
def extract_evidence_by_ids(evidence_file, id_file, output_evidence_file, mode="transcript"):
    # possible modes: transcript, gene
    ids = IdList()
    ids.read(id_file, comments_prefix="#")

    column_id = 0 if mode == "gene" else 1

    with open(evidence_file, "r") as ev_fd:
        with open(output_evidence_file, "w") as out_fd:
            for line in ev_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                entry_id = line.split("\t")[column_id]
                if entry_id in ids:
                    out_fd.write(line)
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")
parser.add_argument("-o", "--output_file", action="store", dest="output_file", help="Output file with extracted_annotations") parser.add_argument("-d", "--ids_file", action="store", dest="ids_file", help="File with ids of annotations to extract") parser.add_argument("-t", "--annotation_types", action="store", dest="annotation_types", default=["gene"], type=lambda s: s.split(","), help="Comma-separated list of annotation types to extract") args = parser.parse_args() annotation_ids = IdList() annotation_ids.read(args.ids_file, comments_prefix="#") #print args.annotation_types out_fd = open(args.output_file, "w") GFF.write( record_with_extracted_annotations_generator(args.input_gff, args.annotation_types), out_fd) out_fd.close()