def run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                         dir_vsearch_results_fa_trim, fpatt, ss, seqtk):
    """Search the filtered reads of every sample with vsearch.

    For each single-end (SE) and paired-end (PE) sample the FASTA file
    stored under ``'blast_results_path__' + ss`` is used as the vsearch
    query and the sample's filtered FASTQ file(s) as the database. The
    identifiers reported by vsearch are de-duplicated and the matching
    reads are pulled out of the FASTQ file(s) with seqtk. The resulting
    FASTQ path (SE) or list of paths (PE) is stored in the sample dict
    under ``'vsearch_results_path__' + ss``.

    Samples whose output FASTQ file(s) already exist are skipped.

    Args:
        se_fastq_files: mapping of SE sample name -> sample info dict.
        pe_fastq_files: mapping of PE sample name -> sample info dict;
            ``'filter_path_fq'`` is indexed 0-3 (two paired files followed
            by two unpaired files, judging by the log messages).
        vsearch: vsearch executable, or None when unavailable.
        dir_vsearch_results_fa_trim: parent directory for the results.
        fpatt: file name patterns containing the placeholders ``@D@``
            (results directory), ``@N@`` (sample name) and ``@Q@`` (``ss``).
        ss: search strategy name; used in keys and file names.
        seqtk: seqtk executable, or None when unavailable.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        required_tools = (
            (vsearch, 'vsearch is not available. Cannot continue. Exiting.'),
            (seqtk, 'seqtk is not available. Cannot continue. Exiting.'),
        )
        for tool, complaint in required_tools:
            if tool is None:
                Log.err(complaint)
                exit(0)

    # FixMe: Expose in configuration files?
    min_identity = 0.85

    blast_key = 'blast_results_path' + '__' + ss
    vsearch_key = 'vsearch_results_path' + '__' + ss

    # ---- Single-end samples ----------------------------------------------
    for sample in se_fastq_files:
        info = se_fastq_files[sample]
        results_dir = opj(dir_vsearch_results_fa_trim, sample)
        min_len = info['min_acc_len']
        query_fasta = info[blast_key]
        reads_fq = info['filter_path_fq']
        hits_txt = opj(results_dir, sample + '__' + ss + '.txt')
        hits_fq = hits_txt.replace('.txt', '.fastq')
        info[vsearch_key] = hits_fq

        if ope(hits_fq):
            Log.msg('Vsearch results already exist:', sample)
            continue

        make_dirs(results_dir)
        Log.msg('Running vsearch on: ' + basename(reads_fq), ss)
        run_vsearch(vsearch,
                    ident=min_identity,
                    q_file=query_fasta,
                    db_file=reads_fq,
                    out_file=hits_txt,
                    minlen=min_len)

        Log.msg('Extracting unique vsearch hits using Seqtk:', ss)
        keep_unique_lines_in_file(hits_txt)
        seqtk_extract_reads(seqtk, reads_fq, hits_fq, hits_txt)
        osremove(hits_txt)

    # ---- Paired-end samples ----------------------------------------------
    for sample in pe_fastq_files:
        info = pe_fastq_files[sample]
        results_dir = opj(dir_vsearch_results_fa_trim, sample)
        min_len = info['min_acc_len']
        query_fasta = info[blast_key]
        reads_fqs = info['filter_path_fq']
        hits_txts = [
            patt.replace('@D@', results_dir)
                .replace('@N@', sample)
                .replace('@Q@', ss)
            for patt in fpatt
        ]
        hits_fqs = [p.replace('.txt', '.fastq') for p in hits_txts]
        info[vsearch_key] = hits_fqs

        # All four outputs must be present for the sample to be skipped.
        if all(ope(hits_fqs[i]) for i in range(4)):
            Log.msg('Vsearch results already exist:', sample)
            continue

        make_dirs(results_dir)
        for reads_fq, hits_txt in zip(reads_fqs, hits_txts):
            Log.msg('Running vsearch on: ' + basename(reads_fq), ss)
            run_vsearch(vsearch,
                        ident=min_identity,
                        q_file=query_fasta,
                        db_file=reads_fq,
                        out_file=hits_txt,
                        minlen=min_len)

        Log.msg(
            'Extracting unique vsearch hits from paired files '
            'using Seqtk:', ss)
        p1_txt, p2_txt = hits_txts[0], hits_txts[1]
        p1_fq, p2_fq = reads_fqs[0], reads_fqs[1]
        p1_out, p2_out = hits_fqs[0], hits_fqs[1]
        # The same combined id list is applied to both mates, so a hit in
        # either file extracts the read from both paired files.
        paired_ids_txt = opj(results_dir, sample + '__' + ss + '_paired.txt')
        combine_text_files([p1_txt, p2_txt], paired_ids_txt)
        keep_unique_lines_in_file(paired_ids_txt)
        seqtk_extract_reads(seqtk, p1_fq, p1_out, paired_ids_txt)
        seqtk_extract_reads(seqtk, p2_fq, p2_out, paired_ids_txt)
        for stale in (p1_txt, p2_txt, paired_ids_txt):
            osremove(stale)

        Log.msg(
            'Extracting unique vsearch hits from unpaired files '
            'using Seqtk:', ss)
        u1_txt, u2_txt = hits_txts[2], hits_txts[3]
        u1_fq, u2_fq = reads_fqs[2], reads_fqs[3]
        u1_out, u2_out = hits_fqs[2], hits_fqs[3]
        keep_unique_lines_in_file(u1_txt)
        keep_unique_lines_in_file(u2_txt)
        seqtk_extract_reads(seqtk, u1_fq, u1_out, u1_txt)
        seqtk_extract_reads(seqtk, u2_fq, u2_out, u2_txt)
        for stale in (u1_txt, u2_txt):
            osremove(stale)
def _load_blast_1_settings_cache(cache_file):
    """Return the settings pickled by a previous run, or ``{}`` if unusable.

    A missing, truncated or otherwise unreadable cache file is treated as
    "no cached settings", which makes the caller redo the search instead of
    crashing.

    NOTE: ``pickle`` executes code embedded in the file it loads. The cache
    is expected to be written only by ``run_tblastn_on_reads``; it must not
    be replaced with a file from an untrusted source.
    """
    if not ope(cache_file):
        return dict()
    try:
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError,
            IndexError):
        Log.msg('Ignoring unreadable BLAST settings cache:', cache_file)
        return dict()


def _cluster_fasta_in_place(vsearch, ident, fasta_path):
    """Run ``run_cluster_fast`` on ``fasta_path``, overwriting it in place.

    The input is first copied to ``fasta_path + '_temp'`` so that the
    clustering output can be written to the original path. The temporary
    copy is removed afterwards.
    """
    temp_path = fasta_path + '_temp'
    copyfile(fasta_path, temp_path)
    run_cluster_fast(vsearch, ident, temp_path, fasta_path)
    osremove(temp_path)


def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file,
                         tblastn, blast_1_evalue, blast_1_max_hsps,
                         blast_1_qcov_hsp_perc, blast_1_best_hit_overhang,
                         blast_1_best_hit_score_edge,
                         blast_1_max_target_seqs, dir_blast_results_fa_trim,
                         fpatt, ss, threads, seqtk, vsearch, dir_cache_prj):
    """Run tblastn with the amino acid queries against every read database.

    For each single-end (SE) and paired-end (PE) sample, tblastn is run
    against the sample's BLAST database(s). The hit identifiers are
    de-duplicated, the matching reads are extracted with seqtk and converted
    to FASTA, and the FASTA file is passed through ``run_cluster_fast``
    (vsearch). The path of the final FASTA file is stored in the sample dict
    under ``'blast_results_path__' + ss``.

    A sample is skipped when its FASTA output already exists AND the BLAST
    settings plus query sequences are equal to those pickled by the previous
    run. The current settings are always pickled to the cache file at the
    end.

    Args:
        se_fastq_files: mapping of SE sample name -> sample info dict.
        pe_fastq_files: mapping of PE sample name -> sample info dict;
            ``'blast_db_path'`` and ``'filter_path_fq'`` are parallel
            sequences.
        aa_queries_file: FASTA file with the amino acid query sequences.
        tblastn: tblastn executable, or None when unavailable.
        blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc,
        blast_1_best_hit_overhang, blast_1_best_hit_score_edge,
        blast_1_max_target_seqs: values forwarded to ``run_blast`` and
            stored in the settings cache.
        dir_blast_results_fa_trim: parent directory for the results.
        fpatt: file name patterns containing the placeholders ``@D@``
            (results directory), ``@N@`` (sample name) and ``@Q@`` (``ss``).
        ss: search strategy name; used in keys and file names.
        threads: thread count forwarded to ``run_blast``.
        seqtk: seqtk executable, or None when unavailable.
        vsearch: vsearch executable, or None when unavailable.
        dir_cache_prj: directory that holds the settings cache file.

    Returns:
        True if tblastn was (re)run for at least one sample, else False.
    """
    changed_blast_1 = False

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running BLAST on reads:', ss)
        # NOTE(review): these error paths terminate with exit status 0;
        # kept as-is so that callers/scripts see unchanged behaviour.
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss)

    # Everything that, when changed, invalidates existing BLAST results.
    settings = {
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'queries': seq_records_to_dict(read_fasta(aa_queries_file,
                                                  SEQ_TYPE_AA))
    }

    Log.msg('evalue:', str(blast_1_evalue))
    Log.msg('max_hsps:', str(blast_1_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_1_max_target_seqs))
    print()

    # The cache file is only rewritten at the very end of this function, so
    # it is enough to read it once instead of once per sample.
    pickled = _load_blast_1_settings_cache(cache_file)

    # FixMe: Expose in configuration files?
    ident = 0.85

    results_key = 'blast_results_path' + '__' + ss

    # Arguments shared by every tblastn invocation below.
    blast_kwargs = dict(
        exec_file=tblastn,
        task='tblastn',
        threads=threads,
        queries_file=aa_queries_file,
        evalue=blast_1_evalue,
        max_hsps=blast_1_max_hsps,
        qcov_hsp_perc=blast_1_qcov_hsp_perc,
        best_hit_overhang=blast_1_best_hit_overhang,
        best_hit_score_edge=blast_1_best_hit_score_edge,
        max_target_seqs=blast_1_max_target_seqs,
        out_cols=BLST_RES_COLS_1)

    # ---- Single-end samples ----------------------------------------------
    for se in se_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, se)
        blast_db_path = se_fastq_files[se]['blast_db_path']
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        out_f_fasta = out_f.replace('.txt', '.fasta')
        se_fastq_files[se][results_key] = out_f_fasta
        genetic_code = se_fastq_files[se]['gc_id']

        if ope(out_f_fasta) and pickled == settings:
            Log.msg('BLAST results already exist:', se)
            continue

        changed_blast_1 = True
        make_dirs(dir_results)
        Log.msg('Running tblastn on: ' + basename(blast_db_path), ss)
        run_blast(db_path=blast_db_path,
                  out_file=out_f,
                  db_genetic_code=genetic_code,
                  **blast_kwargs)

        Log.inf('Extracting unique BLAST hits using Seqtk:', ss)
        keep_unique_lines_in_file(out_f)
        seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
        seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta)
        osremove(out_f)
        osremove(out_f_fastq)

        _cluster_fasta_in_place(vsearch, ident, out_f_fasta)

    # ---- Paired-end samples ----------------------------------------------
    for pe in pe_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, pe)
        blast_db_paths = pe_fastq_files[pe]['blast_db_path']
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_results) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x.replace('@Q@', ss) for x in out_fs]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs]
        # Hits from all read files of the sample end up in this one file.
        out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta')
        pe_fastq_files[pe][results_key] = out_f_fasta
        genetic_code = pe_fastq_files[pe]['gc_id']

        if ope(out_f_fasta) and pickled == settings:
            Log.msg('BLAST results already exist:', pe)
            continue

        changed_blast_1 = True
        make_dirs(dir_results)
        per_file = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq,
                       out_fs_fasta)
        for db_path, txt_path, fq_path, fastq_path, fasta_path in per_file:
            Log.msg('Running tblastn on: ' + basename(db_path), ss)
            run_blast(db_path=db_path,
                      out_file=txt_path,
                      db_genetic_code=genetic_code,
                      **blast_kwargs)

            Log.msg('Extracting unique BLAST hits using Seqtk:', ss)
            keep_unique_lines_in_file(txt_path)
            seqtk_extract_reads(seqtk, fq_path, fastq_path, txt_path)
            seqtk_fq_to_fa(seqtk, fastq_path, fasta_path)
            osremove(txt_path)
            osremove(fastq_path)

        combine_text_files(out_fs_fasta, out_f_fasta)
        _cluster_fasta_in_place(vsearch, ident, out_f_fasta)

        for x in out_fs_fasta:
            osremove(x)

    # Remember the settings used so the next run can detect changes.
    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

    return changed_blast_1
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk,
                        dir_temp, prepend_assmbl, min_target_orf_len,
                        max_target_orf_len, allow_non_aug, allow_no_strt_cod,
                        allow_no_stop_cod, tax, tax_group, tax_ids_user,
                        min_overlap, organelle):
    """Find ORFs in assembly sequences with BLAST hits and translate them.

    For every assembly dict that contains the key
    ``'blast_hits_aa__' + ss`` this function:

    1. collates the BLAST hits and extracts the hit sequences from the
       assembly FASTA file with seqtk;
    2. calls ``find_orf_for_blast_hit`` for every collated hit;
    3. records ORF and BLAST hit annotations in ``a['annotations__' + ss]``;
    4. writes the hit sequences, the nucleotide ORFs and the translated ORFs
       to three FASTA files in ``dir_prj_transcripts`` and stores their
       paths (or None when there is nothing to write) in the assembly dict
       under ``'transcripts_nt_fasta_file__' + ss``,
       ``'transcripts_nt_orf_fasta_file__' + ss`` and
       ``'transcripts_aa_orf_fasta_file__' + ss``;
    5. dumps the per-sequence annotations to
       ``<assembly name>_ann_kakapo__<ss>.json``.

    Assemblies for which seqtk extracts no sequences are skipped after
    ``a['annotations__' + ss]`` has been reset to an empty dict.

    Args:
        ss: search strategy name; used in keys and file names.
        assemblies: iterable of assembly dicts (modified in place). Keys
            read here: 'name', 'tax_id', 'path', 'gc_tt', and depending on
            ``organelle`` 'gc_tt_mito' / 'gc_tt_plastid'.
        dir_prj_transcripts: output directory for FASTA and JSON files.
        seqtk: seqtk executable, or None when unavailable.
        dir_temp: directory for the temporary id list / FASTA subsample.
        prepend_assmbl: when True, prefix sequence names with the assembly
            name.
        min_target_orf_len, max_target_orf_len, min_overlap,
        allow_no_strt_cod, allow_no_stop_cod: forwarded to
            ``find_orf_for_blast_hit``.
        allow_non_aug: when True use the start codons of the translation
            table, otherwise only 'ATG'.
        tax: taxonomy helper object (``is_eukaryote``, ``contains_plastid``,
            ``higher_rank_for_taxid``, ``path_between_taxids`` are called).
        tax_group: not used in this function.
        tax_ids_user: taxonomy ids used to pick the start codon context
            when an assembly has no 'tax_id'.
        organelle: 'mitochondrion' or 'plastid' selects the corresponding
            translation table; any other value keeps ``a['gc_tt']``.
    """
    if len(assemblies) > 0:
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for a in assemblies:

        # Nothing to do for assemblies without hits for this strategy.
        if ('blast_hits_aa__' + ss) not in a:
            continue

        assmbl_name = a['name']
        tax_id = a['tax_id']

        parsed_hits = a['blast_hits_aa__' + ss]

        a_path = a['path']

        # Pick the translation table: the default one, or the organelle
        # specific one when requested.
        # NOTE(review): nesting of the plastid check under the eukaryote
        # check is reconstructed from whitespace-collapsed source - confirm.
        gc_tt = a['gc_tt']
        if tax.is_eukaryote(tax_id) is True:
            if organelle == 'mitochondrion':
                gc_tt = a['gc_tt_mito']
            if tax.contains_plastid(tax_id) is True:
                if organelle == 'plastid':
                    gc_tt = a['gc_tt_plastid']

        transcripts_nt_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt__' + ss + '.fasta')

        transcripts_nt_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta')

        transcripts_aa_orf_fasta_file = opj(
            dir_prj_transcripts,
            assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta')

        # FASTA definition line -> sequence.
        transcripts_nt = {}
        transcripts_nt_orf = {}
        transcripts_aa_orf = {}

        transcripts_with_acceptable_orfs = set()

        ann_key = 'annotations__'

        # Annotations from a previous call are discarded.
        a[ann_key + ss] = {}

        collated = collate_blast_results(parsed_hits)

        ######################################################################
        # Use seqtk to sample the assembly FASTA file for sequences with
        # BLAST hits. This increases the speed substantially when the assembly
        # file is large.
        # The temporary file names depend only on ``ss``, so they are
        # overwritten for every assembly.
        temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta')
        temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt')
        sseqids_subsample = []
        for hit in collated:
            target_name = hit['sseqid']
            sseqids_subsample.append(target_name)
        sseqids_subsample_text = '\n'.join(sseqids_subsample)
        with open(temp_s_file, 'w') as f:
            f.write(sseqids_subsample_text)
        seqtk_extract_reads(seqtk,
                            in_file=a_path,
                            out_file=temp_a_file,
                            ids_file=temp_s_file)

        # ``_`` holds the full text of the subsampled FASTA file.
        with open(temp_a_file, 'r') as f:
            _ = f.read()

        # No sequences were extracted: skip this assembly.
        if _.strip() == '':
            continue

        print()
        Log.inf('Analyzing BLAST hits', '=' * 113 + '\n')
        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False)

        parsed_fasta = trim_desc_to_first_space_in_fasta_text(_, SEQ_TYPE_DNA)
        parsed_fasta = seq_records_to_dict(parsed_fasta)
        ######################################################################

        all_kakapo_results = {}
        json_dump_file_path = opj(dir_prj_transcripts,
                                  assmbl_name + '_ann_kakapo__' + ss + '.json')

        for hit in collated:

            target_name = hit['sseqid']
            # Raises KeyError if seqtk did not return this sequence.
            target_seq = parsed_fasta[target_name]
            query_name = hit['qseqid']
            hit_evalue = hit['evalue']

            # Prepend assembly name to the sequence name:
            if prepend_assmbl is True:
                target_name = assmbl_name + '__' + target_name
                # Also prepend taxonomic info to the sequence name:
                # NOTE(review): nesting under ``prepend_assmbl`` is
                # reconstructed from whitespace-collapsed source - confirm.
                if tax_id is not None:
                    fm = tax.higher_rank_for_taxid(tax_id, rank='family')
                    if fm is not None:
                        target_name = fm + '__' + target_name

            hit_start = hit['start']
            hit_end = hit['end']
            hit_frame = hit['frame']

            if allow_non_aug is True:
                start_codons = gc_tt.start_codons_ambiguous
            else:
                start_codons = ['ATG']

            stop_codons = gc_tt.stop_codons_ambiguous

            ##################################################################
            # Choose the start codon context: among the taxonomy ids for
            # which a context is available (keys of ``atg_contexts`` look
            # like '<taxid>_L' / '<taxid>_R'), take the one with the
            # shortest ``tax.path_between_taxids`` to the taxon of interest.
            if tax_id is not None:
                tax_ids_for_orf = (tax_id, )
            else:
                tax_ids_for_orf = tax_ids_user

            cntx_txids_avail = tuple(
                sorted(
                    set(
                        map(lambda x: int(x.split('_')[0]),
                            atg_contexts.keys()))))

            cntx_taxid = set()
            for txid in tax_ids_for_orf:
                tax_path = partial(tax.path_between_taxids, txid)
                path_len = tuple(
                    map(len, tuple(map(tax_path, cntx_txids_avail))))
                cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))])
            # If several taxids selected different contexts, the one used
            # depends on set iteration order. An empty ``tax_ids_for_orf``
            # raises IndexError here.
            cntx_taxid = tuple(cntx_taxid)[0]

            cntx_l_key = str(cntx_taxid) + '_L'
            cntx_r_key = str(cntx_taxid) + '_R'

            cntx_l = atg_contexts[cntx_l_key]
            cntx_r = atg_contexts[cntx_r_key]
            ##################################################################

            # Header row of the per-hit ORF log table.
            orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) +
                           'cntx'.rjust(6) + 'length'.center(9) +
                           'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n')

            # orf[0]: acceptable ORFs, orf[1]: rejected ORFs,
            # orf[2]: log text. Each ORF is indexed as
            # [0] begin, [1] end, [2] frame, [3] grade.
            orf = find_orf_for_blast_hit(seq=target_seq,
                                         frame=hit_frame,
                                         hit_start=hit_start,
                                         hit_end=hit_end,
                                         stop_codons=stop_codons,
                                         start_codons=start_codons,
                                         context_l=cntx_l,
                                         context_r=cntx_r,
                                         min_overlap=min_overlap,
                                         min_len=min_target_orf_len,
                                         max_len=max_target_orf_len,
                                         allow_no_strt_cod=allow_no_strt_cod,
                                         allow_no_stop_cod=allow_no_stop_cod)

            orf_log_str += orf[2]

            rev_comp_def_str = ''
            if hit_frame > 0:
                ann_hit_b = hit_start
                ann_hit_e = hit_end
            else:
                # From here on ``target_seq`` is the reverse complement and
                # the hit coordinates are re-expressed relative to it.
                target_seq = reverse_complement(target_seq)
                ann_hit_b = len(target_seq) - hit_start
                ann_hit_e = len(target_seq) - hit_end
                rev_comp_def_str = '; RevComp'

            target_def = target_name + ' ' + query_name + rev_comp_def_str

            # NOTE(review): a later hit with the same ``target_name``
            # replaces the annotations of an earlier one.
            a[ann_key + ss][target_name] = {}

            good_orfs = orf[0]
            bad_orfs = orf[1]

            if len(good_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_good'] = dict()
                orfs_good_dict = a[ann_key + ss][target_name]['orfs_good']
                orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n'

                for i, good_orf in enumerate(good_orfs):

                    good_orf_frame = good_orf[2]

                    # The end coordinate is extended by 3 (one codon);
                    # presumably to include the stop codon - TODO confirm
                    # against find_orf_for_blast_hit.
                    if good_orf_frame > 0:
                        ann_orf_b = good_orf[0]
                        ann_orf_e = good_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - good_orf[1]
                        ann_orf_e = len(target_seq) - good_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_good_dict = dict()
                    orf_good_dict['orf_begin'] = ann_orf_b
                    orf_good_dict['orf_end'] = ann_orf_e
                    orf_good_dict['orf_frame'] = abs(good_orf_frame)
                    orf_good_dict['orf_grade'] = good_orf[3]
                    orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_good_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict

                    target_def_orf = (target_name +
                                      '__ORF{:03d}'.format(i + 1) + ' ' +
                                      query_name + rev_comp_def_str)

                    transcripts_nt_orf[target_def_orf] = orf_seq

                    transcripts_with_acceptable_orfs.add(target_name)

                    transl_seq = translate(orf_seq,
                                           gc_tt.table_ambiguous,
                                           start_codons)

                    # The last residue is dropped; presumably the stop
                    # symbol - TODO confirm for ORFs without a stop codon.
                    transcripts_aa_orf[target_def_orf] = transl_seq[:-1]

            else:
                orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n'

            Log.msg('Transcript:', target_name, False)
            Log.msg(' Query:', query_name + '\n\n' + orf_log_str, False)

            # Rejected ORFs are annotated but not written to FASTA files.
            if len(bad_orfs) > 0:
                a[ann_key + ss][target_name]['orfs_bad'] = dict()
                orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad']

                for i, bad_orf in enumerate(bad_orfs):

                    bad_orf_frame = bad_orf[2]

                    if bad_orf_frame > 0:
                        ann_orf_b = bad_orf[0]
                        ann_orf_e = bad_orf[1] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]
                    else:
                        ann_orf_b = len(target_seq) - bad_orf[1]
                        ann_orf_e = len(target_seq) - bad_orf[0] + 3
                        orf_seq = target_seq[ann_orf_b:ann_orf_e]

                    orf_bad_dict = dict()
                    orf_bad_dict['orf_begin'] = ann_orf_b
                    orf_bad_dict['orf_end'] = ann_orf_e
                    orf_bad_dict['orf_frame'] = abs(bad_orf_frame)
                    orf_bad_dict['orf_grade'] = bad_orf[3]
                    orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id)
                    orf_bad_dict['orf_tt_name'] = gc_tt.gc_name

                    orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict

            # The (possibly reverse-complemented) hit sequence is kept
            # whether or not an acceptable ORF was found.
            transcripts_nt[target_def] = target_seq

            a[ann_key + ss][target_name]['blast_hit'] = dict()
            blast_hit_dict = a[ann_key + ss][target_name]['blast_hit']
            blast_hit_dict['query_name'] = query_name
            blast_hit_dict['query_id'] = ss
            blast_hit_dict['evalue'] = hit_evalue
            blast_hit_dict['frame'] = abs(hit_frame)
            blast_hit_dict['blast_hit_begin'] = ann_hit_b
            blast_hit_dict['blast_hit_end'] = ann_hit_e

            # Collect ORF and BLAST hit annotations for downstream use. ######
            kakapo_json = [{}]
            kakapo_json[0]['kakapo_annotations__' + ss] = (a[ann_key + ss][target_name])
            all_kakapo_results[target_name] = kakapo_json
            ##################################################################

        # --------------------------------------------------------------------

        Log.msg('Assembly:', assmbl_name, False)
        Log.msg('Search Strategy:', ss, False)
        Log.msg('Transcripts:', str(len(transcripts_nt)), False)
        Log.msg('Transcripts with acceptable ORFs:',
                str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134,
                False)

        # Write each FASTA file only when there is something to write and
        # record its path (or None) in the assembly dict.
        if len(transcripts_nt) > 0:
            write_fasta(transcripts_nt, transcripts_nt_fasta_file)
            a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file
        else:
            a['transcripts_nt_fasta_file__' + ss] = None

        if len(transcripts_nt_orf) > 0:
            write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file)
            a['transcripts_nt_orf_fasta_file__' + ss] = transcripts_nt_orf_fasta_file
        else:
            a['transcripts_nt_orf_fasta_file__' + ss] = None

        if len(transcripts_aa_orf) > 0:
            write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file)
            a['transcripts_aa_orf_fasta_file__' + ss] = transcripts_aa_orf_fasta_file
        else:
            a['transcripts_aa_orf_fasta_file__' + ss] = None

        # Save ORF and BLAST hit annotations for downstream use.--------------
        with open(json_dump_file_path, 'w') as f:
            json.dump(all_kakapo_results, f, sort_keys=True, indent=4)