def dnld_pfam_uniprot_seqs(ss, uniprot_acc, aa_uniprot_file, dir_cache_prj):
    if len(uniprot_acc) != 0:
        _ = opj(dir_cache_prj, 'aa_uniprot_acc_cache__' + ss)
        prev_uniprot_acc = []
        if ope(_):
            with open(_, 'rb') as f:
                prev_uniprot_acc = pickle.load(f)

        with open(_, 'wb') as f:
            pickle.dump(uniprot_acc, f, protocol=PICKLE_PROTOCOL)

        if (set(uniprot_acc) != set(prev_uniprot_acc)) or \
                (not ope(aa_uniprot_file)):

            Log.inf('Downloading Pfam protein sequences from UniProt:', ss)
            # Note: the number of sequences downloaded from UniProt may
            # be less than the total number of accessions. This is normal
            # as Pfam may return "obsolete" accessions, which will not be
            # downloaded here.
            _ = fasta_by_accession_list(uniprot_acc)
            _ = standardize_fasta_text(_, SEQ_TYPE_AA, pfam=True)

            write_fasta(_, aa_uniprot_file)

    else:
        if ope(aa_uniprot_file):
            osremove(aa_uniprot_file)
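# The caching pattern above (re-download only when the set of accessions has
# changed or the output file is missing) is reused by several other steps.
# A minimal, self-contained sketch of the same idea follows; the helper and
# its argument names are illustrative only and are not called anywhere in the
# pipeline.
def _cache_changed_sketch(cache_path, current_items, output_path):
    """Return True when current_items differ from the pickled cache or when
    output_path does not exist. Updates the cache as a side effect."""
    import os
    import pickle

    previous_items = []
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            previous_items = pickle.load(f)

    with open(cache_path, 'wb') as f:
        pickle.dump(current_items, f)

    return (set(current_items) != set(previous_items)
            or not os.path.exists(output_path))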
def dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user, prot_cds_ncbi_file, tax,
                               dir_cache_prj):
    pickle_file = opj(dir_cache_prj, 'ncbi_prot_cds_cache__' + ss)
    acc_old = set()
    if ope(pickle_file):
        with open(pickle_file, 'rb') as f:
            pickled = pickle.load(f)
            acc_old = set(pickled[0])

    if acc_old == set(prot_acc_user):
        cds_rec_dict = pickled[1]
        Log.inf('The CDS for the dereplicated set of the user-provided '
                'NCBI protein accessions have already been downloaded:', ss)
    else:
        Log.inf('Downloading CDS for the dereplicated set of the '
                'user-provided NCBI protein accessions:', ss)
        cds_rec_dict = seq_records_to_dict(cds_for_prot(prot_acc_user),
                                           prepend_acc=True)
        with open(pickle_file, 'wb') as f:
            pickle.dump((prot_acc_user, cds_rec_dict), f,
                        protocol=PICKLE_PROTOCOL)

    write_fasta(cds_rec_dict, prot_cds_ncbi_file)
def combine_aa_fasta(ss, fasta_files, aa_queries_file):
    Log.inf('Combining all AA query sequences:', ss)
    _ = ''
    for fasta_file in fasta_files:
        if ope(fasta_file):
            with open(fasta_file, 'r') as f:
                _ = _ + f.read()

    with open(aa_queries_file, 'w') as f:
        f.write(_)
def user_protein_accessions(ss, prot_acc_user, dir_cache_prj, taxonomy):
    if len(prot_acc_user) > 0:
        Log.inf('Reading user provided protein accessions:', ss)
        print()
        pickle_file = opj(dir_cache_prj, 'ncbi_prot_metadata_cache__' + ss)
        acc_old = set()
        if ope(pickle_file):
            with open(pickle_file, 'rb') as f:
                pickled = pickle.load(f)
                acc_old = set([x['accessionversion'] for x in pickled])

        if acc_old == set(prot_acc_user):
            pa_info = pickled
        else:
            pa_info = summary_eutil('protein', prot_acc_user)

        prot_acc = []
        prot_info_to_print = []
        max_acc_len = 0
        for pa in pa_info:
            acc = pa['accessionversion']
            prot_acc.append(acc)
            title = pa['title']
            title_split = title.split('[')
            taxid = pa['taxid']
            if 'organism' in pa:
                organism = pa['organism']
            else:
                organism = taxonomy.scientific_name_for_taxid(taxid)
                pa['organism'] = organism
            # title = title_split[0]
            # title = title.lower().strip()
            # title = title.replace('_', ' ').replace('-', ' ')
            # title = title.replace(',', '')
            # title = title[0].upper() + title[1:] + ' [' + organism + ']'
            max_acc_len = max(max_acc_len, len(acc))
            prot_info_to_print.append((title, acc))

        prot_info_to_print = sorted(prot_info_to_print)
        for pi in prot_info_to_print:
            title = pi[0]
            acc = pi[1]
            if len(title) > 80:
                title = title[:77] + '...'
            Log.msg(acc.rjust(max_acc_len) + ':', title, False)

        with open(pickle_file, 'wb') as f:
            pickle.dump(pa_info, f, protocol=PICKLE_PROTOCOL)

        return prot_acc

    else:
        return prot_acc_user
def user_aa_fasta(ss, user_queries, aa_prot_user_file):
    _ = ''
    if len(user_queries) > 0:
        print()
        Log.inf('Reading user provided AA sequences:', ss)
        for ap in user_queries:
            Log.msg(ap)
            with open(ap, 'r') as f:
                _ = _ + f.read()
    if _ != '':
        with open(aa_prot_user_file, 'w') as f:
            write_fasta(standardize_fasta_text(_, SEQ_TYPE_AA), f)
def user_fastq_files(fq_se, fq_pe):
    if len(fq_se) > 0 or len(fq_pe) > 0:
        print()
        Log.inf('Preparing user provided FASTQ files.')

    se_fastq_files = {}
    pe_fastq_files = {}

    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'

    for se in fq_se:
        tax_id = se[0]
        path = se[1]
        base = basename(path)
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        sample_base_name = base
        se_fastq_files[sample_base_name] = {'path': path}
        se_fastq_files[sample_base_name]['src'] = 'usr'
        se_fastq_files[sample_base_name]['avg_len'] = None
        se_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':', basename(path))

    for pe in fq_pe:
        tax_id = pe[0]
        path = pe[1]
        base = basename(path[0])
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        else:
            base = basename(commonprefix(path)).rstrip('_- R')
        sample_base_name = base
        pe_fastq_files[sample_base_name] = {'path': path}
        pe_fastq_files[sample_base_name]['src'] = 'usr'
        pe_fastq_files[sample_base_name]['avg_len'] = None
        pe_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':',
                basename(path[0]) + '\n' +
                ' ' * (len(sample_base_name) + 2) +
                basename(path[1]))

    return se_fastq_files, pe_fastq_files
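# The regular expression above assumes Illumina-style file names such as
# 'SampleA_S1_L001_R1_001.fastq.gz' and keeps only the part before the lane
# number as the sample base name. A small illustrative check using a
# hypothetical file name (this helper is not used by the pipeline):
def _sample_base_name_sketch():
    import re
    from os.path import splitext

    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'
    name = 'SampleA_S1_L001_R1_001.fastq.gz'
    base = splitext(splitext(name)[0])[0]   # strip '.gz', then '.fastq'
    match = re.findall(fq_type_1_regex, base)
    if len(match) > 0 and len(match[0]) == 3:
        base = match[0][0]
    return base                             # -> 'SampleA_S1'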
def dnld_prot_seqs(ss, prot_acc_user, aa_prot_ncbi_file, dir_cache_prj):
    if len(prot_acc_user) != 0:
        acc_old = set()
        if ope(aa_prot_ncbi_file):
            _ = read_fasta(aa_prot_ncbi_file, SEQ_TYPE_AA)
            acc_old = set([x.definition.split('|')[0] for x in _])

        if acc_old == set(prot_acc_user):
            return prot_acc_user
        else:
            pickle_file = opj(dir_cache_prj,
                              'ncbi_prot_metadata_cache__' + ss)
            if ope(pickle_file):
                with open(pickle_file, 'rb') as f:
                    pa_info = pickle.load(f)

            print()
            Log.inf('Downloading protein sequences from NCBI:', ss)
            _ = dnld_ncbi_seqs('protein', prot_acc_user, rettype='gb',
                               retmode='xml')

            prot_acc_user_new = list()
            for rec in _:
                acc_ver = rec.accession_version
                defn = rec.definition
                organism = rec.organism

                prot_acc_user_new.append(acc_ver)

                defn_new = defn.split('[' + organism + ']')[0]
                defn_new = defn_new.lower().strip()
                defn_new = defn_new.replace(' ', '_').replace('-', '_')
                defn_new = defn_new.replace(',', '')
                defn_new = defn_new[0].upper() + defn_new[1:]

                defn_new = acc_ver + '|' + defn_new + '|' + organism
                defn_new = defn_new.replace(' ', '_').replace('-', '_')

                rec.definition = defn_new

            prot_acc_user = prot_acc_user_new

            write_fasta(_, aa_prot_ncbi_file)
    else:
        if ope(aa_prot_ncbi_file):
            osremove(aa_prot_ncbi_file)

    return prot_acc_user
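# dnld_prot_seqs() rewrites each NCBI definition line into the compact form
# 'ACCESSION|Normalized_title|Organism' so the accession can be recovered
# later with definition.split('|')[0]. A sketch of the same string
# transformation on a made-up record (the accession, title and organism
# values below are hypothetical; the helper is illustrative only):
def _defn_rewrite_sketch():
    acc_ver = 'XP_000000001.1'
    defn = 'putative cytochrome P450 71A1 [Solanum lycopersicum]'
    organism = 'Solanum lycopersicum'

    defn_new = defn.split('[' + organism + ']')[0]
    defn_new = defn_new.lower().strip()
    defn_new = defn_new.replace(' ', '_').replace('-', '_')
    defn_new = defn_new.replace(',', '')
    defn_new = defn_new[0].upper() + defn_new[1:]
    defn_new = acc_ver + '|' + defn_new + '|' + organism
    return defn_new.replace(' ', '_').replace('-', '_')
    # -> 'XP_000000001.1|Putative_cytochrome_p450_71a1|Solanum_lycopersicum'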
def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=se,
                          dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            pe_trim_files = zip(fa_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                make_blast_db(exec_file=makeblastdb,
                              in_file=x[0],
                              out_file=x[1],
                              title=basename(x[1]),
                              dbtype='nucl')
def filter_queries(ss, aa_queries_file, min_query_length, max_query_length,
                   max_query_identity, vsearch, prot_acc_user, overwrite,
                   logging=True):
    if logging is True:
        print()
        Log.inf('Filtering AA query sequences:', ss)
        Log.msg('min_query_length:', str(min_query_length))
        Log.msg('max_query_length:', str(max_query_length))
        Log.msg('max_query_identity:', str(max_query_identity))

    parsed_fasta_1 = filter_fasta_by_length(aa_queries_file, SEQ_TYPE_AA,
                                            min_query_length,
                                            max_query_length)

    tmp1 = aa_queries_file + '_temp1'
    tmp2 = aa_queries_file + '_temp2'

    for rec in parsed_fasta_1:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.untranslate()

    write_fasta(parsed_fasta_1, tmp1)
    run_cluster_fast(vsearch, max_query_identity, tmp1, tmp2)
    parsed_fasta_2 = read_fasta(tmp2, SEQ_TYPE_DNA, parse_def=True)

    prot_acc_user_new = list()
    for rec in parsed_fasta_2:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.translate()
        acc = rec.accession_version
        if acc in prot_acc_user:
            prot_acc_user_new.append(acc)

    if overwrite is True:
        write_fasta(parsed_fasta_2, aa_queries_file, prepend_acc=True)

    osremove(tmp1)
    osremove(tmp2)

    return prot_acc_user_new
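# filter_queries() removes near-identical queries with vsearch
# (--cluster_fast at max_query_identity), operating on back-translated DNA
# and translating the cluster representatives back to protein. As a rough,
# dependency-free analogy, exact-duplicate removal at identity 1.0 would look
# like the sketch below (illustrative only; the real step uses vsearch):
def _dereplicate_exact_sketch(records):
    """records: hypothetical dict mapping definition -> sequence string."""
    seen = set()
    kept = {}
    for defn, seq in records.items():
        if seq not in seen:
            seen.add(seq)
            kept[defn] = seq
    return kept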
def pfam_uniprot_accessions(ss, pfam_acc, tax_ids, dir_cache_pfam_acc):
    if len(pfam_acc) > 0:
        Log.inf('Downloading UniProt accessions for Pfam accessions:', ss)

    pfam_seqs_list = []
    for pa in pfam_acc:
        pfam_id = pfam_entry(pa)[0]['id']
        Log.msg(pa + ':', pfam_id)
        _ = opj(dir_cache_pfam_acc, pa + '__' + ss)
        if ope(_):
            with open(_, 'rb') as f:
                acc = pickle.load(f)
            pfam_seqs_list = pfam_seqs_list + acc
        else:
            # Note: the results may include "obsolete" accessions.
            # This is not a problem, they will not appear in the set of
            # downloaded sequences from UniProt.
            acc = pfam_seqs(query=pa)
            pfam_seqs_list = pfam_seqs_list + acc
            with open(_, 'wb') as f:
                pickle.dump(acc, f, protocol=PICKLE_PROTOCOL)

    pfam_uniprot_acc = prot_ids_for_tax_ids(pfam_seqs_list, tax_ids)
    return pfam_uniprot_acc
def user_entrez_search(ss, queries, dir_cache_prj, requery_after):
    dnld_needed = True
    accs = []
    if len(queries) != 0:
        time_stamp_now = datetime.datetime.now()
        time_stamp_file = opj(dir_cache_prj, 'ncbi_prot_time_stamp__' + ss)
        time_stamp = None
        if ope(time_stamp_file):
            with open(time_stamp_file, 'rb') as f:
                time_stamp = pickle.load(f)
                time_diff = time_stamp_now - time_stamp
                if time_diff < requery_after:
                    dnld_needed = False

        if dnld_needed is True:
            Log.inf('Searching for protein sequences on NCBI:', ss)
            for q in queries:
                esearch_results = search_eutil(db='protein', term=q)
                accs = accs + accs_eutil(esearch_results)
            with open(time_stamp_file, 'wb') as f:
                pickle.dump(datetime.datetime.now(), f,
                            protocol=PICKLE_PROTOCOL)
        else:
            days = requery_after.total_seconds() / 60 / 60 / 24
            days = '{:.2f}'.format(days)
            Log.inf('NCBI results are less than ' + days +
                    ' day(s) old. Will not search again:', ss)
            pickle_file = opj(dir_cache_prj,
                              'ncbi_prot_metadata_cache__' + ss)
            if ope(pickle_file):
                with open(pickle_file, 'rb') as f:
                    pickled = pickle.load(f)
                    accs = [x['accessionversion'] for x in pickled]

    return accs
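# user_entrez_search() re-queries NCBI only when the cached time stamp is
# older than 'requery_after' (a datetime.timedelta taken from the
# configuration). The gating logic reduces to the small helper below
# (illustrative only; not called anywhere in the pipeline):
def _requery_needed_sketch(time_stamp_file, requery_after):
    import datetime
    import os
    import pickle

    if not os.path.exists(time_stamp_file):
        return True
    with open(time_stamp_file, 'rb') as f:
        time_stamp = pickle.load(f)
    return datetime.datetime.now() - time_stamp >= requery_after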
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
        else:
            make_dirs(dir_fa_trim_data_sample)
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
        else:
            make_dirs(dir_fa_trim_data_sample)
            pe_trim_files = zip(fq_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                seqtk_fq_to_fa(seqtk, x[0], x[1])
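# The 'fpatt' argument used by the paired-end branches above (and by the
# other per-sample steps) is a list of file-name patterns in which '@D@' is
# replaced with the per-sample output directory and '@N@' with the sample
# name; some later steps also substitute '@Q@' with the search-strategy name.
# Assuming a hypothetical pattern list such as
#
#     fpatt = ['@D@/@N@_paired_1.fasta', '@D@/@N@_paired_2.fasta',
#              '@D@/@N@_unpaired_1.fasta', '@D@/@N@_unpaired_2.fasta']
#
# the substitution performed above is simply (illustrative helper, unused):
def _expand_file_patterns_sketch(fpatt, sample_dir, sample_name):
    out_fs = [x.replace('@D@', sample_dir) for x in fpatt]
    out_fs = [x.replace('@N@', sample_name) for x in out_fs]
    return out_fs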
def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    if len(assemblies) > 0:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)

    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
        else:
            Log.msg(assmbl_name)
            make_dirs(assmbl_blast_db_dir)
            make_blast_db(exec_file=makeblastdb,
                          in_file=a['path'],
                          out_file=assmbl_blast_db_file,
                          title=assmbl_name)
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, threads, dir_temp, should_run): if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0: print() if should_run is False: Log.wrn('Skipping Rcorrector as requested.') else: Log.inf('Running Rcorrector.') if rcorrector is None: Log.err('Rcorrector is not available. Cannot continue. Exiting.') exit(0) for se in se_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, se) fq_path = se_fastq_files[se]['path'] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path) log_f = opj(dir_fq_cor_data_sample, se + '.txt') out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext) se_fastq_files[se]['cor_path_fq'] = out_f if should_run is False: se_fastq_files[se]['cor_path_fq'] = fq_path continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ file already exists:', se) else: make_dirs(dir_fq_cor_data_sample) Log.msg('SE mode:', se) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path)) fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f) remove(fq_cor_path) for pe in pe_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe) fq_path_1 = pe_fastq_files[pe]['path'][0] fq_path_2 = pe_fastq_files[pe]['path'][1] fq_path_3 = None out_f_3 = None r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1) log_f = opj(dir_fq_cor_data_sample, pe + '.txt') out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext) out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext) pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2] if len(pe_fastq_files[pe]['path']) == 3: fq_path_3 = pe_fastq_files[pe]['path'][2] out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext) pe_fastq_files[pe]['cor_path_fq'].append(out_f_3) if should_run is False: pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2] if fq_path_3 is not None: pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3) continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ files already exist:', pe) else: make_dirs(dir_fq_cor_data_sample) Log.msg('PE mode:', pe) run_rcorrector_pe(rcorrector=rcorrector, in_file_1=fq_path_1, in_file_2=fq_path_2, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1)) fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2)) fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext filter_unc_pe(in_file_1=fq_cor_path_1, in_file_2=fq_cor_path_2, out_file_1=out_f_1, out_file_2=out_f_2, log_file=log_f) remove(fq_cor_path_1) remove(fq_cor_path_2) if fq_path_3 is not None: Log.msg( 'SE mode (Paired-read SRA run contains unpaired reads):', pe) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_3, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_3 = opj(dir_fq_cor_data_sample, basename(fq_path_3)) fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt') filter_unc_se(in_file=fq_cor_path_3, out_file=out_f_3, log_file=log_f_3) remove(fq_cor_path_3)
def find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp, prepend_assmbl, min_target_orf_len, max_target_orf_len, allow_non_aug, allow_no_strt_cod, allow_no_stop_cod, tax, tax_group, tax_ids_user, min_overlap, organelle): if len(assemblies) > 0: if seqtk is None: Log.err('seqtk is not available. Cannot continue. Exiting.') exit(0) for a in assemblies: if ('blast_hits_aa__' + ss) not in a: continue assmbl_name = a['name'] tax_id = a['tax_id'] parsed_hits = a['blast_hits_aa__' + ss] a_path = a['path'] gc_tt = a['gc_tt'] if tax.is_eukaryote(tax_id) is True: if organelle == 'mitochondrion': gc_tt = a['gc_tt_mito'] if tax.contains_plastid(tax_id) is True: if organelle == 'plastid': gc_tt = a['gc_tt_plastid'] transcripts_nt_fasta_file = opj( dir_prj_transcripts, assmbl_name + '_transcripts_nt__' + ss + '.fasta') transcripts_nt_orf_fasta_file = opj( dir_prj_transcripts, assmbl_name + '_transcripts_nt_orf__' + ss + '.fasta') transcripts_aa_orf_fasta_file = opj( dir_prj_transcripts, assmbl_name + '_transcripts_aa_orf__' + ss + '.fasta') transcripts_nt = {} transcripts_nt_orf = {} transcripts_aa_orf = {} transcripts_with_acceptable_orfs = set() ann_key = 'annotations__' a[ann_key + ss] = {} collated = collate_blast_results(parsed_hits) ###################################################################### # Use seqtk to sample the assembly FASTA file for sequences with # BLAST hits. This increases the speed substantially when the assembly # file is large. temp_a_file = opj(dir_temp, 'temp__' + ss + '.fasta') temp_s_file = opj(dir_temp, 'temp__' + ss + '.txt') sseqids_subsample = [] for hit in collated: target_name = hit['sseqid'] sseqids_subsample.append(target_name) sseqids_subsample_text = '\n'.join(sseqids_subsample) with open(temp_s_file, 'w') as f: f.write(sseqids_subsample_text) seqtk_extract_reads(seqtk, in_file=a_path, out_file=temp_a_file, ids_file=temp_s_file) with open(temp_a_file, 'r') as f: _ = f.read() if _.strip() == '': continue print() Log.inf('Analyzing BLAST hits', '=' * 113 + '\n') Log.msg('Assembly:', assmbl_name, False) Log.msg('Search Strategy:', ss + '\n\n' + '-' * 134 + '\n', False) parsed_fasta = trim_desc_to_first_space_in_fasta_text(_, SEQ_TYPE_DNA) parsed_fasta = seq_records_to_dict(parsed_fasta) ###################################################################### all_kakapo_results = {} json_dump_file_path = opj(dir_prj_transcripts, assmbl_name + '_ann_kakapo__' + ss + '.json') for hit in collated: target_name = hit['sseqid'] target_seq = parsed_fasta[target_name] query_name = hit['qseqid'] hit_evalue = hit['evalue'] # Prepend assembly name to the sequence name: if prepend_assmbl is True: target_name = assmbl_name + '__' + target_name # Also prepend taxonomic info to the sequence name: if tax_id is not None: fm = tax.higher_rank_for_taxid(tax_id, rank='family') if fm is not None: target_name = fm + '__' + target_name hit_start = hit['start'] hit_end = hit['end'] hit_frame = hit['frame'] if allow_non_aug is True: start_codons = gc_tt.start_codons_ambiguous else: start_codons = ['ATG'] stop_codons = gc_tt.stop_codons_ambiguous ################################################################## if tax_id is not None: tax_ids_for_orf = (tax_id, ) else: tax_ids_for_orf = tax_ids_user cntx_txids_avail = tuple( sorted( set( map(lambda x: int(x.split('_')[0]), atg_contexts.keys())))) cntx_taxid = set() for txid in tax_ids_for_orf: tax_path = partial(tax.path_between_taxids, txid) path_len = tuple( map(len, tuple(map(tax_path, cntx_txids_avail)))) 
cntx_taxid.add(cntx_txids_avail[path_len.index(min(path_len))]) cntx_taxid = tuple(cntx_taxid)[0] cntx_l_key = str(cntx_taxid) + '_L' cntx_r_key = str(cntx_taxid) + '_R' cntx_l = atg_contexts[cntx_l_key] cntx_r = atg_contexts[cntx_r_key] ################################################################## orf_log_str = ('grade'.rjust(5) + 'ovrlp'.rjust(7) + 'cntx'.rjust(6) + 'length'.center(9) + 'cntx_l'.rjust(7) + 'cntx_r'.rjust(15) + '\n') orf = find_orf_for_blast_hit(seq=target_seq, frame=hit_frame, hit_start=hit_start, hit_end=hit_end, stop_codons=stop_codons, start_codons=start_codons, context_l=cntx_l, context_r=cntx_r, min_overlap=min_overlap, min_len=min_target_orf_len, max_len=max_target_orf_len, allow_no_strt_cod=allow_no_strt_cod, allow_no_stop_cod=allow_no_stop_cod) orf_log_str += orf[2] rev_comp_def_str = '' if hit_frame > 0: ann_hit_b = hit_start ann_hit_e = hit_end else: target_seq = reverse_complement(target_seq) ann_hit_b = len(target_seq) - hit_start ann_hit_e = len(target_seq) - hit_end rev_comp_def_str = '; RevComp' target_def = target_name + ' ' + query_name + rev_comp_def_str a[ann_key + ss][target_name] = {} good_orfs = orf[0] bad_orfs = orf[1] if len(good_orfs) > 0: a[ann_key + ss][target_name]['orfs_good'] = dict() orfs_good_dict = a[ann_key + ss][target_name]['orfs_good'] orf_log_str += '\n' + 'VALID ' + '-' * 128 + '\n' for i, good_orf in enumerate(good_orfs): good_orf_frame = good_orf[2] if good_orf_frame > 0: ann_orf_b = good_orf[0] ann_orf_e = good_orf[1] + 3 orf_seq = target_seq[ann_orf_b:ann_orf_e] else: ann_orf_b = len(target_seq) - good_orf[1] ann_orf_e = len(target_seq) - good_orf[0] + 3 orf_seq = target_seq[ann_orf_b:ann_orf_e] orf_good_dict = dict() orf_good_dict['orf_begin'] = ann_orf_b orf_good_dict['orf_end'] = ann_orf_e orf_good_dict['orf_frame'] = abs(good_orf_frame) orf_good_dict['orf_grade'] = good_orf[3] orf_good_dict['orf_tt_id'] = str(gc_tt.gc_id) orf_good_dict['orf_tt_name'] = gc_tt.gc_name orfs_good_dict['ORF{:03d}'.format(i + 1)] = orf_good_dict target_def_orf = (target_name + '__ORF{:03d}'.format(i + 1) + ' ' + query_name + rev_comp_def_str) transcripts_nt_orf[target_def_orf] = orf_seq transcripts_with_acceptable_orfs.add(target_name) transl_seq = translate(orf_seq, gc_tt.table_ambiguous, start_codons) transcripts_aa_orf[target_def_orf] = transl_seq[:-1] else: orf_log_str += '\n' + 'NOT VALID ' + '-' * 124 + '\n' Log.msg('Transcript:', target_name, False) Log.msg(' Query:', query_name + '\n\n' + orf_log_str, False) if len(bad_orfs) > 0: a[ann_key + ss][target_name]['orfs_bad'] = dict() orfs_bad_dict = a[ann_key + ss][target_name]['orfs_bad'] for i, bad_orf in enumerate(bad_orfs): bad_orf_frame = bad_orf[2] if bad_orf_frame > 0: ann_orf_b = bad_orf[0] ann_orf_e = bad_orf[1] + 3 orf_seq = target_seq[ann_orf_b:ann_orf_e] else: ann_orf_b = len(target_seq) - bad_orf[1] ann_orf_e = len(target_seq) - bad_orf[0] + 3 orf_seq = target_seq[ann_orf_b:ann_orf_e] orf_bad_dict = dict() orf_bad_dict['orf_begin'] = ann_orf_b orf_bad_dict['orf_end'] = ann_orf_e orf_bad_dict['orf_frame'] = abs(bad_orf_frame) orf_bad_dict['orf_grade'] = bad_orf[3] orf_bad_dict['orf_tt_id'] = str(gc_tt.gc_id) orf_bad_dict['orf_tt_name'] = gc_tt.gc_name orfs_bad_dict['ORF{:03d}'.format(i + 1)] = orf_bad_dict transcripts_nt[target_def] = target_seq a[ann_key + ss][target_name]['blast_hit'] = dict() blast_hit_dict = a[ann_key + ss][target_name]['blast_hit'] blast_hit_dict['query_name'] = query_name blast_hit_dict['query_id'] = ss blast_hit_dict['evalue'] = hit_evalue 
blast_hit_dict['frame'] = abs(hit_frame) blast_hit_dict['blast_hit_begin'] = ann_hit_b blast_hit_dict['blast_hit_end'] = ann_hit_e # Collect ORF and BLAST hit annotations for downstream use. ###### kakapo_json = [{}] kakapo_json[0]['kakapo_annotations__' + ss] = (a[ann_key + ss][target_name]) all_kakapo_results[target_name] = kakapo_json ################################################################## # -------------------------------------------------------------------- Log.msg('Assembly:', assmbl_name, False) Log.msg('Search Strategy:', ss, False) Log.msg('Transcripts:', str(len(transcripts_nt)), False) Log.msg('Transcripts with acceptable ORFs:', str(len(transcripts_with_acceptable_orfs)) + '\n' + '=' * 134, False) if len(transcripts_nt) > 0: write_fasta(transcripts_nt, transcripts_nt_fasta_file) a['transcripts_nt_fasta_file__' + ss] = transcripts_nt_fasta_file else: a['transcripts_nt_fasta_file__' + ss] = None if len(transcripts_nt_orf) > 0: write_fasta(transcripts_nt_orf, transcripts_nt_orf_fasta_file) a['transcripts_nt_orf_fasta_file__' + ss] = transcripts_nt_orf_fasta_file else: a['transcripts_nt_orf_fasta_file__' + ss] = None if len(transcripts_aa_orf) > 0: write_fasta(transcripts_aa_orf, transcripts_aa_orf_fasta_file) a['transcripts_aa_orf_fasta_file__' + ss] = transcripts_aa_orf_fasta_file else: a['transcripts_aa_orf_fasta_file__' + ss] = None # Save ORF and BLAST hit annotations for downstream use.-------------- with open(json_dump_file_path, 'w') as f: json.dump(all_kakapo_results, f, sort_keys=True, indent=4)
def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file, tblastn, blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc, blast_1_best_hit_overhang, blast_1_best_hit_score_edge, blast_1_max_target_seqs, dir_blast_results_fa_trim, fpatt, ss, threads, seqtk, vsearch, dir_cache_prj): changed_blast_1 = False if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0: print() Log.inf('Running BLAST on reads:', ss) if tblastn is None: Log.err('tblastn is not available. Cannot continue. Exiting.') exit(0) if vsearch is None: Log.err('vsearch is not available. Cannot continue. Exiting.') exit(0) if seqtk is None: Log.err('seqtk is not available. Cannot continue. Exiting.') exit(0) cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss) pickled = dict() settings = { 'blast_1_evalue': blast_1_evalue, 'blast_1_max_hsps': blast_1_max_hsps, 'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc, 'blast_1_best_hit_overhang': blast_1_best_hit_overhang, 'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge, 'blast_1_max_target_seqs': blast_1_max_target_seqs, 'queries': seq_records_to_dict(read_fasta(aa_queries_file, SEQ_TYPE_AA)) } Log.msg('evalue:', str(blast_1_evalue)) Log.msg('max_hsps:', str(blast_1_max_hsps)) Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc)) Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang)) Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge)) Log.msg('max_target_seqs:', str(blast_1_max_target_seqs)) print() # FixMe: Expose in configuration files? ident = 0.85 for se in se_fastq_files: dir_results = opj(dir_blast_results_fa_trim, se) blast_db_path = se_fastq_files[se]['blast_db_path'] fq_path = se_fastq_files[se]['filter_path_fq'] out_f = opj(dir_results, se + '__' + ss + '.txt') out_f_fastq = out_f.replace('.txt', '.fastq') out_f_fasta = out_f.replace('.txt', '.fasta') se_fastq_files[se]['blast_results_path' + '__' + ss] = out_f_fasta genetic_code = se_fastq_files[se]['gc_id'] if ope(out_f_fasta) and ope(cache_file): with open(cache_file, 'rb') as f: pickled = pickle.load(f) if ope(out_f_fasta) and pickled == settings: # Log.msg('The provided BLAST settings and query sequences did ' # 'not change since the previous run.') Log.msg('BLAST results already exist:', se) else: changed_blast_1 = True make_dirs(dir_results) Log.msg('Running tblastn on: ' + basename(blast_db_path), ss) run_blast(exec_file=tblastn, task='tblastn', threads=threads, db_path=blast_db_path, queries_file=aa_queries_file, out_file=out_f, evalue=blast_1_evalue, max_hsps=blast_1_max_hsps, qcov_hsp_perc=blast_1_qcov_hsp_perc, best_hit_overhang=blast_1_best_hit_overhang, best_hit_score_edge=blast_1_best_hit_score_edge, max_target_seqs=blast_1_max_target_seqs, db_genetic_code=genetic_code, out_cols=BLST_RES_COLS_1) Log.inf('Extracting unique BLAST hits using Seqtk:', ss) keep_unique_lines_in_file(out_f) seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f) seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta) osremove(out_f) osremove(out_f_fastq) out_f_fasta_temp = out_f_fasta + '_temp' copyfile(out_f_fasta, out_f_fasta_temp) run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta) osremove(out_f_fasta_temp) for pe in pe_fastq_files: dir_results = opj(dir_blast_results_fa_trim, pe) blast_db_paths = pe_fastq_files[pe]['blast_db_path'] fq_paths = pe_fastq_files[pe]['filter_path_fq'] out_fs = [x.replace('@D@', dir_results) for x in fpatt] out_fs = [x.replace('@N@', pe) for x in out_fs] out_fs = [x.replace('@Q@', ss) for x in out_fs] out_fs_fastq = [x.replace('.txt', 
'.fastq') for x in out_fs] out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs] out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta') pe_fastq_files[pe]['blast_results_path' + '__' + ss] = out_f_fasta genetic_code = pe_fastq_files[pe]['gc_id'] if ope(out_f_fasta) and ope(cache_file): with open(cache_file, 'rb') as f: pickled = pickle.load(f) if ope(out_f_fasta) and pickled == settings: # Log.msg('The provided BLAST settings and query sequences did ' # 'not change since the previous run.') Log.msg('BLAST results already exist:', pe) else: changed_blast_1 = True make_dirs(dir_results) pe_trim_files = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq, out_fs_fasta) for x in pe_trim_files: Log.msg('Running tblastn on: ' + basename(x[0]), ss) run_blast(exec_file=tblastn, task='tblastn', threads=threads, db_path=x[0], queries_file=aa_queries_file, out_file=x[1], evalue=blast_1_evalue, max_hsps=blast_1_max_hsps, qcov_hsp_perc=blast_1_qcov_hsp_perc, best_hit_overhang=blast_1_best_hit_overhang, best_hit_score_edge=blast_1_best_hit_score_edge, max_target_seqs=blast_1_max_target_seqs, db_genetic_code=genetic_code, out_cols=BLST_RES_COLS_1) Log.msg('Extracting unique BLAST hits using Seqtk:', ss) keep_unique_lines_in_file(x[1]) seqtk_extract_reads(seqtk, x[2], x[3], x[1]) seqtk_fq_to_fa(seqtk, x[3], x[4]) osremove(x[1]) osremove(x[3]) combine_text_files(out_fs_fasta, out_f_fasta) out_f_fasta_temp = out_f_fasta + '_temp' copyfile(out_f_fasta, out_f_fasta_temp) run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta) osremove(out_f_fasta_temp) for x in out_fs_fasta: osremove(x) with open(cache_file, 'wb') as f: pickle.dump(settings, f, protocol=PICKLE_PROTOCOL) return changed_blast_1
def dnld_kraken2_dbs(dbs_path):
    Log.inf('Checking for available Kraken2 databases.')
    kraken2_dbs = download_kraken2_dbs(dbs_path)
    for db in sorted(kraken2_dbs.keys()):
        Log.msg('Found Kraken2 database:', db)
    return kraken2_dbs
def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files,
                dir_fq_filter_data, confidence, kraken2, threads, dir_temp,
                fpatt):
    if (len(se_fastq_files) > 0 or len(pe_fastq_files) > 0) and len(order) > 0:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

    nuclear = None
    for nuc in order:
        if nuc[1] == 'nuclear':
            nuclear = nuc[0]
            break

    for se in se_fastq_files:
        if len(order) == 0:
            continue
        if se_fastq_files[se]['path'] is None:
            continue
        fq_path = se_fastq_files[se]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, se)

        if nuclear is None:
            out_f = opj(dir_fq_filter_data_sample, se + '.fastq')
        else:
            out_f = opj(dir_fq_filter_data_sample, nuclear, se + '.fastq')

        se_fastq_files[se]['filter_path_fq'] = out_f

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

    for pe in pe_fastq_files:
        if len(order) == 0:
            continue
        if pe_fastq_files[pe]['path'] is None:
            continue
        fq_path = pe_fastq_files[pe]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, pe)

        if nuclear is None:
            dir_name_nuclear = dir_fq_filter_data_sample
        else:
            dir_name_nuclear = dir_fq_filter_data_sample + ops + nuclear

        out_fs = [x.replace('@D@', dir_name_nuclear) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]

        pe_fastq_files[pe]['filter_path_fq'] = out_fs

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
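# run_kraken2() expects 'order' to be a sequence of pairs in which the first
# element is a database name and the second a category label; the first entry
# whose label is 'nuclear' decides the subdirectory that holds the reads
# surviving all filters. The pair layout is inferred from the loop above;
# the helper below is an illustrative sketch only:
def _nuclear_db_sketch(order):
    for nuc in order:
        if nuc[1] == 'nuclear':
            return nuc[0]
    return None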
def run_inter_pro_scan(ss, assemblies, email, dir_prj_ips, dir_cache_prj,
                       parallel_run_count, max_title_a_len, max_run_id_len):
    delay = 0.25

    for a in assemblies:
        if 'transcripts_aa_orf_fasta_file__' + ss not in a:
            continue

        aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

        if aa_file is None:
            continue

        assmbl_name = a['name']

        json_dump_file_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' +
                                  ss + '.json')

        if ope(json_dump_file_path):
            Log.inf('InterProScan results for assembly ' + assmbl_name + ', '
                    'search strategy ' + ss + ' have already been downloaded.')
            continue
        else:
            Log.inf('Running InterProScan on translated ' + ss + ' from ' +
                    assmbl_name + '.')

        seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

        # Filter all ORFs except the first one.
        for seq_def in tuple(seqs.keys()):
            seq_def_prefix = seq_def.split(' ')[0]
            if not seq_def_prefix.endswith('ORF001'):
                del seqs[seq_def]

        seqs = OrderedDict(sorted(seqs.items(),
                                  key=lambda x: x[0].split(' ')[1],
                                  reverse=True))

        run_id = ss + '_' + assmbl_name

        _ = opj(dir_cache_prj, 'ips5_cache_done_' + run_id)

        if ope(_):
            with open(_, 'rb') as f:
                jobs = pickle.load(f)
        else:
            jobs = job_runner(email=email, dir_cache=dir_cache_prj,
                              seqs=seqs, run_id=run_id,
                              parallel_run_count=parallel_run_count,
                              max_title_a_len=max_title_a_len,
                              max_run_id_len=max_run_id_len)

            with open(_, 'wb') as f:
                pickle.dump(jobs, f, protocol=PICKLE_PROTOCOL)

        Log.inf('Downloading InterProScan results for ' + ss + ' in ' +
                assmbl_name + '.')

        all_ips_results = {}

        # Nicer printing
        for i, job in enumerate(jobs['finished']):
            job_id = jobs['finished'][job]
            titles_ab = split_seq_defn(job)
            title_a = titles_ab[0]

            progress = round(((i + 1) / len(jobs['finished'])) * 100)
            progress_str = '{:3d}'.format(progress) + '%'

            msg = (' ' * 12 +
                   title_a.ljust(max_title_a_len) +
                   run_id.ljust(max_run_id_len) +
                   progress_str.rjust(4) + ' ' +
                   job_id)

            Log.msg(msg)

            sleep(delay)

            ips_json = result_json(job_id)
            if ips_json is None:
                continue
            # ips_version = ips_json['interproscan-version']
            ips_json = ips_json['results']

            # These fields are set to 'EMBOSS_001' by default.
            # Delete them.
            del ips_json[0]['xref']

            job_no_def = job.split(' ')[0]

            all_ips_results[job_no_def] = ips_json

        with open(json_dump_file_path, 'w') as f:
            json.dump(all_ips_results, f, sort_keys=True, indent=4)

        # Removes cached jobs file.
        osremove(_)
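# run_inter_pro_scan() submits only the first (highest-graded) ORF per
# transcript: records whose definition prefix ends in 'ORF001' are kept and
# everything else is dropped before the jobs are created. A minimal sketch of
# that key filter (illustrative only; not used by the pipeline):
def _keep_first_orf_sketch(seqs):
    """seqs: dict keyed by FASTA definition lines such as
    'transcript__ORF001 query_name'."""
    return {d: s for d, s in seqs.items()
            if d.split(' ')[0].endswith('ORF001')}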
def run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_data, bowtie2, bowtie2_build, threads, dir_temp, bt2_order, fpatt, taxonomy, dir_cache_refseqs): new_se_fastq_files = dict() new_pe_fastq_files = dict() msg_printed = False # SE for se in se_fastq_files: taxid = se_fastq_files[se]['tax_id'] dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2, bowtie2_build) in_f = se_fastq_files[se]['trim_path_fq'] in_f_orig = in_f if len(dbs) == 0: se_fastq_files[se]['filter_path_fq'] = in_f continue if msg_printed is False: print() Log.inf('Running Bowtie2.') msg_printed = True for i, db in enumerate(dbs): db_path = dbs[db] dir_fq_bt_data_sample = opj(dir_fq_filter_data, se, db) dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, se) new_se = se + '_' + db out_f = opj(dir_fq_bt_data_sample, new_se + '.fastq') out_f_un = opj(dir_temp, new_se + '_bt2_unaligned' + '.fastq') sam_f = opj(dir_fq_bt_data_sample, new_se + '.sam') new_se_fastq_files[new_se] = deepcopy(se_fastq_files[se]) new_se_fastq_files[new_se]['path'] = None new_se_fastq_files[new_se]['cor_path_fq'] = None new_se_fastq_files[new_se]['trim_path_fq'] = None taxid = new_se_fastq_files[new_se]['tax_id'] gc = new_se_fastq_files[new_se]['gc_id'] if db == MT: gc = taxonomy.mito_genetic_code_for_taxid(taxid) new_se_fastq_files[new_se]['gc_id'] = gc elif db == PT: gc = taxonomy.plastid_genetic_code_for_taxid(taxid) new_se_fastq_files[new_se]['gc_id'] = gc new_se_fastq_files[new_se]['gc_tt'] = TranslationTable(gc) new_se_fastq_files[new_se]['filter_path_fq'] = out_f if ope(dir_fq_bt_data_sample): Log.msg('Bowtie2 filtered FASTQ file already exists:', new_se) in_f = opj(dir_fq_bt_data_sample_un, se + '.fastq') else: Log.msg('SE mode:', new_se) make_dirs(dir_fq_bt_data_sample) db_fasta_path = None bt2_idx_path = None if db_path in (MT, PT): db_fasta_path = dnld_refseqs_for_taxid(taxid, db, taxonomy, dir_cache_refseqs, query='', db='nuccore') bt2_idx_path = splitext(db_fasta_path)[0] else: db_fasta_path = db_path bt2_idx_path = opj(dir_cache_refseqs, splitext(basename(db_fasta_path))[0]) if not ope(bt2_idx_path + '.1.bt2'): build_bt2_index(bowtie2_build, [db_fasta_path], bt2_idx_path, threads) run_bowtie2_se(bowtie2=bowtie2, input_file=in_f, output_file=out_f, output_file_un=out_f_un, sam_output_file=sam_f, index=bt2_idx_path, threads=threads, dir_temp=dir_temp) if i > 0: remove(in_f) in_f = out_f_un out_f_un = opj(dir_fq_bt_data_sample_un, se + '.fastq') se_fastq_files[se]['filter_path_fq'] = out_f_un if in_f != in_f_orig: move(in_f, out_f_un) se_fastq_files.update(new_se_fastq_files) # PE for pe in pe_fastq_files: taxid = pe_fastq_files[pe]['tax_id'] dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2, bowtie2_build) in_fs = pe_fastq_files[pe]['trim_path_fq'] in_fs_orig = tuple(in_fs) if len(dbs) == 0: pe_fastq_files[pe]['filter_path_fq'] = in_fs continue if msg_printed is False: print() Log.inf('Running Bowtie2.') msg_printed = True for i, db in enumerate(dbs): db_path = dbs[db] dir_fq_bt_data_sample = opj(dir_fq_filter_data, pe, db) dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, pe) new_pe = pe + '_' + db out_fs = [x.replace('@D@', dir_fq_bt_data_sample) for x in fpatt] out_fs = [x.replace('@N@', new_pe) for x in out_fs] out_fs_un = [x.replace('@D@', dir_temp) for x in fpatt] out_fs_un = [ x.replace('@N@', new_pe + '_bt2_unaligned') for x in out_fs_un ] sam_f = opj(dir_fq_bt_data_sample, new_pe + '.sam') new_pe_fastq_files[new_pe] = deepcopy(pe_fastq_files[pe]) new_pe_fastq_files[new_pe]['path'] = None 
new_pe_fastq_files[new_pe]['cor_path_fq'] = None new_pe_fastq_files[new_pe]['trim_path_fq'] = None taxid = new_pe_fastq_files[new_pe]['tax_id'] gc = new_pe_fastq_files[new_pe]['gc_id'] if db == MT: gc = taxonomy.mito_genetic_code_for_taxid(taxid) new_pe_fastq_files[new_pe]['gc_id'] = gc elif db == PT: gc = taxonomy.plastid_genetic_code_for_taxid(taxid) new_pe_fastq_files[new_pe]['gc_id'] = gc new_pe_fastq_files[new_pe]['gc_tt'] = TranslationTable(gc) new_pe_fastq_files[new_pe]['filter_path_fq'] = out_fs if ope(dir_fq_bt_data_sample): Log.msg('Bowtie2 filtered FASTQ files already exist:', new_pe) in_fs = [ x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt ] in_fs = [x.replace('@N@', pe) for x in in_fs] else: Log.msg('PE mode:', new_pe) make_dirs(dir_fq_bt_data_sample) db_fasta_path = None bt2_idx_path = None if db_path in (MT, PT): db_fasta_path = dnld_refseqs_for_taxid(taxid, db, taxonomy, dir_cache_refseqs, query='', db='nuccore') bt2_idx_path = splitext(db_fasta_path)[0] else: db_fasta_path = db_path bt2_idx_path = opj(dir_cache_refseqs, splitext(basename(db_fasta_path))[0]) if not ope(bt2_idx_path + '.1.bt2'): build_bt2_index(bowtie2_build, [db_fasta_path], bt2_idx_path, threads) paired_out_pattern = out_fs[0].replace('_paired_1.fastq', '_paired_%.fastq') paired_out_pattern_un = out_fs_un[0].replace( '_paired_1.fastq', '_paired_%.fastq') run_bowtie2_pe(bowtie2=bowtie2, input_files=in_fs, paired_out_pattern=paired_out_pattern, paired_out_pattern_un=paired_out_pattern_un, unpaired_out_1=out_fs[2], unpaired_out_2=out_fs[3], unpaired_out_1_un=out_fs_un[2], unpaired_out_2_un=out_fs_un[3], sam_output_file=sam_f, index=bt2_idx_path, threads=threads, dir_temp=dir_temp) if i > 0: remove(in_fs[0]) remove(in_fs[1]) remove(in_fs[2]) remove(in_fs[3]) in_fs = out_fs_un out_fs_un = [x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt] out_fs_un = [x.replace('@N@', pe) for x in out_fs_un] pe_fastq_files[pe]['filter_path_fq'] = out_fs_un if tuple(in_fs) != in_fs_orig: move(in_fs[0], out_fs_un[0]) move(in_fs[1], out_fs_un[1]) move(in_fs[2], out_fs_un[2]) move(in_fs[3], out_fs_un[3]) pe_fastq_files.update(new_pe_fastq_files)
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, threads, dir_temp, fpatt, should_run): if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0: print() if should_run is False: Log.wrn('Skipping Rcorrector as requested.') else: Log.inf('Running Rcorrector.') if rcorrector is None: Log.err( 'Rcorrector is not available. Cannot continue. Exiting.') exit(0) for se in se_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, se) fq_path = se_fastq_files[se]['trim_path_fq'] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path) log_f = opj(dir_fq_cor_data_sample, se + '.txt') out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext) se_fastq_files[se]['cor_path_fq'] = out_f if should_run is False: se_fastq_files[se]['cor_path_fq'] = fq_path continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ file already exists:', se) else: make_dirs(dir_fq_cor_data_sample) Log.msg('SE mode:', se) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path)) fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f) remove(fq_cor_path) for pe in pe_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe) fq_path_1 = pe_fastq_files[pe]['trim_path_fq'][0] fq_path_2 = pe_fastq_files[pe]['trim_path_fq'][1] fq_path_3 = pe_fastq_files[pe]['trim_path_fq'][2] fq_path_4 = pe_fastq_files[pe]['trim_path_fq'][3] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1) log_f = opj(dir_fq_cor_data_sample, pe + '_paired.txt') out_fs = [x.replace('@D@', dir_fq_cor_data_sample) for x in fpatt] out_fs = [x.replace('@N@', pe) for x in out_fs] out_fs = [x + ext for x in out_fs] pe_fastq_files[pe]['cor_path_fq'] = out_fs if should_run is False: pe_fastq_files[pe]['cor_path_fq'] = [ fq_path_1, fq_path_2, fq_path_3, fq_path_4 ] continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ files already exist:', pe) else: make_dirs(dir_fq_cor_data_sample) Log.msg('PE mode:', pe) run_rcorrector_pe(rcorrector=rcorrector, in_file_1=fq_path_1, in_file_2=fq_path_2, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1)) fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2)) fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext filter_unc_pe(in_file_1=fq_cor_path_1, in_file_2=fq_cor_path_2, out_file_1=out_fs[0], out_file_2=out_fs[1], log_file=log_f) remove(fq_cor_path_1) remove(fq_cor_path_2) # unpaired 1 if stat(fq_path_3).st_size != 0: run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_3, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_3 = opj(dir_fq_cor_data_sample, basename(fq_path_3)) fq_cor_path_3 = splitext_gz( fq_base_path_3)[0] + '.cor.fq' + ext log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired_1.txt') filter_unc_se(in_file=fq_cor_path_3, out_file=out_fs[2], log_file=log_f_3) remove(fq_cor_path_3) else: with open(out_fs[2], 'w') as f: f.write('') # unpaired 2 if stat(fq_path_4).st_size != 0: run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_4, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_4 = opj(dir_fq_cor_data_sample, basename(fq_path_4)) fq_cor_path_4 = splitext_gz( fq_base_path_4)[0] + '.cor.fq' + ext log_f_4 = 
opj(dir_fq_cor_data_sample, pe + '_unpaired_2.txt') filter_unc_se(in_file=fq_cor_path_4, out_file=out_fs[3], log_file=log_f_4) remove(fq_cor_path_4) else: with open(out_fs[3], 'w') as f: f.write('')
def main(): """Run the script.""" # Prepare initial logger (before we know the log file path) -------------- prj_log_file_suffix = time_stamp() + '.log' log_stream = StringIO() Log.set_colors(COLORS) Log.set_file(log_stream) Log.set_write(True) # Prepare configuration directory ---------------------------------------- if ope(DIR_CFG): Log.inf('Found configuration directory:', DIR_CFG) else: Log.wrn('Creating configuration directory:', DIR_CFG) make_dirs(DIR_CFG) print() # Check for dependencies ------------------------------------------------- Log.inf('Checking for dependencies.') make_dirs(DIR_DEP) make_dirs(DIR_KRK) seqtk = deps.dep_check_seqtk(DIR_DEP, FORCE_DEPS) trimmomatic, adapters = deps.dep_check_trimmomatic(DIR_DEP) fasterq_dump = deps.dep_check_sra_toolkit(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) makeblastdb, _, tblastn = deps.dep_check_blast(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) vsearch = deps.dep_check_vsearch(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) spades = deps.dep_check_spades(DIR_DEP, OS_ID, FORCE_DEPS) bowtie2, bowtie2_build = deps.dep_check_bowtie2(DIR_DEP, OS_ID, FORCE_DEPS) rcorrector = deps.dep_check_rcorrector(DIR_DEP, FORCE_DEPS) kraken2, kraken2_build = deps.dep_check_kraken2(DIR_DEP, OS_ID, RELEASE_NAME, FORCE_DEPS) print() kraken2_dbs = deps.dnld_kraken2_dbs(DIR_KRK) if INSTALL_DEPS is True or DNLD_KRAKEN_DBS is True: exit(0) print() # Initialize NCBI taxonomy database -------------------------------------- tax = Taxonomy() if tax.is_initialized() is False: tax.init(data_dir_path=DIR_TAX, logger=Log) print() # Parse configuration file ----------------------------------------------- Log.inf('Reading configuration file:', CONFIG_FILE_PATH) _ = config_file_parse(CONFIG_FILE_PATH, tax) allow_no_stop_cod = _['allow_no_stop_cod'] allow_no_strt_cod = _['allow_no_strt_cod'] allow_non_aug = _['allow_non_aug'] blast_1_evalue = _['blast_1_evalue'] blast_1_max_hsps = _['blast_1_max_hsps'] blast_1_qcov_hsp_perc = _['blast_1_qcov_hsp_perc'] blast_1_best_hit_overhang = _['blast_1_best_hit_overhang'] blast_1_best_hit_score_edge = _['blast_1_best_hit_score_edge'] blast_1_max_target_seqs = _['blast_1_max_target_seqs'] blast_2_evalue = _['blast_2_evalue'] blast_2_max_hsps = _['blast_2_max_hsps'] blast_2_qcov_hsp_perc = _['blast_2_qcov_hsp_perc'] blast_2_best_hit_overhang = _['blast_2_best_hit_overhang'] blast_2_best_hit_score_edge = _['blast_2_best_hit_score_edge'] blast_2_max_target_seqs = _['blast_2_max_target_seqs'] dir_out = _['output_directory'] email = _['email'] requery_after = _['requery_after'] fq_pe = _['fq_pe'] fq_se = _['fq_se'] should_run_rcorrector = _['should_run_rcorrector'] should_run_ipr = _['should_run_ipr'] bt2_order = _['bt2_order'] kraken_confidence = _['kraken_confidence'] krkn_order = _['krkn_order'] prepend_assmbl = _['prepend_assmbl'] prj_name = _['project_name'] sras = _['sras'] tax_group = _['tax_group'] # tax_group_name = _['tax_group_name'] tax_ids_user = _['tax_ids'] user_assemblies = _['assmbl'] print() # Parse search strategies file ------------------------------------------- if SS_FILE_PATH is not None: Log.inf('Reading search strategies file:', SS_FILE_PATH) sss = ss_file_parse(SS_FILE_PATH) else: Log.wrn('Search strategies file was not provided.\n' + 'Will process reads, assemblies and then stop.') sss = dict() print() # Create output directory ------------------------------------------------ if dir_out is not None: if ope(dir_out): Log.inf('Found output directory:', 
dir_out) else: Log.wrn('Creating output directory:', dir_out) make_dirs(dir_out) print() # Write Kakapo version information to the output directory --------------- version_file = opj(dir_out, 'kakapo_version.txt') if ope(version_file): with open(version_file, 'r') as f: version_prev = f.read().strip() if __version__ != version_prev: Log.wrn('The output directory contains data produced by a ' + 'different version of Kakapo: ' + version_prev + '.\nThe currently running version is: ' + __version__ + '.\n' + 'Delete "kakapo_version.txt" file located in the ' + 'output directory if you would like to continue.') exit(0) with open(version_file, 'w') as f: f.write(__version__) # Create subdirectories in the output directory -------------------------- _ = prepare_output_directories(dir_out, prj_name) dir_temp = _['dir_temp'] dir_cache_pfam_acc = _['dir_cache_pfam_acc'] dir_cache_fq_minlen = _['dir_cache_fq_minlen'] dir_cache_prj = _['dir_cache_prj'] dir_cache_refseqs = _['dir_cache_refseqs'] dir_prj_logs = _['dir_prj_logs'] dir_prj_queries = _['dir_prj_queries'] dir_fq_data = _['dir_fq_data'] dir_fq_cor_data = _['dir_fq_cor_data'] dir_fq_trim_data = _['dir_fq_trim_data'] dir_fq_filter_bt2_data = _['dir_fq_filter_bt2_data'] dir_fq_filter_krkn2_data = _['dir_fq_filter_krkn2_data'] dir_fa_trim_data = _['dir_fa_trim_data'] dir_blast_fa_trim = _['dir_blast_fa_trim'] dir_prj_blast_results_fa_trim = _['dir_prj_blast_results_fa_trim'] dir_prj_vsearch_results_fa_trim = _['dir_prj_vsearch_results_fa_trim'] dir_prj_spades_assemblies = _['dir_prj_spades_assemblies'] dir_prj_blast_assmbl = _['dir_prj_blast_assmbl'] dir_prj_assmbl_blast_results = _['dir_prj_assmbl_blast_results'] dir_prj_transcripts = _['dir_prj_transcripts'] dir_prj_ips = _['dir_prj_ips'] dir_prj_transcripts_combined = _['dir_prj_transcripts_combined'] # Prepare logger --------------------------------------------------------- prj_log_file = opj(dir_prj_logs, prj_name + '_' + prj_log_file_suffix) with open(prj_log_file, 'w') as f: f.write(SCRIPT_INFO.strip() + '\n\n' + log_stream.getvalue()) Log.set_colors(COLORS) Log.set_file(prj_log_file) Log.set_write(True) log_stream.close() # Resolve descending taxonomy nodes -------------------------------------- tax_ids = tax.all_descending_taxids_for_taxids([tax_group]) # Pfam uniprot accessions ------------------------------------------------ pfam_uniprot_acc = OrderedDict() for ss in sss: pfam_acc = sss[ss]['pfam_families'] pfam_uniprot_acc[ss] = pfam_uniprot_accessions(ss, pfam_acc, tax_ids, dir_cache_pfam_acc) # Download Pfam uniprot sequences if needed ------------------------------ aa_uniprot_files = OrderedDict() for ss in sss: aa_uniprot_files[ss] = opj(dir_prj_queries, 'aa_uniprot__' + ss + '.fasta') # ToDo: add support for the requery_after parameter. 
dnld_pfam_uniprot_seqs(ss, pfam_uniprot_acc[ss], aa_uniprot_files[ss], dir_cache_prj) # User provided entrez query --------------------------------------------- prot_acc_user_from_query = OrderedDict() for ss in sss: entrez_queries = sss[ss]['entrez_search_queries'] prot_acc_user_from_query[ss] = user_entrez_search( ss, entrez_queries, dir_cache_prj, requery_after) # User provided protein accessions --------------------------------------- prot_acc_user = OrderedDict() for ss in sss: print() prot_acc_all = sorted( set(sss[ss]['ncbi_accessions_aa'] + prot_acc_user_from_query[ss])) prot_acc_user[ss] = user_protein_accessions(ss, prot_acc_all, dir_cache_prj, tax) # Download from NCBI if needed ------------------------------------------- aa_prot_ncbi_files = OrderedDict() for ss in sss: aa_prot_ncbi_files[ss] = opj(dir_prj_queries, 'aa_prot_ncbi__' + ss + '.fasta') prot_acc_user[ss] = dnld_prot_seqs(ss, prot_acc_user[ss], aa_prot_ncbi_files[ss], dir_cache_prj) # User provided protein sequences ---------------------------------------- aa_prot_user_files = OrderedDict() for ss in sss: user_queries = sss[ss]['fasta_files_aa'] aa_prot_user_files[ss] = opj(dir_prj_queries, 'aa_prot_user__' + ss + '.fasta') user_aa_fasta(ss, user_queries, aa_prot_user_files[ss]) # Combine all AA queries ------------------------------------------------- print() aa_queries_files = OrderedDict() for ss in sss: aa_queries_files[ss] = opj(dir_prj_queries, 'aa_all__' + ss + '.fasta') combine_aa_fasta(ss, [ aa_uniprot_files[ss], aa_prot_ncbi_files[ss], aa_prot_user_files[ss] ], aa_queries_files[ss]) # Filter AA queries ------------------------------------------------------ prot_acc_user_filtered = OrderedDict() for ss in sss: min_query_length = sss[ss]['min_query_length'] max_query_length = sss[ss]['max_query_length'] max_query_identity = sss[ss]['max_query_identity'] # Dereplicate all queries filter_queries(ss, aa_queries_files[ss], min_query_length, max_query_length, max_query_identity, vsearch, prot_acc_user[ss], overwrite=True) # Dereplicate only NCBI queries. CDS for these will be downloaded # later for reference. 
if ope(aa_prot_ncbi_files[ss]): prot_acc_user_filtered[ss] = filter_queries(ss, aa_prot_ncbi_files[ss], min_query_length, max_query_length, max_query_identity, vsearch, prot_acc_user[ss], overwrite=False, logging=False) # Download SRA run metadata if needed ------------------------------------ sra_runs_info, sras_acceptable = dnld_sra_info(sras, dir_cache_prj) # Download SRA run FASTQ files if needed --------------------------------- x, y, z = dnld_sra_fastq_files(sras_acceptable, sra_runs_info, dir_fq_data, fasterq_dump, THREADS, dir_temp) se_fastq_files_sra = x pe_fastq_files_sra = y sra_runs_info = z # User provided FASTQ files ---------------------------------------------- se_fastq_files_usr, pe_fastq_files_usr = user_fastq_files(fq_se, fq_pe) # Collate FASTQ file info ------------------------------------------------ se_fastq_files = se_fastq_files_sra.copy() se_fastq_files.update(se_fastq_files_usr) pe_fastq_files = pe_fastq_files_sra.copy() pe_fastq_files.update(pe_fastq_files_usr) def gc_tt(k, d, tax): taxid = d[k]['tax_id'] gc = tax.genetic_code_for_taxid(taxid) d[k]['gc_id'] = gc d[k]['gc_tt'] = TranslationTable(gc) gc_mito = None tt_mito = None gc_plastid = None tt_plastid = None if tax.is_eukaryote(taxid) is True: gc_mito = tax.mito_genetic_code_for_taxid(taxid) if gc_mito != '0': tt_mito = TranslationTable(gc_mito) if tax.contains_plastid(taxid) is True: gc_plastid = tax.plastid_genetic_code_for_taxid(taxid) if gc_plastid != '0': tt_plastid = TranslationTable(gc_plastid) d[k]['gc_id_mito'] = gc_mito d[k]['gc_tt_mito'] = tt_mito d[k]['gc_id_plastid'] = gc_plastid d[k]['gc_tt_plastid'] = tt_plastid for se in se_fastq_files: gc_tt(se, se_fastq_files, tax) for pe in pe_fastq_files: gc_tt(pe, pe_fastq_files, tax) # Minimum acceptable read length ----------------------------------------- min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp, dir_cache_fq_minlen, vsearch) # Run Rcorrector --------------------------------------------------------- run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, THREADS, dir_temp, should_run_rcorrector) # File name patterns ----------------------------------------------------- a, b, c, d, e = file_name_patterns() pe_trim_fq_file_patterns = a pe_trim_fa_file_patterns = b pe_blast_db_file_patterns = c pe_blast_results_file_patterns = d pe_vsearch_results_file_patterns = e # Run Trimmomatic -------------------------------------------------------- run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data, trimmomatic, adapters, pe_trim_fq_file_patterns, THREADS) # Run Bowtie 2 ----------------------------------------------------------- run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_bt2_data, bowtie2, bowtie2_build, THREADS, dir_temp, bt2_order, pe_trim_fq_file_patterns, tax, dir_cache_refseqs) # Run Kraken2 ------------------------------------------------------------ run_kraken2(krkn_order, kraken2_dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_krkn2_data, kraken_confidence, kraken2, THREADS, dir_temp, pe_trim_fq_file_patterns) se_fastq_files = OrderedDict(se_fastq_files) pe_fastq_files = OrderedDict(pe_fastq_files) se_fastq_files = OrderedDict( sorted(se_fastq_files.items(), key=lambda x: x[1]['filter_path_fq'])) pe_fastq_files = OrderedDict( sorted(pe_fastq_files.items(), key=lambda x: x[1]['filter_path_fq'])) # Stop After Filter ------------------------------------------------------ if STOP_AFTER_FILTER is True: Log.wrn('Stopping after Kraken2/Bowtie2 filtering step as requested.') exit(0) # Convert 
filtered FASTQ files to FASTA ---------------------------------- filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk, pe_trim_fa_file_patterns) # Run makeblastdb on reads ----------------------------------------------- makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim, makeblastdb, pe_blast_db_file_patterns) # Check if there are any query sequences. any_queries = False for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue else: any_queries = True # Run tblastn on reads --------------------------------------------------- for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue changed_blast_1 = run_tblastn_on_reads( se_fastq_files, pe_fastq_files, aa_queries_files[ss], tblastn, blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc, blast_1_best_hit_overhang, blast_1_best_hit_score_edge, blast_1_max_target_seqs, dir_prj_blast_results_fa_trim, pe_blast_results_file_patterns, ss, THREADS, seqtk, vsearch, dir_cache_prj) if changed_blast_1 is True: if ope(dir_prj_vsearch_results_fa_trim): rmtree(dir_prj_vsearch_results_fa_trim) if ope(dir_prj_spades_assemblies): rmtree(dir_prj_spades_assemblies) if ope(dir_prj_blast_assmbl): rmtree(dir_prj_blast_assmbl) if ope(dir_prj_assmbl_blast_results): rmtree(dir_prj_assmbl_blast_results) if ope(dir_prj_transcripts): rmtree(dir_prj_transcripts) if ope(dir_prj_transcripts_combined): rmtree(dir_prj_transcripts_combined) prepare_output_directories(dir_out, prj_name) # Run vsearch on reads --------------------------------------------------- # should_run_vsearch = False # for ss in sss: # if stat(aa_queries_files[ss]).st_size == 0: # continue # else: # should_run_vsearch = True # break # if should_run_vsearch is True: # print() # Log.inf('Checking if Vsearch should be run.') for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue print() Log.inf('Checking if Vsearch should be run:', ss) run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch, dir_prj_vsearch_results_fa_trim, pe_vsearch_results_file_patterns, ss, seqtk) # Run SPAdes ------------------------------------------------------------- # should_run_spades = False # for ss in sss: # if stat(aa_queries_files[ss]).st_size == 0: # continue # else: # should_run_spades = True # break # if should_run_spades is True: # print() # Log.inf('Checking if SPAdes should be run.') for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: for se in se_fastq_files: se_fastq_files[se]['spades_assembly' + '__' + ss] = None for pe in pe_fastq_files: pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None continue print() Log.inf('Checking if SPAdes should be run:', ss) run_spades(se_fastq_files, pe_fastq_files, dir_prj_spades_assemblies, spades, dir_temp, ss, THREADS, RAM) # Combine SPAdes and user provided assemblies ---------------------------- assemblies = combine_assemblies(se_fastq_files, pe_fastq_files, user_assemblies, tax, sss) # Run makeblastdb on assemblies ----------------------------------------- makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb) if any_queries is False: Log.wrn('No query sequences were provided.') # Run tblastn on assemblies ---------------------------------------------- for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue should_run_tblastn = False for a in assemblies: assmbl_src = a['src'] assmbl_name = a['name'] if assmbl_src != 'user_fasta': if assmbl_name.endswith('__' + ss): should_run_tblastn = True break else: should_run_tblastn = True break if should_run_tblastn is False: 
            print()
            Log.inf('Will not run BLAST. No transcripts exist:', ss)
            continue

        blast_2_evalue_ss = sss[ss]['blast_2_evalue']
        blast_2_max_hsps_ss = sss[ss]['blast_2_max_hsps']
        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']
        blast_2_best_hit_overhang_ss = sss[ss]['blast_2_best_hit_overhang']
        blast_2_best_hit_score_edge_ss = sss[ss]['blast_2_best_hit_score_edge']
        blast_2_max_target_seqs_ss = sss[ss]['blast_2_max_target_seqs']

        if blast_2_evalue_ss is None:
            blast_2_evalue_ss = blast_2_evalue
        if blast_2_max_hsps_ss is None:
            blast_2_max_hsps_ss = blast_2_max_hsps
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc
        if blast_2_best_hit_overhang_ss is None:
            blast_2_best_hit_overhang_ss = blast_2_best_hit_overhang
        if blast_2_best_hit_score_edge_ss is None:
            blast_2_best_hit_score_edge_ss = blast_2_best_hit_score_edge
        if blast_2_max_target_seqs_ss is None:
            blast_2_max_target_seqs_ss = blast_2_max_target_seqs

        run_tblastn_on_assemblies(
            ss, assemblies, aa_queries_files[ss], tblastn,
            dir_prj_assmbl_blast_results, blast_2_evalue_ss,
            blast_2_max_hsps_ss, blast_2_qcov_hsp_perc_ss,
            blast_2_best_hit_overhang_ss, blast_2_best_hit_score_edge_ss,
            blast_2_max_target_seqs_ss, THREADS, dir_cache_prj, dir_prj_ips)

    # Prepare BLAST hits for analysis: find ORFs, translate ------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        min_target_orf_len_ss = sss[ss]['min_target_orf_length']
        max_target_orf_len_ss = sss[ss]['max_target_orf_length']
        organelle = sss[ss]['organelle']

        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc

        find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk,
                            dir_temp, prepend_assmbl, min_target_orf_len_ss,
                            max_target_orf_len_ss, allow_non_aug,
                            allow_no_strt_cod, allow_no_stop_cod, tax,
                            tax_group, tax_ids_user, blast_2_qcov_hsp_perc_ss,
                            organelle)

    # GFF3 files from kakapo results JSON files ------------------------------
    # print()
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        gff_from_json(ss, assemblies, dir_prj_ips,
                      dir_prj_transcripts_combined, prj_name)

    # Run InterProScan 5 -----------------------------------------------------
    if should_run_ipr is True:
        print()
        ss_names = tuple(sss.keys())

        # Determine the length of printed strings, for better spacing --------
        max_title_a_len = 0
        max_run_id_len = 0
        for a in assemblies:
            for ss in ss_names:
                if 'transcripts_aa_orf_fasta_file__' + ss not in a:
                    continue

                aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

                if aa_file is None:
                    continue

                assmbl_name = a['name']
                run_id = ss + '_' + assmbl_name
                max_run_id_len = max(len(run_id), max_run_id_len)

                seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

                # Filter all ORFs except the first one.
                for seq_def in tuple(seqs.keys()):
                    seq_def_prefix = seq_def.split(' ')[0]
                    if seq_def_prefix.endswith('ORF001'):
                        max_title_a_len = max(len(seq_def_prefix),
                                              max_title_a_len)

        max_title_a_len += 2
        max_run_id_len += 2
        # --------------------------------------------------------------------

        parallel_run_count = min(THREADS, len(ss_names))

        def run_inter_pro_scan_parallel(ss):
            if stat(aa_queries_files[ss]).st_size == 0:
                return

            run_inter_pro_scan(ss, assemblies, email, dir_prj_ips,
                               dir_cache_prj, parallel_run_count,
                               max_title_a_len, max_run_id_len)

            # GFF3 files from kakapo and InterProScan 5 results JSON files
            gff_from_json(ss, assemblies, dir_prj_ips,
                          dir_prj_transcripts_combined, prj_name)

        Parallel(n_jobs=parallel_run_count, verbose=0, require='sharedmem')(
            delayed(run_inter_pro_scan_parallel)(ss) for ss in ss_names)

    # Download CDS for NCBI protein queries ----------------------------------
    print()
    prot_cds_ncbi_files = OrderedDict()

    def dnld_cds_for_ncbi_prot_acc_parallel(ss):
        if stat(aa_queries_files[ss]).st_size == 0:
            return

        if ss not in prot_acc_user_filtered:
            return

        prot_cds_ncbi_files[ss] = opj(
            dir_prj_transcripts_combined,
            prj_name + '_ncbi_query_cds__' + ss + '.fasta')

        if len(prot_acc_user_filtered[ss]) > 0:
            dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user_filtered[ss],
                                       prot_cds_ncbi_files[ss], tax,
                                       dir_cache_prj)

    ss_names = tuple(sss.keys())
    Parallel(n_jobs=2, verbose=0, require='sharedmem')(
        delayed(dnld_cds_for_ncbi_prot_acc_parallel)(ss) for ss in ss_names)

    # ------------------------------------------------------------------------
    rmtree(dir_temp)

    # ------------------------------------------------------------------------
    rerun = input('\nRepeat ([y]/n)? ').lower().strip()
    if rerun.startswith('y') or rerun == '':
        print()
        return False
    else:
        print('\nExiting...')
        return True
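# NOTE (illustrative sketch, not part of the pipeline): both Parallel(...)
# calls above use joblib with require='sharedmem', which forces the threading
# backend so that the nested worker functions can safely mutate dictionaries
# from the enclosing scope (e.g. prot_cds_ncbi_files). The helper name below
# is made up for this example; it is never called by the pipeline.
def _example_sharedmem_parallel(items):
    """Run a trivial worker over 'items' with joblib's sharedmem requirement."""
    results = {}

    def _work(item):
        # Writing into 'results' is safe: sharedmem means threads, not
        # separate processes with copied state.
        results[item] = len(item)

    Parallel(n_jobs=2, verbose=0, require='sharedmem')(
        delayed(_work)(i) for i in items)
    return results
# e.g. _example_sharedmem_parallel(['a', 'bb']) -> {'a': 1, 'bb': 2}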
def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump,
                         threads, dir_temp):

    if len(sras) > 0:
        if fasterq_dump is None:
            Log.err('fasterq-dump from SRA Toolkit is not available. '
                    'Cannot continue. Exiting.')
            exit(0)

        print()
        Log.inf('Downloading SRA read data.')

    se_fastq_files = {}
    pe_fastq_files = {}

    for sra in sras:
        sra_run_info = sra_runs_info[sra]
        sra_lib_layout = sra_run_info['LibraryLayout'].lower()
        sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower()
        sample_base_name = sra_run_info['KakapoSampleBaseName']
        sra_taxid = int(sra_run_info['TaxID'])
        avg_len = int(sra_run_info['avgLength'])

        sra_dnld_needed = False

        if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
            se_file = opj(dir_fq_data, sra + '.fastq')
            se_fastq_files[sample_base_name] = {'path': se_file}
            se_fastq_files[sample_base_name]['src'] = 'sra'
            se_fastq_files[sample_base_name]['avg_len'] = avg_len
            se_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if not ope(se_file):
                sra_dnld_needed = True

        elif sra_lib_layout == 'paired':
            pe_file_1 = opj(dir_fq_data, sra + '_1.fastq')
            pe_file_2 = opj(dir_fq_data, sra + '_2.fastq')
            pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq')
            pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq')
            pe_fastq_files[sample_base_name] = {
                'path': [pe_file_1_renamed, pe_file_2_renamed]}
            pe_fastq_files[sample_base_name]['src'] = 'sra'
            pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2
            pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid

            if sra_lib_layout_k == 'paired_unp':
                pe_file_3 = opj(dir_fq_data, sra + '.fastq')
                pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq')
                pe_fastq_files[sample_base_name]['path'].append(
                    pe_file_3_renamed)

            if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed):
                sra_dnld_needed = True

        if not sra_dnld_needed:
            Log.msg('FASTQ reads are available locally:', sample_base_name)

        retry_count = 0
        while sra_dnld_needed:

            if retry_count > 50:
                Log.err('Download failed. Exiting.')
                rmtree(dir_temp)
                exit(1)

            elif retry_count > 0:
                Log.wrn('Download failed. Retrying.')
                sleep(2)

            retry_count += 1

            Log.msg('Downloading FASTQ reads for:', sample_base_name)

            cmd = [fasterq_dump,
                   '--threads', str(threads * 2),
                   '--split-3',
                   '--bufsize', '819200',
                   '--outdir', dir_fq_data,
                   '--temp', dir_temp, sra]

            run(cmd, do_not_raise=True)

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if not ope(se_file):
                    continue

            elif sra_lib_layout == 'paired':
                if not ope(pe_file_1) or not ope(pe_file_2):
                    continue
                else:
                    move(pe_file_1, pe_file_1_renamed)
                    move(pe_file_2, pe_file_2_renamed)

                if sra_lib_layout_k == 'paired_unp':
                    if not ope(pe_file_3):
                        continue
                    else:
                        move(pe_file_3, pe_file_3_renamed)

            sra_dnld_needed = False

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if ope(se_file):
                    Log.msg('Renaming FASTQ reads in:', se_file)
                    rename_fq_seqs(se_file, sra, '1:N:0')

            elif sra_lib_layout == 'paired':
                if ope(pe_file_1_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed)
                    rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0')
                if ope(pe_file_2_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed)
                    rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0')
                if sra_lib_layout_k == 'paired_unp':
                    if ope(pe_file_3_renamed):
                        Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed)
                        rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired',
                                       '1:N:0')

    return se_fastq_files, pe_fastq_files, sra_runs_info
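# Illustration only (hypothetical helper, never called by the pipeline): the
# on-disk FASTQ naming convention assumed by dnld_sra_fastq_files() above.
# fasterq-dump writes <SRA>.fastq for single-end runs and <SRA>_1.fastq /
# <SRA>_2.fastq (plus an optional <SRA>.fastq holding unpaired mates) for
# paired runs; the function then renames the paired outputs to _R1/_R2/_R3.
def _example_sra_fastq_paths(sra, dir_fq_data, layout, layout_k):
    """Sketch of the final FASTQ paths for a given SRA run and layout."""
    if layout == 'single' or layout_k == 'single':
        return [opj(dir_fq_data, sra + '.fastq')]
    paths = [opj(dir_fq_data, sra + '_R1.fastq'),
             opj(dir_fq_data, sra + '_R2.fastq')]
    if layout_k == 'paired_unp':
        # Unpaired mates from a paired run end up in a third file.
        paths.append(opj(dir_fq_data, sra + '_R3.fastq'))
    return paths
# e.g. _example_sra_fastq_paths('SRR7829961', 'fq_data', 'paired', 'paired_unp')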
def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch):
    # lowest allowable
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        return None

    __ = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}
    if ope(__):
        with open(__, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue
        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue
        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]

        else:
            # ----------------------------------------------------------------
            # Use 'vsearch --fastq_stats'. About 2x slower than the
            # approx_avg_read_len_fq function.
            #
            # cmd = [vsearch, '--fastq_stats', x[1], '--log', x[2]]
            # run(cmd, do_not_raise=True)
            # with open(x[2]) as f:
            #     stats = f.read()
            # remove(x[2])
            # ml = re.findall(r'>=\s+(\d+)', stats)
            # if len(ml) != 0:
            #     ml = max(int(ml[0]) // 3, low)
            # else:
            #     ml = None
            # ----------------------------------------------------------------
            # 22:59:12 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 22:59:46 50 nt: Schlumbergera_truncata_15H-02_pol_S47 34s
            # 23:00:30 50 nt: Schlumbergera_truncata_15H-02_sty_S49 44s
            # ----------------------------------------------------------------

            # ----------------------------------------------------------------
            ml = approx_avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            # ----------------------------------------------------------------
            # 23:12:06 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 23:12:20 50 nt: Schlumbergera_truncata_15H-02_pol_S47 14s
            # 23:12:39 50 nt: Schlumbergera_truncata_15H-02_sty_S49 19s
            # ----------------------------------------------------------------

            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml
        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

    with open(__, 'wb') as f:
        pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)
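# Worked example (illustration only, not used by the pipeline): the minimum
# acceptable read length heuristic applied above is one third of the average
# read length, floored at 35 nt. The helper name is hypothetical.
def _example_min_acc_len(avg_read_len, low=35):
    """max(avg_read_len // 3, low); e.g. 151 -> 50, 90 -> 35."""
    return max(int(avg_read_len) // 3, low)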
def dnld_sra_info(sras, dir_cache_prj):

    sra_runs_info = {}
    sras_acceptable = []

    if len(sras) > 0:
        print()
        Log.inf('Downloading SRA run information.')
    else:
        return sra_runs_info, sras_acceptable

    __ = opj(dir_cache_prj, 'sra_runs_info_cache')

    if ope(__):
        with open(__, 'rb') as f:
            sra_runs_info = pickle.load(f)

    sras_local = [k for k in sra_runs_info.keys()]
    sras_to_dnld = set(sras).difference(set(sras_local))
    if len(sras_to_dnld) > 0:
        temp = sra_run_info(list(sras_to_dnld))
        new_sra_runs_info = {i['Run']: i for i in temp}
        sra_runs_info.update(new_sra_runs_info)

    for sra in sras:

        if sra in sra_runs_info:

            info = sra_runs_info[sra]

            sra_lib_layout = info['LibraryLayout'].lower()
            sra_lib_source = info['LibrarySource'].lower()
            sra_lib_strategy = info['LibraryStrategy']
            sra_seq_platform = info['Platform'].lower().capitalize()
            sra_seq_platform_model = info['Model']
            sra_species = info['ScientificName']
            sra_taxid = info['TaxID']
            sra_spots = int(info['spots'])
            sra_spots_with_mates = int(info['spots_with_mates'])

            sample_base_name = (sra_species.replace(' ', '_') + '_' +
                                sra_taxid + '_' + sra)

            sra_runs_info[sra]['KakapoSampleBaseName'] = sample_base_name

            src_check = sra_lib_source.lower()
            strategy_check = sra_lib_strategy.lower()

            if not ('transcript' in src_check or
                    'rna' in src_check or
                    'rna' in strategy_check):

                sra_info_str = (
                    '{sra}: the SRA library source type "{ltype}" '
                    'or library strategy "{strategy}" '
                    'is not supported.').format(sra=sra,
                                                ltype=sra_lib_source,
                                                strategy=sra_lib_strategy)

                Log.err(sra_info_str, 'Skipping.')

            elif sra_seq_platform != 'Illumina':
                sra_info_str = ('{sra}: the SRA library sequencing platform '
                                '"{plat}" is not supported.').format(
                                    sra=sra, plat=sra_seq_platform)

                Log.err(sra_info_str, 'Skipping.')

            else:
                # sra_info_str = ('SRA run {sra} {strategy} ({source}) '
                #                 '{layout}-end library.\n'
                #                 'Sourced from {species} '
                #                 '(TaxID: {txid}).\n'
                #                 'Sequenced using {platform} platform on '
                #                 '{model}.').format(
                #                     sra=sra,
                #                     source=sra_lib_source.title(),
                #                     strategy=sra_lib_strategy,
                #                     layout=sra_lib_layout,
                #                     platform=sra_seq_platform,
                #                     model=sra_seq_platform_model,
                #                     species=sra_species,
                #                     txid=sra_taxid)

                Log.msg('{sra}:'.format(sra=sra),
                        '{strategy} {layout}-end library ({source}).'.format(
                            strategy=sra_lib_strategy,
                            layout=sra_lib_layout,
                            source=sra_lib_source.title()))

                Log.msg('    Source:',
                        '{species} (TaxID: {txid}).'.format(
                            species=sra_species, txid=sra_taxid),
                        False)

                Log.msg('Technology:',
                        '{platform} platform on {model}.'.format(
                            platform=sra_seq_platform,
                            model=sra_seq_platform_model),
                        False)

                sra_runs_info[sra]['KakapoLibraryLayout'] = \
                    sra_runs_info[sra]['LibraryLayout']

                if sra_lib_layout == 'paired' and sra_spots_with_mates == 0:
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'SINGLE'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but only a single set of reads '
                    #     'is available. Treating as single-ended.')

                elif (sra_lib_layout == 'paired' and
                        sra_spots != sra_spots_with_mates):
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'PAIRED_UNP'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but not all reads are paired.')

                sras_acceptable.append(sra)

                # Log.msg(sra_info_str)

    with open(__, 'wb') as f:
        pickle.dump(sra_runs_info, f, protocol=PICKLE_PROTOCOL)

    return sra_runs_info, sras_acceptable
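# Illustration only (hypothetical helper, never called anywhere): how
# dnld_sra_info() above derives KakapoSampleBaseName and reclassifies the
# reported library layout from the SRA spot counts.
def _example_kakapo_layout(scientific_name, taxid, sra, layout, spots,
                           spots_with_mates):
    base_name = (scientific_name.replace(' ', '_') + '_' + str(taxid) + '_' +
                 sra)
    if layout.lower() == 'paired' and spots_with_mates == 0:
        layout = 'SINGLE'       # no mates available; treat as single-end
    elif layout.lower() == 'paired' and spots != spots_with_mates:
        layout = 'PAIRED_UNP'   # some reads in the paired run are unpaired
    return base_name, layout
# e.g. _example_kakapo_layout('Hylocereus polyrhizus', 1195597, 'SRR7829961',
#                             'PAIRED', 1000, 900)
#      -> ('Hylocereus_polyrhizus_1195597_SRR7829961', 'PAIRED_UNP')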
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:
                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg('SE mode (Paired-read SRA run contains unpaired '
                        'reads):', pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                _ = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(_, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()

                remove(out_fs[2])
                remove(out_f)
                copyfile(_, out_fs[2])
                remove(_)
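# Illustration only: run_trimmomatic() above builds its paired-end output
# paths by substituting '@D@' (output directory) and '@N@' (sample name) in
# the file-name patterns passed in as 'fpatt'. The actual patterns are
# defined elsewhere in the pipeline; the one in the usage note below is made
# up for this sketch, and the helper itself is never called.
def _example_expand_fpatt(fpatt, out_dir, sample, ext=''):
    """Expand @D@/@N@ placeholders, mirroring the substitution above."""
    out_fs = [x.replace('@D@', out_dir) for x in fpatt]
    out_fs = [x.replace('@N@', sample) for x in out_fs]
    return [x + ext for x in out_fs]
# e.g. _example_expand_fpatt(['@D@/@N@_paired_1.fastq'], 'trim_data', 'S47')
#      -> ['trim_data/S47_paired_1.fastq']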
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn,
                              dir_prj_assmbl_blast_results, blast_2_evalue,
                              blast_2_max_hsps, blast_2_qcov_hsp_perc,
                              blast_2_best_hit_overhang,
                              blast_2_best_hit_score_edge,
                              blast_2_max_target_seqs, threads,
                              dir_cache_prj, dir_prj_ips):

    if len(assemblies) > 0:
        print()
        Log.inf('Running BLAST on assemblies:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        Log.wrn('There are no assemblies. Nothing to do, stopping.')
        exit(0)

    cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss)

    pickled = dict()
    settings = {'blast_2_evalue': blast_2_evalue,
                'blast_2_max_hsps': blast_2_max_hsps,
                'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
                'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
                'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
                'blast_2_max_target_seqs': blast_2_max_target_seqs,
                'queries': seq_records_to_dict(
                    read_fasta(aa_queries_file, SEQ_TYPE_AA))}

    Log.msg('evalue:', str(blast_2_evalue))
    Log.msg('max_hsps:', str(blast_2_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_2_max_target_seqs))
    print()

    for a in assemblies:

        assmbl_src = a['src']
        assmbl_name = a['name']

        if assmbl_src != 'user_fasta':
            if assmbl_name.endswith('__' + ss):
                assmbl_name = assmbl_name.replace('__' + ss, '')
            else:
                continue

        assmbl_blast_db_path = a['blast_db_path']
        assmbl_genetic_code = a['gc_id']

        ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' +
                                 ss + '.json')

        _ = opj(dir_prj_assmbl_blast_results,
                assmbl_name + '__' + ss + '.tsv')

        if ope(_) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(_) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', assmbl_name)

        else:
            Log.msg('Running tblastn on: ' + assmbl_name, ss)

            if ope(ips_json_dump_path):
                osremove(ips_json_dump_path)

            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=assmbl_blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=_,
                      evalue=blast_2_evalue,
                      max_hsps=blast_2_max_hsps,
                      qcov_hsp_perc=blast_2_qcov_hsp_perc,
                      best_hit_overhang=blast_2_best_hit_overhang,
                      best_hit_score_edge=blast_2_best_hit_score_edge,
                      max_target_seqs=blast_2_max_target_seqs,
                      db_genetic_code=assmbl_genetic_code,
                      out_cols=BLST_RES_COLS_2)

        a['blast_hits_aa__' + ss] = parse_blast_results_file(
            _, BLST_RES_COLS_2)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)
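# Minimal sketch (illustration only, hypothetical helper): run_tblastn_on_
# assemblies() above reruns BLAST only when the results file is missing or
# the pickled settings from the previous run, which include the query
# sequences themselves, differ from the current ones. The check below mirrors
# that logic, reusing the module's pickle and ope imports.
def _example_blast_cache_is_valid(settings, results_file, cache_file):
    """True when cached settings match and the results file already exists."""
    if ope(results_file) and ope(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f) == settings
    return False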
def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen):
    # lowest allowable
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
    else:
        return None

    __ = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}
    if ope(__):
        with open(__, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue
        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue
        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]
        else:
            ml = avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml
        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

    with open(__, 'wb') as f:
        pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)