def MergeReads(R1, R2, outname, read_length):
    """Truncate, merge and concatenate one pair of FASTQ read files.

    Both inputs are run through ``vsearch --fastq_filter`` with
    ``--fastq_trunclen read_length``, the truncated files are merged with
    USEARCH ``-fastq_mergepairs``, and the merged reads are written to
    ``os.path.join(args.out, outname + '.fq')``.  When
    ``args.rescue_forward == 'on'`` the forward reads that did not merge are
    appended to the same file.  All intermediate files are removed before
    returning.

    Uses the module-level ``args`` (``usearch``, ``out``, ``rescue_forward``)
    and ``ufitslib`` (logging, subprocess execution, read counting).

    Args:
        R1: path to the forward-read FASTQ file.
        R2: path to the reverse-read FASTQ file.
        outname: base name for the intermediate files and the final output.
        read_length: value passed to ``--fastq_trunclen``.

    Returns:
        The return value of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    # NOTE(review): the vsearch manual says --fastq_trunclen discards reads
    # shorter than the requested length; confirm R1/R2 stay in sync for
    # inputs whose reads are not all at least read_length long.
    for reads, trimmed in ((R1, pretrim_R1), (R2, pretrim_R2)):
        cmd = ['vsearch', '--fastq_filter', reads,
               '--fastq_trunclen', str(read_length),
               '--fastqout', trimmed]
        ufitslib.runSubprocess(cmd, ufitslib.log)

    # next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    # Merge the truncated files produced above.  The previous code passed the
    # names for_reads/rev_reads, which are not defined in this function, so
    # the trimming step had no effect on the merge.
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1,
           '-reverse', pretrim_R2,
           '-fastqout', merge_out,
           '-fastqout_notmerged_fwd', skip_for,
           '-minhsp', '12',
           '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    # now concatenate files for downstream pre-process_illumina.py script
    final_out = os.path.join(args.out, outname + '.fq')
    # Plain 'r' is accepted by every Python version; the 'U' flag used before
    # was removed in Python 3.11.  Context managers close the inputs promptly.
    with open(final_out, 'w') as cat_file:
        with open(merge_out, 'r') as merged:
            shutil.copyfileobj(merged, cat_file)
        if args.rescue_forward == 'on':
            with open(skip_for, 'r') as unmerged:
                shutil.copyfileobj(unmerged, cat_file)

    # count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    # An empty input file would otherwise raise ZeroDivisionError.
    pct_out = finalcount / float(origcount) if origcount else 0.0

    # clean and close up intermediate files
    for leftover in (merge_out, pretrim_R1, pretrim_R2, skip_for):
        os.remove(leftover)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' + '{0:.1%}'.format(pct_out) + ')')
def MergeReads(R1, R2, outname, read_length):
    """Truncate, merge and concatenate one pair of FASTQ read files.

    Both inputs are run through ``vsearch --fastq_filter`` with
    ``--fastq_trunclen read_length``, the truncated files are merged with
    USEARCH ``-fastq_mergepairs``, and the merged reads are written to
    ``os.path.join(args.out, outname + '.fq')``.  When
    ``args.rescue_forward == 'on'`` the forward reads that did not merge are
    appended to the same file.  All intermediate files are removed before
    returning.

    Uses the module-level ``args`` (``usearch``, ``out``, ``rescue_forward``)
    and ``ufitslib`` (logging, subprocess execution, read counting).

    Args:
        R1: path to the forward-read FASTQ file.
        R2: path to the reverse-read FASTQ file.
        outname: base name for the intermediate files and the final output.
        read_length: value passed to ``--fastq_trunclen``.

    Returns:
        The return value of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    # NOTE(review): the vsearch manual says --fastq_trunclen discards reads
    # shorter than the requested length; confirm R1/R2 stay in sync for
    # inputs whose reads are not all at least read_length long.
    for reads, trimmed in ((R1, pretrim_R1), (R2, pretrim_R2)):
        cmd = ['vsearch', '--fastq_filter', reads,
               '--fastq_trunclen', str(read_length),
               '--fastqout', trimmed]
        ufitslib.runSubprocess(cmd, ufitslib.log)

    # next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    # Merge the truncated files produced above.  The previous code passed the
    # names for_reads/rev_reads, which are not defined in this function, so
    # the trimming step had no effect on the merge.
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1,
           '-reverse', pretrim_R2,
           '-fastqout', merge_out,
           '-fastqout_notmerged_fwd', skip_for,
           '-minhsp', '12',
           '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    # now concatenate files for downstream pre-process_illumina.py script
    final_out = os.path.join(args.out, outname + '.fq')
    # Plain 'r' is accepted by every Python version; the 'U' flag used before
    # was removed in Python 3.11.  Context managers close the inputs promptly.
    with open(final_out, 'w') as cat_file:
        with open(merge_out, 'r') as merged:
            shutil.copyfileobj(merged, cat_file)
        if args.rescue_forward == 'on':
            with open(skip_for, 'r') as unmerged:
                shutil.copyfileobj(unmerged, cat_file)

    # count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    # An empty input file would otherwise raise ZeroDivisionError.
    pct_out = finalcount / float(origcount) if origcount else 0.0

    # clean and close up intermediate files
    for leftover in (merge_out, pretrim_R1, pretrim_R2, skip_for):
        os.remove(leftover)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' + '{0:.1%}'.format(pct_out) + ')')
# Report how many reads the input FASTQ holds and how large the file is.
orig_total = ufitslib.countfastq(args.FASTQ)
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#Expected Errors filtering step and convert to fasta
# Shared "<out>.EE<maxee>" stem for the intermediates written under tmp.
ee_stem = args.out + '.EE' + args.maxee
filter_out = os.path.join(tmp, ee_stem + '.filter.fq')
filter_fasta = os.path.join(tmp, ee_stem + '.filter.fa')
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)

# Both vsearch invocations filter the same input file.
vsearch_filter = ['vsearch', '--fastq_filter', args.FASTQ]

# First pass: apply the max expected-error cutoff, emit FASTQ and FASTA.
cmd = vsearch_filter + [
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)

# Second pass: no expected-error cutoff, FASTA output only.
cmd = vsearch_filter + [
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)

total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run full length dereplication
derep_out = os.path.join(tmp, ee_stem + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = [
    'vsearch',
    '--derep_fulllength', filter_out,
    '--relabel', 'Read_',
    '--sizeout',
    '--output', derep_out,
]
if not args.taxonomy: #start with less common uses, i.e. Blast, rdp if args.method == 'blast': #check if command line blast installed if not ufitslib.which('blastn'): ufitslib.log.error("BLASTN not found in your PATH, exiting.") sys.exit(1) #now run blast remotely using NCBI nt database outformat = "6 qseqid sseqid pident stitle" if args.local_blast: #get number of cpus cpus = multiprocessing.cpu_count() - 2 ufitslib.log.info("Running local BLAST using db: %s" % args.local_blast) cmd = ['blastn', '-num_threads', str(cpus), '-query', args.fasta, '-db', os.path.abspath(args.local_blast), '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out] ufitslib.runSubprocess(cmd, ufitslib.log) else: ufitslib.log.info("Running BLASTN using NCBI remote nt database, this may take awhile") cmd = ['blastn', '-query', args.fasta, '-db', 'nt', '-remote', '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out] ufitslib.runSubprocess(cmd, ufitslib.log) #load results and reformat new = [] f = csv.reader(open(blast_out), delimiter='\t') for col in f: query = col[0] gbID = col[1].split("|")[3] pident = col[2] name = col[3] tax = gbID + ";" + name + " (" + pident + ")" line = [query,tax]