'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55' ] ufitslib.runSubprocess(cmd, ufitslib.log) qtrimtotal = ufitslib.countfastq(filter_out) ufitslib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed') #now run full length dereplication derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa') ufitslib.log.info("De-replication (remove duplicate reads)") cmd = [ 'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out ] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(derep_out) ufitslib.log.info('{0:,}'.format(total) + ' reads passed') #now run sort by size sort_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.sort.fa') ufitslib.log.info( "Sorting reads by size: removing reads seen less than %s times" % args.minsize) cmd = [ 'vsearch', '--sortbysize', derep_out, '--minsize', args.minsize, '--output', sort_out ] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(sort_out) ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55' ] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfastq(filter_out) ufitslib.log.info('{0:,}'.format(total) + ' reads passed') #now run full length dereplication derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa') ufitslib.log.info("De-replication (remove duplicate reads)") cmd = [ 'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_', '--sizeout', '--output', derep_out ] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(derep_out) ufitslib.log.info('{0:,}'.format(total) + ' reads passed') #now run de-noiser UNOISE2 ufitslib.log.info("Denoising reads with UNOISE2") unoise_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.unoise.fa') cmd = [ usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '--minampsize', args.minampout ] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(unoise_out) ufitslib.log.info('{0:,}'.format(total) + ' denoised sequences') #now cluster to biological OTUs with UCLUST radius = float(args.pct_otu) / 100.
if not utax_db: utax_db = args.utax_db if not usearch_db: usearch_db = args.usearch_db else: utax_db = args.utax_db usearch_db = args.usearch_db if args.method in ['hybrid', 'usearch', 'utax']: if not utax_db and not usearch_db and not args.fasta_db: ufitslib.log.error("You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db") sys.exit(1) #Count records ufitslib.log.info("Loading FASTA Records") total = ufitslib.countfasta(args.fasta) ufitslib.log.info('{0:,}'.format(total) + ' OTUs') #declare output files/variables here blast_out = base + '.blast.txt' rdp_out = base + '.rdp.txt' utax_out = base + '.usearch.txt' usearch_out = base + '.usearch.txt' sintax_out = base + '.sintax.txt' if not args.taxonomy: #start with less common uses, i.e. Blast, rdp if args.method == 'blast': #check if command line blast installed if not ufitslib.which('blastn'): ufitslib.log.error("BLASTN not found in your PATH, exiting.")
) sys.exit(1) #get default mock community value if args.mc == "mock3": mock = os.path.join(parentdir, 'DB', 'ufits_mock3.fa') elif args.mc == "mock2": mock = os.path.join(parentdir, 'DB', 'ufits_mock2.fa') elif args.mc == "mock1": mock = os.path.join(parentdir, 'DB', 'ufits_mock1.fa') elif args.mc == "synmock": mock = os.path.join(parentdir, 'DB', 'ufits_synmock.fa') else: mock = os.path.abspath(args.mc) #open mock community fasta and count records mock_ref_count = ufitslib.countfasta(mock) #map OTUs to mock community ufitslib.log.info("Mapping OTUs to Mock Community (USEARCH)") cmd = [ usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.95', '-db', args.fasta, '-uc', mock_out, '-maxaccepts', '3' ] ufitslib.runSubprocess(cmd, ufitslib.log) #sort the output to avoid problems with open(mock_sort, 'w') as output: subprocess.call(['sort', '-k4,4nr', mock_out], stdout=output) #generate dictionary for name change found_dict = {} missing = []
uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref+'.extracted.fa') if not os.path.isfile(uchime_db): ufitslib.log.error("Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering") uchime_out = fastaout else: if os.path.isfile(args.uchime_ref): uchime_db = os.path.abspath(args.uchime_ref) else: ufitslib.log.error("%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref) uchime_out = fastaout #now run chimera filtering if all checks out if not os.path.isfile(uchime_out): ufitslib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref) cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', uchime_out] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(uchime_out) uchime_chimeras = validSeqs - total ufitslib.log.info('{0:,}'.format(total) + ' iSeqs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed') #now reformat OTUs and OTU table, dropping chimeric OTUs from table, sorting the output as well nonchimeras = ufitslib.fasta2list(uchime_out) inferredSeqs = SeqIO.index(uchime_out, 'fasta') with open(iSeqs, 'w') as iSeqout: for x in natsorted(nonchimeras): SeqIO.write(inferredSeqs[x], iSeqout, 'fasta') if not args.debug: #clean up chimeras fasta ufitslib.removefile(uchime_out) if os.path.isfile(fastaout): ufitslib.removefile(fastaout)
# Quality filter -> dereplicate -> denoise (UNOISE2) -> prepare UCLUST clustering.
# All intermediate files are written under `tmp` and share the
# "<out>.EE<maxee>" prefix.
#
# Command-line values are normalised with str() once, up front: the paths below
# are built by string concatenation and subprocess argument lists must contain
# only strings, so a numeric argparse value would otherwise raise TypeError.
maxee = str(args.maxee)
minampout = str(args.minampout)
ee_prefix = os.path.join(tmp, args.out + '.EE' + maxee)

filter_fasta = ee_prefix + '.filter.fa'
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')

ufitslib.log.info("Quality Filtering, expected errors < %s" % maxee)
# pass 1: apply the expected-error cutoff, writing both FASTQ and FASTA output
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastq_maxee', maxee,
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
# pass 2: same input, but no expected-error cutoff is passed; FASTA output only
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run full length dereplication
derep_out = ee_prefix + '.derep.fa'
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = [
    'vsearch', '--derep_fulllength', filter_out,
    '--relabel', 'Read_',
    '--sizeout',
    '--output', derep_out,
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run de-noiser UNOISE2
ufitslib.log.info("Denoising reads with UNOISE2")
unoise_out = ee_prefix + '.unoise.fa'
cmd = [
    usearch, '-unoise2', derep_out,
    '-fastaout', unoise_out,
    '--minampsize', minampout,
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(unoise_out)
ufitslib.log.info('{0:,}'.format(total) + ' denoised sequences')

#now cluster to biological OTUs with UCLUST
# the percent identity given on the command line is converted to a 0-1 fraction
radius = float(args.pct_otu) / 100.
ufitslib.log.info("Clustering denoised sequences into OTUs at %s%%" % args.pct_otu)
uclust_out = ee_prefix + '.uclust.fa'
cmd = [
    usearch, '-cluster_smallmem', unoise_out,
    '-id', str(radius),
    '-centroids', uclust_out,
    '-relabel', 'OTU',
]
ufitslib.log.error("If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option") sys.exit(1) #get default mock community value if args.mc == "mock3": mock = os.path.join(parentdir, 'DB', 'ufits_mock3.fa') elif args.mc == "mock2": mock = os.path.join(parentdir, 'DB', 'ufits_mock2.fa') elif args.mc == "mock1": mock = os.path.join(parentdir, 'DB', 'ufits_mock1.fa') elif args.mc == "synmock": mock = os.path.join(parentdir, 'DB', 'ufits_synmock.fa') else: mock = os.path.abspath(args.mc) #open mock community fasta and count records mock_ref_count = ufitslib.countfasta(mock) #map OTUs to mock community ufitslib.log.info("Mapping OTUs to Mock Community (USEARCH)") cmd = [usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.95', '-db', args.fasta, '-uc', mock_out, '-maxaccepts', '3'] ufitslib.runSubprocess(cmd, ufitslib.log) #sort the output to avoid problems with open(mock_sort, 'w') as output: subprocess.call(['sort', '-k4,4nr', mock_out], stdout = output) #generate dictionary for name change found_dict = {} missing = [] chimeras = [] seen = [] with open(mock_sort, 'rU') as map:
# Quality filter -> dereplicate -> sort by abundance -> prepare de novo chimera
# detection. All intermediate files are written under `tmp` and share the
# "<out>.EE<maxee>" prefix.
#
# Command-line values are normalised with str() once, up front: the paths below
# are built by string concatenation and subprocess argument lists must contain
# only strings, so a numeric argparse value would otherwise raise TypeError.
maxee = str(args.maxee)
minsize = str(args.minsize)
ee_prefix = os.path.join(tmp, args.out + '.EE' + maxee)

filter_fasta = ee_prefix + '.filter.fa'
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')

ufitslib.log.info("Quality Filtering, expected errors < %s" % maxee)
# pass 1: apply the expected-error cutoff, writing both FASTQ and FASTA output
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastq_maxee', maxee,
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
# pass 2: same input, but no expected-error cutoff is passed; FASTA output only
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
ufitslib.runSubprocess(cmd, ufitslib.log)
qtrimtotal = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

#now run full length dereplication
derep_out = ee_prefix + '.derep.fa'
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = [
    'vsearch', '--derep_fulllength', filter_fasta,
    '--sizeout',
    '--output', derep_out,
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run sort by size
sort_out = ee_prefix + '.sort.fa'
ufitslib.log.info("Sorting reads by size: removing reads seen less than %s times" % minsize)
cmd = [
    'vsearch', '--sortbysize', derep_out,
    '--minsize', minsize,
    '--output', sort_out,
]
ufitslib.runSubprocess(cmd, ufitslib.log)
total = ufitslib.countfasta(sort_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#chimera detection
#first run through de novo chimera detection
ufitslib.log.info("De novo chimera detection (VSEARCH)")
chimera_out = ee_prefix + '.chimera_check.fa'
cmd = [
    'vsearch', '--uchime_denovo', sort_out,
    '--relabel', 'Seq',
    '--sizeout',
    '--nonchimeras', chimera_out,
]
filter_fasta, "--fastq_qmax", "55", ] ufitslib.runSubprocess(cmd, ufitslib.log) cmd = ["vsearch", "--fastq_filter", args.FASTQ, "--fastaout", orig_fasta, "--fastq_qmax", "55"] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfastq(filter_out) ufitslib.log.info("{0:,}".format(total) + " reads passed") # now run full length dereplication derep_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".derep.fa") ufitslib.log.info("De-replication (remove duplicate reads)") cmd = ["vsearch", "--derep_fulllength", filter_fasta, "--sizeout", "--output", derep_out] ufitslib.runSubprocess(cmd, ufitslib.log) total = ufitslib.countfasta(derep_out) ufitslib.log.info("{0:,}".format(total) + " reads passed") # optional run UNOISE if args.unoise: unoise_out = unoise_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".denoised.fa") ufitslib.log.info("Denoising Data with UNOISE") cmd = [ usearch, "-cluster_fast", derep_out, "-centroids", unoise_out, "-id", "0.9", "--maxdiffs",