Example #1
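# Assumed module context for this snippet: the helpers MyFormatter and
# restricted_float, and the colr color constants, are defined elsewhere in the
# amptk package; the imports below are a minimal sketch of what main() uses.
import argparse
import os
import shutil
import sys

from Bio import SeqIO

from amptk import amptklib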
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min identical seqs to process')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument(
        '-d',
        '--db',
        required=True,
        help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level',
                        default='k',
                        choices=['k', 'p', 'c', 'o', 'f', 'g', 's'],
                        help='UTAX classification level to retain')
    parser.add_argument('--mock',
                        default='synmock',
                        help='Spike-in mock community (fasta)')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--closed_ref_only',
                        action='store_true',
                        help='Only run closed reference clustering')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Species'
    }

    #remove logfile if exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'))
    }
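    # Each DataBase entry is (global-alignment UDB, UTAX UDB); note that '16S'
    # reuses the same UDB for both slots.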

    #setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        #need to write to fasta from vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = [
            'vsearch', '--udb2fasta',
            DataBase.get(args.db)[0], '--output', DB
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)
    refDB = os.path.join(tmp, 'reference_DB.fa')
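    # Merge the optional mock spike-in plus the reference DB into a single
    # FASTA, refusing duplicate record IDs along the way.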
    if args.mock:
        if args.mock == 'synmock':
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mock)
    seen = set()
    with open(refDB, 'w') as output:
        if args.mock:
            with open(mock) as input1:
                for rec in SeqIO.parse(input1, 'fasta'):
                    if rec.id not in seen:
                        SeqIO.write(rec, output, 'fasta')
                        seen.add(rec.id)
                    else:
                        amptklib.log.error(
                            "Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
        with open(DB) as input2:
            for rec in SeqIO.parse(input2, 'fasta'):
                if rec.id not in seen:
                    SeqIO.write(rec, output, 'fasta')
                    seen.add(rec.id)
                else:
                    amptklib.log.error(
                        "Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                    sys.exit(1)

    #get utax_database
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    else:
        if not args.closed_ref_only:
            if args.utax_db:
                utaxDB = os.path.abspath(args.utax_db)
            else:
                amptklib.log.error(
                    "%s not pre-installed DB, must then also specify valid UTAX database via --utax_db"
                    % args.db)
                sys.exit(1)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout',
        '--output', derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info(
        "Sorting reads by size: removing reads seen less than %s times" %
        args.minsize)
    cmd = [
        'vsearch', '--sortbysize', derep_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #chimera detection
    #first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(tmp,
                               base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = [
        'vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq',
        '--sizeout', '--nonchimeras', chimera_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run uchime_ref
    uchime_out = os.path.join(tmp,
                              base + '.EE' + args.maxee + '.uchime.otus.fa')
    #now run chimera filtering if all checks out
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = [
        'vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out, '--db',
        refDB, '--sizeout', '--nonchimeras', uchime_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #now run usearch_global versus reference database
    align_out = os.path.join(tmp, base + '.align.uc')
    pident = int(args.id) * 0.01
    amptklib.log.info(
        "Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = [
        'vsearch', '--usearch_global', uchime_out, '--db', refDB, '--id',
        str(pident), '--output_no_hits', '--top_hits_only', '--notrunclabels',
        '--uc', align_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #parse results
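    # vsearch .uc columns used below: col[3] = percent identity ('*' for N/no-hit
    # records), col[8] = query label (carries ;size=N), col[9] = target label.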
    ref_results = {}
    nohits = []
    with open(align_out, 'r') as alignment:
        for line in alignment:
            line = line.replace('\n', '')
            col = line.split('\t')
            counts = col[8].split(';')
            counts = int(counts[1].replace('size=', ''))
            if col[3] == '*':
                nohits.append(col[8])
                continue
            if float(col[3]) >= float(args.id):
                if not col[8] in ref_results:
                    ref_results[col[8]] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % col[8])
            else:
                nohits.append(col[8])

    #summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = 0
    for k, v in list(ref_results.items()):
        seqs_refcluster += v[2]
    amptklib.log.info("%i OTUs classified " % num_refcluster +
                      "({0:.0f}%".format(seqs_refcluster / float(qtrimtotal) *
                                         100) + " of reads)")

    #get ref clustered hits to file with taxonomy
    ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa')
    with open(ref_clustered, 'w') as refoutput:
        with open(uchime_out, 'r') as input:
            otu_counter = 1
            for rec in SeqIO.parse(input, 'fasta'):
                if rec.id in ref_results:
                    res = ref_results.get(rec.id)
                    pident = res[1]
                    tax = res[0]
                    newID = 'OTU' + str(
                        otu_counter) + ';pident=' + pident + ';' + tax
                    rec.id = newID
                    rec.name = ''
                    rec.description = ''
                    SeqIO.write(rec, refoutput, 'fasta')
                    otu_counter += 1

    if not args.closed_ref_only:
        #get nohits file to run clustering
        utax_ref = os.path.join(tmp,
                                base + '.EE' + args.maxee + '.utax_ref.fa')
        with open(utax_ref, 'w') as output:
            with open(uchime_out, 'r') as input:
                for rec in SeqIO.parse(input, 'fasta'):
                    if rec.id in nohits:
                        SeqIO.write(rec, output, 'fasta')

        #input needs to be sorted, so
        ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa')
        cmd = [
            'vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize,
            '--output', ref_sort, '--threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)

        #now run clustering algorithm on those not found in reference database
        radius = str(100 - int(args.pct_otu))
        otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
        amptklib.log.info("De novo Clustering remaining sequences (UPARSE)")
        cmd = [
            usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU',
            '-otu_radius_pct', radius, '-otus', otu_out
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(otu_out)
        amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs')

        #try utax reference clustering
        amptklib.log.info("Reference Clustering de novo OTUs using UTAX")
        cmd = [
            usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB,
            '-utax_cutoff',
            str(args.utax_cutoff), '-utax_level', 's', '-strand', 'plus',
            '-utaxout',
            os.path.join(tmp, base + '.utax.out')
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        #setup tax filtering
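        # e.g. --utax_level g keeps only OTUs classified to genus or below: the
        # UTAX string must contain one of the prefixes 'g:' or 's:'.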
        tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's']
        filter_index = tax_values.index(args.utax_level)
        filt_tax_values = [s + ':' for s in tax_values[filter_index:]]
        #get results from utax
        with open(ref_clustered, 'a') as output:
            seqDict = SeqIO.index(otu_out, 'fasta')
            utaxresults = []
            with open(os.path.join(tmp, base + '.utax.out'), 'r') as utax:
                for line in utax:
                    line = line.replace('\n', '')
                    col = line.split('\t')
                    ID = col[0]
                    tax = col[2]
                    if any(x in tax for x in filt_tax_values):
                        record = seqDict[ID]
                        record.id = 'OTU' + str(
                            otu_counter) + ';UTAX;tax=' + tax
                        record.name = ''
                        record.description = ''
                        SeqIO.write(record, output, 'fasta')
                        otu_counter += 1
        total = amptklib.countfasta(ref_clustered) - num_refcluster
        amptklib.log.info('{0:,}'.format(total) + ' classified to %s' %
                          taxonomyLookup.get(args.utax_level))

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.clean.otus.fa')
    amptklib.fasta_strip_padding(ref_clustered, otu_clean)
    total = amptklib.countfasta(otu_clean)
    amptklib.log.info('{0:,}'.format(total) + ' total OTUs')

    #now map reads back to OTUs
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(otu_clean, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)

    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Example #2
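# Assumed module context for this snippet: MyFormatter and restricted_float
# are helpers defined elsewhere in the amptk package; the imports below are a
# minimal sketch of what main() uses.
import argparse
import csv
import os
import re
import shutil
import subprocess
import sys

from Bio import SeqIO
from natsort import natsorted

from amptk import amptklib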
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--otu_table',
                        dest="otu_table",
                        help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method',
        default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d',
        '--db',
        help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t',
        '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere, 2 column file')
    parser.add_argument('--fasta_db',
                        help='Alternative database of fasta sequences')
    parser.add_argument('--add2db',
                        help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff',
                        default=0.7,
                        type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r',
        '--rdp',
        dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db',
                        dest='rdp_tax',
                        default='fungalits_unite',
                        choices=[
                            '16srrna', 'fungallsu', 'fungalits_warcup',
                            'fungalits_unite'
                        ],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--tax_filter',
                        help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='SINTAX threshold.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    if not args.out:
        #get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }
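    # Each DataBase entry is (global-alignment UDB, UTAX UDB, SINTAX UDB),
    # indexed as [0], [1], [2] below.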

    #get DB names up front
    if args.db in DataBase:
        utax_db = DataBase.get(args.db)[1]
        usearch_db = DataBase.get(args.db)[0]
        sintax_db = DataBase.get(args.db)[2]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax', 'sintax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        else:  #check that the DB exists
            if args.method == 'usearch' and usearch_db:
                if not amptklib.checkfile(usearch_db):
                    amptklib.log.error(
                        'USEARCH DB not found: {:}'.format(usearch_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'sintax' and sintax_db:
                if not amptklib.checkfile(sintax_db):
                    amptklib.log.error(
                        'SINTAX DB not found: {:}'.format(sintax_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'utax' and utax_db:
                if not amptklib.checkfile(utax_db):
                    amptklib.log.error(
                        'UTAX DB not found: {:}'.format(utax_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)

    custom_db = None
    if args.add2db:  #user wants to add sequences to the usearch database on the fly, so rebuild the database
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  #this means that the fasta files need to be extracted
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    #Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    #declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    utax_out = base + '.utax.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        #start with less common uses, i.e. Blast, rdp
        if args.method == 'blast':
            #check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)

            #now run blast remotely using NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
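            # custom BLAST tabular output (outfmt 6): query id, subject id,
            # percent identity, subject title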
            if args.local_blast:
                #get number of cpus
                amptklib.log.info("Running local BLAST using db: %s" %
                                  args.local_blast)
                cmd = [
                    'blastn', '-num_threads',
                    str(cpus), '-query', args.fasta, '-db',
                    os.path.abspath(args.local_blast), '-max_target_seqs', '1',
                    '-outfmt', outformat, '-out', blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take awhile"
                )
                cmd = [
                    'blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #load results and reformat
            new = []
            f = csv.reader(open(blast_out), delimiter=str('\t'))
            for col in f:
                query = col[0]
                gbID = col[1].split("|")[3]
                pident = col[2]
                name = col[3]
                tax = gbID + ";" + name + " (" + pident + ")"
                line = [query, tax]
                new.append(line)
            otuDict = dict(new)
        elif args.method == 'rdp':
            #check that classifier is installed
            try:
                rdp_test = subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()[0].rstrip()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." %
                                   args.rdp)
                sys.exit(1)

            #RDP database
            amptklib.log.info("Using RDP classifier %s training set" %
                              args.rdp_tax)

            #run RDP
            cmd = [
                'java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g',
                args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta
            ]
            amptklib.runSubprocess(cmd, amptklib.log)

            #load in results and put into dictionary
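            # RDP fixrank rows repeat (taxon, rank, confidence) triplets per rank;
            # confidences land in columns 4,7,10,13,16,19, checked deepest-first so
            # the taxonomy string stops at the last rank above the cutoff.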
            new = []
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            f = csv.reader(open(rdp_out), delimiter=str('\t'))
            for col in f:
                if float(col[19]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17]
                elif float(col[16]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14]
                elif float(col[13]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11]
                elif float(col[10]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                elif float(col[7]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5]
                elif float(col[4]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2]
                else:
                    tax = "RDP;k:unclassified"
                tax_split = tax.split(",")
                tax = [
                    s for s in tax_split
                    if not any(exp.search(s) for exp in remove_exp)
                ]
                tax = ",".join(tax)
                line = [col[0], tax]
                new.append(line)
            otuDict = dict(new)
        else:
            #check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                if args.fasta_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(args.fasta_db)))
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(args.fasta_db), '--userout',
                        usearch_out, '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                elif custom_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against custom DB"
                    )
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(custom_db), '--userout', usearch_out,
                        '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    if usearch_db:
                        amptklib.log.info(
                            "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                            .format(os.path.basename(usearch_db)))
                        cmd = [
                            'vsearch', '--usearch_global', args.fasta, '--db',
                            os.path.abspath(usearch_db), '--userout',
                            usearch_out, '--id',
                            str(args.usearch_cutoff), '--strand', 'both',
                            '--output_no_hits', '--maxaccepts', '0',
                            '--top_hits_only', '--userfields',
                            'query+target+id', '--notrunclabels', '--threads',
                            str(cpus)
                        ]
                        amptklib.runSubprocess(cmd, amptklib.log)

            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    #now run through UTAX
                    utax_out = base + '.utax.txt'
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [
                        usearch, '-utax', args.fasta, '-db', utax_db,
                        '-utaxout', utax_out, '-utax_cutoff', cutoff,
                        '-strand', 'plus', '-notrunclabels', '-threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" %
                                       utax_db)

            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:  #a fasta file passed here overrides any auto-detection
                    sintax_db = args.fasta_db
                #now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [
                    usearch, '-sintax', args.fasta, '-db',
                    os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                    '-sintax_cutoff',
                    str(args.sintax_cutoff), '-strand', 'both', '-threads',
                    str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #now process results, load into dictionary - slightly different depending on which classification was run.
            if args.method == 'hybrid':
                #run upgraded method, first load dictionaries with results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                #first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict,
                                                       otuList)
                #now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.error('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)

            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}

            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax

            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        #you have supplied a two column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    #now format results
    if args.otu_table:
        #load in the OTU table and append a Taxonomy column
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'

        #append to OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                #guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                #parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    if line[0].startswith(("#OTU", "OTUId")):
                        line.append('Taxonomy')
                    else:
                        tax = otuDict.get(line[0]) or "No Hit"
                        line.append(tax)
                    if args.tax_filter and not args.method == 'blast':
                        if line[0].startswith(("#OTU", "OTUId")):
                            join_line = ('\t'.join(str(x) for x in line))
                        else:
                            if args.tax_filter in line[-1]:
                                join_line = ('\t'.join(str(x) for x in line))
                                counts += 1
                            else:
                                continue
                    else:
                        join_line = ('\t'.join(str(x) for x in line))
                        counts += 1
                    outTable.write("%s\n" % join_line)

        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                #need to create a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as input:
                        firstline = input.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        input.seek(0)
                        #parse OTU table
                        reader = csv.reader(input, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    #append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as input:
            SeqRecords = SeqIO.parse(input, 'fasta')
            for rec in SeqRecords:
                tax = otuDict.get(rec.id) or "No hit"
                rec.description = tax
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        #output final taxonomy in two-column format, followed by the hits for usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = usearchDict.get(k)
                        usearchResult = ','.join(usearchResult[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = sintaxDict.get(k)
                        sintaxResult = ','.join(sintaxResult[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = utaxDict.get(k)
                        utaxResult = ','.join(utaxResult[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy
    #convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    #create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    #print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        #output final OTU table in Biom v1.0 (i.e. json format if biom installed)
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = [
                'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
                '--table-type', "OTU table", '--to-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if not sampleID in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = []
                for otu in otuSamples:
                    if not otu in mapSamples:
                        missingMap.append(otu)
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '--observation-metadata-fp',
                            qiimeTax, '-m', args.mapping_file,
                            '--sc-separated', 'taxonomy', '--output-as-json'
                        ]
                    else:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '-m', args.mapping_file,
                            '--output-as-json'
                        ]
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = [
                    'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o',
                    outBiom, '--observation-metadata-fp', qiimeTax,
                    '--sc-separated', 'taxonomy', '--output-as-json'
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    #clean up intermediate files
    if not args.debug:
        for i in [
                utax_out, usearch_out, sintax_out, qiimeTax,
                base + '.otu_table.tmp'
        ]:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
Example #3
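# Assumed module context for this snippet: MyFormatter and the splitDemux2
# helper are defined elsewhere in the amptk package; the imports below are a
# minimal sketch of what main() uses.
import argparse
import os
import shutil
import subprocess
import sys

from amptk import amptklib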
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description=
        '''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        required=True,
                        help='Input demultiplexed FASTQ file')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument(
        '-m',
        '--min_reads',
        default=10,
        type=int,
        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l',
                        '--length',
                        type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform',
                        default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method',
                        default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool',
                        action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    R_pass = '******'
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        fastqInput = os.path.basename(args.fastq.replace('.gz', ''))
        amptklib.Funzip(os.path.abspath(args.fastq), fastqInput, CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(fastqInput, no_ns)
    demuxtmp = base + '.original.fa'
    cmd = [
        'vsearch', '--fastq_filter',
        os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = [
        'vsearch', '--fastq_filter', no_ns, '--fastq_maxee',
        str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55',
        '--fastq_maxns', '0', '--threads', CORES
    ]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooling vs notpooled, default is not pooled.
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call([
            'Rscript', '--vanilla', dada2script, filtfolder, dada2out,
            args.platform, POOL, CORES, args.chimera_method
        ],
                        stdout=logfile,
                        stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" %
                           dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as csvin:
            next(csvin)  #skip the sample-name header row
            for line in csvin:
                line = line.rstrip('\n').replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if ID not in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile; DADA2 logs a line like
    #'Identified 12 bimeras out of 3456 input sequences.'
    bimeras, totalSeqs = 0, 0
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is set up, otherwise assume a full path was given
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to set up the DB, skipping chimera filtering"
                )
                iSeqs = fastaout
            #uchime cannot work with a udb database, so extract fasta sequences; do this only if not already extracted
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                iSeqs = fastaout
        #now run chimera filtering if everything checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
                uchime_db, '--nonchimeras', iSeqs, '--threads', CORES
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
        '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
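    #note: despite the name, 'radius' here is the identity fraction handed to vsearch --id (e.g. 0.97)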
    cmd = [
        'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = [
        'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id',
        str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id',
        '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
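
# A minimal, self-contained sketch (not part of the original script): the
# CSV-to-FASTA step above, isolated as a reusable helper. It assumes the same
# layout the pipeline's dada2out file uses: a header row, then one row per
# sequence (the sequence itself, followed by per-sample counts). The file
# names in the usage line are hypothetical.
import csv

def dada2_csv_to_fasta(csv_file, fasta_file):
    """Write ASV1..ASVn FASTA records and return {ASV id: total count}."""
    counts = {}
    with open(csv_file, newline='') as fin, open(fasta_file, 'w') as fout:
        reader = csv.reader(fin)  #csv.reader also handles the quoted sequence column
        next(reader)  #skip the sample-name header row
        for num, row in enumerate(reader, start=1):
            asv_id = 'ASV%d' % num
            counts[asv_id] = sum(int(x) for x in row[1:])
            fout.write('>%s\n%s\n' % (asv_id, row[0]))
    return counts

#Usage: OTUCounts = dada2_csv_to_fasta('example.dada2.csv', 'example.otus.tmp')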
Example #4
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min size to keep for clustering')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--unoise',
                        action='store_true',
                        help='Run De-noising (UNOISE)')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep Intermediate Files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-cluster.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
        derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #optional run UNOISE
    if args.unoise:
        unoise_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.denoised.fa')
        amptklib.log.info("Denoising Data with UNOISE")
        cmd = [
            usearch, '-cluster_fast', derep_out, '-centroids', unoise_out,
            '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein',
            '-sizeout', '-sort', 'size', '-threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(unoise_out)
        amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    else:
        unoise_out = derep_out

    #now sort by size and remove singletons (below --minsize)
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    cmd = [
        'vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run clustering algorithm
    radius = str(100 - int(args.pct_otu))
    otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
    amptklib.log.info("Clustering OTUs (UPARSE)")
    cmd = [
        usearch, '-cluster_otus', sort_out, '-relabel', 'OTU',
        '-otu_radius_pct', radius, '-otus', otu_out, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    numOTUs = amptklib.countfasta(otu_out)
    amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs')

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.otus.fa')
    amptklib.fasta_strip_padding(otu_out, otu_clean)

    #optional UCHIME Ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(uchime_out):
            os.remove(uchime_out)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is set up, otherwise assume a full path was given
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to set up the DB, skipping chimera filtering"
                )
                uchime_out = otu_clean
            #uchime cannot work with a udb database, so extract fasta sequences; do this only if not already extracted
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                uchime_out = otu_clean
        #now run chimera filtering if everything checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                '--db', uchime_db, '--nonchimeras', uchime_out, '--threads',
                str(cpus)
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            uchime_chimeras = numOTUs - total
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras')

    #Filter out OTUs in wrong orientation
    amptklib.log.info('Validating OTU orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.otus.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, sort_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(passingOTUs, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
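
# A small sketch (not from the original script) of the threshold arithmetic
# the two clustering paths above rely on: UPARSE's -otu_radius_pct expects
# the clustering *distance* as a percent (100 - pct), while VSEARCH's --id
# expects identity as a fraction (pct / 100). pct_otu mirrors --pct_otu.
def clustering_thresholds(pct_otu):
    pct = float(pct_otu)
    uparse_radius = str(int(100 - pct))  #e.g. '3' for 97
    vsearch_id = str(pct / 100.0)        #e.g. '0.97' for 97
    return uparse_radius, vsearch_id

#clustering_thresholds('97') -> ('3', '0.97')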
Example #5
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-unoise2.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UNOISE2 algorithm.
		Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-m',
                        '--minsize',
                        default='8',
                        help='Min size to keep for denoising')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep Intermediate Files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-unoise2.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
        '--sizeout', '--output', derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run de-noiser UNOISE2
    amptklib.log.info("Denoising reads with UNOISE2")
    unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.fa')
    cmd = [
        usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '-minampsize',
        args.minsize, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(unoise_out)
    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')

    #strip N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.fa')
    amptklib.fasta_strip_padding(unoise_out, otu_clean)

    #run optional uchime_ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is set up, otherwise assume a full path was given
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to set up the DB, skipping chimera filtering"
                )
                uchime_out = otu_clean
            #uchime cannot work with a udb database, so extract fasta sequences; do this only if not already extracted
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            uchime_db = os.path.abspath(args.uchime_ref)
        #now run chimera filtering if everything checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH)")
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                '--db', uchime_db, '--nonchimeras', uchime_out, '--threads',
                str(cpus)
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #inferred sequences
    iSeqs = base + '.ASVs.fa'
    amptklib.fastarename(uchime_out, 'ASV', iSeqs)

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, derep_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #build OTU table with iSeqs
    uc_iSeq_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    iSeq_otu_table = base + '.otu_table.txt'
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout',
        iSeq_otu_table, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_iSeq_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #now cluster to biological OTUs with UCLUST
    radius = float(args.pct_otu) / 100.
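    #note: despite the name, 'radius' here is the identity fraction handed to vsearch --id (e.g. 0.97)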
    amptklib.log.info(
        "Clustering denoised sequences into biological OTUs at %s%%" %
        args.pct_otu)
    uclust_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uclust.fa')
    cmd = [
        'vsearch', '--cluster_smallmem', passingOTUs, '--centroids',
        uclust_out, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uclust_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where denoised sequences clustered
    ClusterComp = base + '.ASVs2clusters.txt'
    iSeqmap = base + '.unoise_map.uc'
    cmd = [
        usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id',
        str(radius), '-uc', iSeqmap, '-strand', 'plus', '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp,
                          base + '.EE' + args.maxee + '.cluster.mapping.uc')
    otu_table = os.path.join(
        tmp, base + '.EE' + args.maxee + '.cluster.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(uclust_out, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("UNOISE2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Amplicon sequence variants: %s" % passingOTUs)
    print("ASV OTU Table: %s" % iSeq_otu_table)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
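
# A minimal sketch (not from the original script) of the .uc parsing repeated
# in the examples above. In a VSEARCH/USEARCH .uc file, column 9 (0-based)
# holds the target/centroid label and column 8 the query label; hit records
# ('H') carry the real centroid, other record types use '*'. 'example.uc' is
# a hypothetical path.
from collections import defaultdict

def parse_uc_membership(uc_file):
    """Return {OTU: [member ids]} from a .uc mapping file."""
    members = defaultdict(list)
    with open(uc_file) as fh:
        for line in fh:
            cols = line.rstrip('\n').split('\t')
            if len(cols) < 10 or cols[0] != 'H':  #keep hit records only
                continue
            members[cols[9]].append(cols[8])
    return dict(members)

#Usage: asv_to_otu = parse_uc_membership('example.uc')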