Пример #1
0
def main(args):
    """Rebuild GenBank output for a genome from a (manually edited) tbl file.

    Parses the command line in ``args``, copies or filters the tbl
    annotation into a temporary ``tbl2asn`` folder, extracts the genome
    sequence from the input GenBank file, runs the ``tbl2asn_parallel.py``
    helper script and finally writes the output files (GBK, tbl, GFF3,
    FASTA, validation reports) into the output directory.

    Args:
        args: list of command-line arguments (without the program name).

    Raises:
        SystemExit: on invalid arguments, or when the tbl2asn step does not
            produce the expected ``genome.gbf`` file.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='fix',
        usage="%(prog)s [options] -i genome.GBK -t genome.tbl",
        description=
        '''Script will update annotation of a Genbank file with new tbl.''',
        epilog="""Written by Jon Palmer (2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Genome in GBK format')
    parser.add_argument('-t',
                        '--tbl',
                        required=True,
                        help='Genome annotation in NCBI tbl format')
    parser.add_argument(
        '-d',
        '--drop',
        help='List of locus_tag to remove/drop from annotation')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('--tbl2asn',
                        default='-l paired-ends',
                        help='Parameters for tbl2asn, linkage and gap info')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(__file__)

    # create log file, always starting from a fresh one
    log_name = 'funannotate-fix.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate (kept separate from the GenBank record
    # version that is parsed further down)
    funannotate_version = lib.get_version()
    lib.log.info("Running %s" % funannotate_version)

    # create output and temporary directory; without --out the results are
    # written next to the tbl file
    if args.out:
        basedir = args.out
    else:
        basedir = os.path.dirname(args.tbl) or '.'
    workdir = os.path.join(basedir, 'tbl2asn')
    if not os.path.isdir(workdir):
        # makedirs also creates basedir when it is missing
        os.makedirs(workdir)
    work_tbl = os.path.join(workdir, 'genome.tbl')
    work_fsa = os.path.join(workdir, 'genome.fsa')

    # copy over the annotation file to tbl2asn folder, or process if args.drop passed
    if args.drop:
        lib.tblfilter(args.tbl, args.drop, work_tbl)
    else:
        shutil.copyfile(args.tbl, work_tbl)

    # get meta information from GBK file
    (organism, strain, isolate, accession, WGS_accession, gb_gi,
     gbk_version) = lib.getGBKinfo(args.input)
    locustag, genenum, justify = lib.getGBKLocusTag(args.input)
    if strain:
        organism_name = organism + '_' + strain
    elif isolate:
        organism_name = organism + '_' + isolate
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')

    # extract fasta file from genbank file,
    lib.log.info('Extracting genome sequence and parsing meta information')
    contigs, genes, trnas = lib.countGenBank(args.input)
    lib.log.info(
        '{:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
        .format(contigs, genes, trnas))
    lib.gb2dna(args.input, work_fsa)

    # assuming that this is the predict_results dir or update_results dir, but check first and then archive
    if '_results' in basedir:
        archivedir = os.path.join(basedir, 'archive_' + str(os.getpid()))
        lib.log.info('Found pre-existing funannotate files, archiving to %s' %
                     archivedir)
        os.makedirs(archivedir)
        # files whose name contains one of these markers stay in place
        keep_markers = ('pasa-reannotation', 'WGS_accession', 'ncbi.p2g',
                        '.parameters.json')
        # move the remaining regular files in results to archive dir
        for fname in os.listdir(basedir):
            if any(marker in fname for marker in keep_markers):
                continue
            src = os.path.join(basedir, fname)
            if os.path.isfile(src):
                os.rename(src, os.path.join(archivedir, fname))

    # now we can run tbl2asn
    SBT = os.path.join(parentdir, 'config', 'test.sbt')
    discrep = os.path.join(basedir, organism_name + '.discrepency.txt')
    if not gbk_version:
        gbk_version = 1
    lib.log.info('Converting to GenBank format')
    # have to run as subprocess because of multiprocessing issues
    cmd = [
        sys.executable,
        os.path.join(parentdir, 'aux_scripts', 'tbl2asn_parallel.py'), '-i',
        work_tbl, '-f', work_fsa, '-o', workdir, '--sbt', SBT, '-d', discrep,
        '-s', organism, '-t', args.tbl2asn, '-v',
        str(gbk_version), '-c', '4'
    ]
    if isolate:
        cmd += ['--isolate', isolate]
    if strain:
        cmd += ['--strain', strain]
    lib.log.debug(' '.join(cmd))
    returncode = subprocess.call(cmd)

    # fail with a clear message instead of an opaque copy error further down
    # when the conversion did not produce its GenBank output
    work_gbk = os.path.join(workdir, 'genome.gbf')
    if not os.path.isfile(work_gbk):
        lib.log.error(
            'tbl2asn conversion failed (exit status %s): expected output %s was not created'
            % (returncode, work_gbk))
        sys.exit(1)

    # now get GBK files from folder
    lib.log.info('Generating output files.')
    # setup final output files
    final_fasta = os.path.join(basedir, organism_name + '.scaffolds.fa')
    final_gff = os.path.join(basedir, organism_name + '.gff3')
    final_gbk = os.path.join(basedir, organism_name + '.gbk')
    final_tbl = os.path.join(basedir, organism_name + '.tbl')
    final_proteins = os.path.join(basedir, organism_name + '.proteins.fa')
    final_transcripts = os.path.join(basedir,
                                     organism_name + '.mrna-transcripts.fa')
    final_cds_transcripts = os.path.join(basedir,
                                         organism_name + '.cds-transcripts.fa')
    final_validation = os.path.join(basedir, organism_name + '.validation.txt')
    final_error = os.path.join(basedir, organism_name + '.error.summary.txt')
    final_fixes = os.path.join(basedir,
                               organism_name + '.models-need-fixing.txt')

    # retrieve files/reorganize
    shutil.copyfile(work_gbk, final_gbk)
    shutil.copyfile(work_tbl, final_tbl)
    shutil.copyfile(os.path.join(workdir, 'genome.val'), final_validation)
    shutil.copyfile(os.path.join(workdir, 'errorsummary.val'), final_error)
    lib.tbl2allout(final_tbl, work_fsa, final_gff, final_proteins,
                   final_transcripts, final_cds_transcripts, final_fasta)
    errors = lib.ncbiCheckErrors(final_error, final_validation, locustag,
                                 final_fixes)
    if errors > 0:
        lib.log.info(
            "Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n"
            % (final_tbl, final_gbk, final_tbl))
    else:
        contigs, genes, trnas = lib.countGenBank(final_gbk)
        lib.log.info(
            'Output genome consists of: {:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
            .format(contigs, genes, trnas))

    # clean up
    shutil.rmtree(workdir)
Пример #2
0
def main(args):
    """Soft-mask repeats in a genome assembly and report masking statistics.

    Parses the command line in ``args``, runs the selected masking method
    (tantan, or RepeatModeler/RepeatMasker via the helper functions of this
    module), then reads the masked FASTA back in to log the number of
    scaffolds, the assembly size and the amount of lower-case (masked)
    sequence.

    Args:
        args: list of command-line arguments (without the program name).

    Raises:
        SystemExit: on invalid arguments or when the supplied repeat library
            is not a valid file.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m',
                        '--method',
                        default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s',
                        '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l',
        '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats(capture stderr), always starting fresh
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))

    repeats = None  # name of a newly built RepeatModeler library, if any
    tmpdir = None  # only the RepeatModeler/RepeatMasker paths need one
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # create tmpdir with a unique name
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # parse options which dictates how repeatmodeler/masker are run
        if args.repeatmodeler_lib:
            # a pre-computed library was given, mask with it directly
            if not lib.checkannotations(args.repeatmodeler_lib):
                lib.log.error(
                    'ERROR: repeat library is not a valid file: {:}'.format(
                        args.repeatmodeler_lib))
                sys.exit(1)
            RepeatMask(args.input, args.repeatmodeler_lib, args.cpus, tmpdir,
                       args.out, log_name)
        elif args.repeatmasker_species:
            # species given, so skip RepeatModeler
            RepeatMaskSpecies(args.input, args.repeatmasker_species,
                              args.cpus, tmpdir, args.out, log_name)
        else:
            # no library and no species, so run entire repeatmodeler + repeat masker
            repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
            RepeatModelMask(args.input, args.cpus, tmpdir, args.out, repeats,
                            args.repeatmasker_species, log_name)

    # output some stats on %reads masked.
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as masked_fasta:
        for _title, seq in SimpleFastaParser(masked_fasta):
            scaffolds += 1
            GenomeLength += len(seq)
            maskedSize += lib.n_lower_chars(seq)

    # an output without any sequence would otherwise divide by zero
    if GenomeLength > 0:
        percentMask = maskedSize / float(GenomeLength)
    else:
        percentMask = 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength, maskedSize,
                percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))
    # clean up, unless intermediate files were requested with --debug
    if tmpdir and not args.debug:
        lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
Пример #3
0
def main(args):
    """Run remote functional-annotation searches (Phobius, antiSMASH).

    Parses the command line in ``args``, locates or generates the protein
    FASTA and GenBank files (either from a funannotate results folder given
    with ``--input`` or from a GenBank file given with ``--genbank``), then
    runs the requested methods and stores their results in the
    ``annotate_misc`` sub-folder of the output directory.

    Side effects: sets the module globals ``parentdir``, ``RUNIPRSCAN`` and
    ``XMLCombine``; writes and finally moves ``funannotate-remote.log``.

    Args:
        args: list of command-line arguments (without the program name).

    Raises:
        SystemExit: on invalid arguments, missing input files, a busy
            antiSMASH queue (without ``--force``), a failed antiSMASH job
            or an unparsable antiSMASH result page.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-remote.py',
        description=
        '''Script that adds functional annotation to a genome using remote searches.''',
        epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        help='Folder from funannotate predict.')
    parser.add_argument('-g',
                        '--genbank',
                        help='Annotated genome in GenBank format')
    parser.add_argument('-m',
                        '--methods',
                        required=True,
                        nargs='+',
                        choices=['all', 'phobius', 'antismash'],
                        help='Method to run')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('-e',
                        '--email',
                        required=True,
                        help='Email address for IPRSCAN server')
    parser.add_argument('--force',
                        action='store_true',
                        help='Over-write output folder')
    parser.add_argument('-a',
                        '--antismash',
                        default='fungi',
                        choices=['fungi', 'plants'],
                        help='antiSMASH server')
    args = parser.parse_args(args)

    global parentdir, RUNIPRSCAN, XMLCombine
    parentdir = os.path.join(os.path.dirname(__file__))
    RUNIPRSCAN = os.path.join(parentdir, 'aux_scripts', 'runIPRscan.py')
    XMLCombine = os.path.join(parentdir, 'aux_scripts', 'xmlcombine.py')

    # create log file, always starting from a fresh one
    log_name = 'funannotate-remote.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    # print() with a single argument behaves the same on Python 2 and 3
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # need to do some checks here of the input
    genbank = ''
    Proteins = ''
    tablefile = ''
    Fastafile = ''
    if not args.input:
        # did not parse folder of funannotate results, so need a GenBank file
        # and also need to have args.out for output folder
        if not args.out:
            lib.log.error(
                "If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)"
            )
            sys.exit(1)
        if not args.genbank:
            lib.log.error(
                "You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank"
            )
            sys.exit(1)
        outputdir = args.out
        # only report a pre-existing folder when it really was there before
        # this run; existing data in it is kept
        if os.path.isdir(outputdir):
            lib.log.error("Output directory %s already exists" % (outputdir))
        # create outputdir and whichever subdirs are still missing
        for subdir in ('annotate_misc', 'annotate_results', 'logfiles'):
            subdir_path = os.path.join(outputdir, subdir)
            if not os.path.isdir(subdir_path):
                os.makedirs(subdir_path)
        genbank = args.genbank
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        lib.log.info("Checking GenBank file for annotation")
        if not lib.checkGenBank(genbank):
            lib.log.error("Found no annotation in GenBank file, exiting")
            sys.exit(1)
        lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    else:
        # should be a folder, with funannotate files, thus store results there, no need to create output folder
        if not os.path.isdir(args.input):
            lib.log.error("%s directory does not exist" % args.input)
            sys.exit(1)
        # funannotate results should be here, update_results takes precedence
        if os.path.isdir(os.path.join(args.input, 'update_results')):
            inputdir = os.path.join(args.input, 'update_results')
        elif os.path.isdir(os.path.join(args.input, 'predict_results')):
            inputdir = os.path.join(args.input, 'predict_results')
        else:
            # here user specified the predict_results folder, or it is a custom folder
            inputdir = args.input

        # get files that you need
        for fname in os.listdir(inputdir):
            if fname.endswith('.gbk'):
                genbank = os.path.join(inputdir, fname)
            elif fname.endswith('.tbl'):
                tablefile = os.path.join(inputdir, fname)
            elif fname.endswith('.scaffolds.fa'):
                Fastafile = os.path.join(inputdir, fname)
        # now create the files from genbank input file for consistency in gene naming, etc
        if not genbank:
            lib.log.error(
                "Properly formatted 'funannotate predict' files do no exist in this directory"
            )
            sys.exit(1)

        # if user gave predict_results folder, then set output to up one directory
        if 'predict_results' in inputdir or 'update_results' in inputdir:
            outputdir = lib.get_parent_dir(inputdir)
        elif not args.out:
            outputdir = inputdir  # output the results in the input directory
        else:
            outputdir = args.out
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
        # create output directories; each one is checked on its own so a
        # partially populated output folder does not make makedirs fail
        if os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
            lib.log.error(
                "Output directory %s already exists, will use any existing data.  If this is not what you want, exit, and provide a unique name for output folder"
                % (outputdir))
        for subdir in ('annotate_misc', 'annotate_results'):
            subdir_path = os.path.join(outputdir, subdir)
            if not os.path.isdir(subdir_path):
                os.makedirs(subdir_path)
        lib.log.info("Parsing input files")
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.mrna-transcripts.fasta')
        CDSTranscripts = os.path.join(outputdir, 'annotate_misc',
                                      'genome.cds-transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        if tablefile and Fastafile:
            lib.log.debug("Generating files from %s" % tablefile)
            lib.tbl2allout(tablefile, Fastafile, GFF, Proteins, Transcripts,
                           CDSTranscripts, Scaffolds)
        else:
            lib.log.debug("Generating files from %s" % genbank)
            lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    # make sure logfiles directory is present, will need later
    if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
        os.makedirs(os.path.join(outputdir, 'logfiles'))

    # get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
    Proteins = os.path.abspath(Proteins)
    genbank = os.path.abspath(genbank)

    if 'phobius' in args.methods or 'all' in args.methods:
        # run Phobius to predict secreted proteins and membrane
        phobius_out = os.path.join(outputdir, 'annotate_misc',
                                   'phobius.results.txt')
        phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
        lib.log.info(
            "Predicting secreted and transmembrane proteins using Phobius")
        # skip the search when results are already present
        if not lib.checkannotations(phobius_out):
            phobius_cmd = [
                os.path.join(parentdir, 'aux_scripts',
                             'phobius-multiproc.py'), '-i', Proteins, '-o',
                phobius_out
            ]
            if args.email:
                phobius_cmd += ['-e', str(args.email)]
            phobius_cmd += ['-l', phobiusLog]
            subprocess.call(phobius_cmd)

    if 'antismash' in args.methods or 'all' in args.methods:
        if args.antismash == 'fungi':
            base_address = "https://fungismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'ncbi': '',
                'smcogs': 'on',
                'knownclusterblast': 'on',
                'activesitefinder': 'on',
                'subclusterblast': 'on',
                'jobtype': 'antismash5',
                'hmmdetection_strictness': 'relaxed'
            }
        elif args.antismash == 'plants':
            base_address = "https://plantismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'knownclusterblast': 'on',
                'subclusterblast': 'on'
            }
        server_info = requests.get(base_address + "/api/v1.0/version").json()
        as_vers = server_info['antismash_generation']
        tax = server_info['taxon']
        server_stats = requests.get(base_address + "/api/v1.0/stats").json()
        queue = server_stats['queue_length']
        running = server_stats['running']
        lib.log.info("Connecting to antiSMASH %s v%s webserver" %
                     (tax, as_vers))
        lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
        lib.log.info("PLEASE to not abuse the webserver, be considerate!")
        if int(queue) > 10 and not args.force:
            lib.log.error(
                "There are more than 10 antiSMASH jobs in queue, use --force to submit anyway"
            )
            sys.exit(1)

        lib.log.info("Uploading %s to webserver" % genbank)
        # the context manager closes the upload handle once the request is sent
        with open(genbank, 'rb') as seq_handle:
            postjob = requests.post(base_address + "/api/v1.0/submit",
                                    files={'seq': seq_handle},
                                    data=job_parameters)
        jobid = postjob.json()['id']
        # now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
        lib.log.info("Waiting for results from job: %s" % jobid)
        while True:
            job_status = requests.get(base_address + "/api/v1.0/status/" +
                                      jobid)
            status = job_status.json()['status']
            if status == 'done':
                break
            # NOTE(review): assumes the server reports a terminal 'failed'
            # status for broken jobs -- confirm against the antiSMASH API;
            # without this check a failed job is polled forever
            if status == 'failed':
                lib.log.error("antiSMASH job %s failed: %s" %
                              (jobid, job_status.json()))
                sys.exit(1)
            time.sleep(60)  # check every minute
        result_url = job_status.json()['result_url']
        base_url = result_url.replace('index.html', '')
        lib.log.info("antiSMASH v%s job finished" % (as_vers))
        lib.log.debug("%s" % job_status.json())
        # need to retrieve results, have to find link, seems like this might be first scaffold name?
        # after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
        job_html = requests.get(base_address + result_url)
        link = None
        # use the decoded text so the substring tests compare str with str
        for line in job_html.text.splitlines():
            if 'Download all results' not in line:
                continue
            for chunk in line.split('a href="'):
                if '.zip' in chunk:
                    link = chunk.split('"')[0]
        if not link:
            lib.log.error('Error parsing output zip file from antismash')
            sys.exit(1)
        download_url = base_address + base_url + link
        download(download_url, 'antiSMASH.zip')
        # now unzip and move folder
        with zipfile.ZipFile('antiSMASH.zip', 'r') as zipref:
            zipref.extractall(os.path.join(outputdir, jobid))
        os.remove('antiSMASH.zip')
        lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
        # now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
        anti_GBK = os.path.join(outputdir, jobid, os.path.basename(genbank))
        final = os.path.join(outputdir, 'annotate_misc',
                             'antiSMASH.results.gbk')
        shutil.copyfile(anti_GBK, final)
        lib.log.info("Results GBK: %s" % final)

    lib.log.info("Remote searches complete")
    # move logfile
    if os.path.isfile(log_name):
        shutil.copyfile(log_name, os.path.join(outputdir, 'logfiles',
                                               log_name))
        os.remove(log_name)
Пример #4
0
def main(args):
    """Add the single-copy BUSCO proteins of a proteome to the outgroups.

    Parses the command line in ``args``, runs the bundled
    ``funannotate-BUSCO2.py`` script in protein mode against the selected
    BUSCO lineage, keeps the models reported as ``Complete`` whose sequence
    ID is unique, and writes those sequences (renamed to their BUSCO ID) to
    ``<FUNDB>/outgroups/<species>.<busco_db>_buscos.fa``.

    Args:
        args: list of command-line arguments (without the program name).

    Raises:
        SystemExit: on invalid arguments, a missing funannotate/BUSCO
            database, or when BUSCO did not produce its results table.
    """
    # setup menu with argparse
    # NOTE(review): prog/usage mention "funannotate-predict.py" although the
    # description is about outgroups -- looks copied from another script.
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s',
                        '--species',
                        required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b',
        '--busco_db',
        default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c',
                        '--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d',
                        '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(__file__)

    # get base name
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db

    # create log file, always starting from a fresh one
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # locate the funannotate database; done after lib.setupLogging() so the
    # error below goes through the configured logger
    # NOTE(review): presumably lib.log is only usable after setupLogging() --
    # confirm against the library module
    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # check buscos, download if necessary
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    if not os.path.isdir(BUSCODB):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)

    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # check that it ran correctly
    busco_results = os.path.join('run_' + species,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # map sequence ID -> BUSCO ID for complete models; a sequence ID that
    # shows up more than once is dropped and stays dropped
    nameChange = {}
    duplicates = set()
    # plain 'r': the 'U' flag was removed in Python 3.11 and text mode
    # already applies universal newlines on Python 3
    with open(busco_results, 'r') as table:
        for line in table:
            if line.startswith('#'):
                continue
            cols = line.rstrip('\r\n').split('\t')
            # rows without a sequence column (blank lines, missing models)
            # cannot be complete hits
            if len(cols) < 3 or cols[1] != 'Complete':
                continue
            busco_id, seq_id = cols[0], cols[2]
            if seq_id not in nameChange and seq_id not in duplicates:
                nameChange[seq_id] = busco_id
                continue
            lib.log.error(
                "Duplicate ID found: %s %s. Removing from results" %
                (seq_id, busco_id))
            nameChange.pop(seq_id, None)
            duplicates.add(seq_id)

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # setup output proteome, creating the outgroups folder on first use
    if not os.path.isdir(OUTGROUPS):
        os.makedirs(OUTGROUPS)
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for seq_id, busco_id in nameChange.items():
            rec = SeqRecords[seq_id]
            output.write('>%s\n%s\n' % (busco_id, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up your mess; 'tmp' is only removed when it is actually there so
    # a successful run does not end with an exception
    shutil.rmtree('run_' + species)
    if os.path.isdir('tmp'):
        shutil.rmtree('tmp')