예제 #1
0
def runtbl2asn(folder, template, discrepency, organism, isolate, strain,
               parameters, version):
    '''
    Run NCBI tbl2asn on a folder and return the command that was executed.

    The command line is assembled from the arguments, run through
    runSubprocess() with '.' as its second argument, and returned as a single
    space-joined string.

    Parameters:
        folder: directory handed to tbl2asn via -p; must be an existing
            directory.
        template: value handed to tbl2asn via -t.
        discrepency: value handed to tbl2asn via -Z.
        organism: organism name (required); becomes [organism=...] in the
            -j source qualifier string.
        isolate: optional; appended to the -j string as [isolate=...].
        strain: optional; appended to the -j string as [strain=...].
        parameters: optional string of extra tbl2asn arguments separated by
            whitespace; appended to the end of the command.
        version: value handed to tbl2asn via -N (converted with str()).

    Returns:
        The full command as one space-joined string.

    Exits the interpreter with status 1 when folder is not a directory or
    organism is empty.
    '''
    # get funannotate version
    fun_version = lib.get_version()
    # input should be a folder
    if not os.path.isdir(folder):
        print("tbl2asn error: %s is not a directory, exiting" % folder)
        sys.exit(1)
    # based on organism, isolate, strain, construct meta info for -j flag
    if not organism:
        print("tbl2asn error: organism not specified")
        sys.exit(1)
    meta_fields = ["[organism=" + organism + "]"]
    if isolate:
        meta_fields.append("[isolate=" + isolate + "]")
    if strain:
        meta_fields.append("[strain=" + strain + "]")
    meta = " ".join(meta_fields)
    # NOTE(review): the -y and -j values carry literal double quotes inside
    # the argument string. Whether that is needed depends on how
    # runSubprocess launches the command (not visible here) - left unchanged.
    cmd = [
        'tbl2asn', '-y', '"Annotated using ' + fun_version + '"', '-N',
        str(version), '-p', folder, '-t', template, '-M', 'n', '-Z',
        discrepency, '-j', '"' + meta + '"', '-V', 'b', '-c', 'fx', '-T', '-a',
        'r10u'
    ]
    # check for custom parameters; split on any run of whitespace so that
    # repeated/leading/trailing spaces do not turn into empty arguments
    if parameters:
        cmd = cmd + parameters.split()
    runSubprocess(cmd, '.')
    return ' '.join(cmd)
예제 #2
0
def main(args):
    '''
    Command-line entry point for "funannotate fix".

    Takes a GenBank file (-i) and an NCBI tbl annotation file (-t), rebuilds
    the annotation by running aux_scripts/tbl2asn_parallel.py in a temporary
    "tbl2asn" folder under the output directory, and writes the regenerated
    output files (gbk, tbl, gff3, FASTA files, validation and error summaries)
    named after the organism.

    Parameters:
        args: list of command-line argument strings handed to argparse.

    Side effects:
        - removes and recreates funannotate-fix.log in the current directory
        - creates the output directory if needed
        - when the output directory name contains "_results", moves the plain
          files already in it into a new archive_<pid> subfolder
        - deletes the temporary tbl2asn folder at the end

    Exits with status 1 if the tbl2asn step does not produce the files that
    are needed downstream.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='fix',
        usage="%(prog)s [options] -i genome.GBK -t genome.tbl",
        description=
        '''Script will update annotation of a Genbank file with new tbl.''',
        epilog="""Written by Jon Palmer (2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Genome in GBK format')
    parser.add_argument('-t',
                        '--tbl',
                        required=True,
                        help='Genome annotation in NCBI tbl format')
    parser.add_argument(
        '-d',
        '--drop',
        help='List of locus_tag to remove/drop from annotation')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('--tbl2asn',
                        default='-l paired-ends',
                        help='Parameters for tbl2asn, linkage and gap info')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(__file__)

    # create log file
    log_name = 'funannotate-fix.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    fun_version = lib.get_version()
    lib.log.info("Running %s" % fun_version)

    # create output and temporary directory
    if args.out:
        basedir = args.out
    else:
        # get location from tbl file; a bare filename means current directory
        basedir = os.path.dirname(args.tbl) or '.'
    tbl2asn_dir = os.path.join(basedir, 'tbl2asn')
    work_tbl = os.path.join(tbl2asn_dir, 'genome.tbl')
    work_fasta = os.path.join(tbl2asn_dir, 'genome.fsa')

    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    if not os.path.isdir(tbl2asn_dir):
        os.makedirs(tbl2asn_dir)

    # copy over the annotation file to tbl2asn folder, or process if args.drop passed
    if args.drop:
        lib.tblfilter(args.tbl, args.drop, work_tbl)
    else:
        shutil.copyfile(args.tbl, work_tbl)

    # get information info from GBK file; the GenBank sequence version is kept
    # under its own name so it is not confused with the funannotate version
    organism, strain, isolate, accession, WGS_accession, gb_gi, gbk_version = lib.getGBKinfo(
        args.input)
    locustag, genenum, justify = lib.getGBKLocusTag(args.input)
    if strain:
        organism_name = organism + '_' + strain
    elif isolate:
        organism_name = organism + '_' + isolate
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')

    # extract fasta file from genbank file,
    lib.log.info('Extracting genome sequence and parsing meta information')
    contigs, genes, trnas = lib.countGenBank(args.input)
    lib.log.info(
        '{:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
        .format(contigs, genes, trnas))
    lib.gb2dna(args.input, work_fasta)

    # assuming that this is the predict_results dir or update_results dir, but check first and then archive
    if '_results' in basedir:
        archivedir = os.path.join(basedir, 'archive_' + str(os.getpid()))
        lib.log.info('Found pre-existing funannotate files, archiving to %s' %
                     archivedir)
        os.makedirs(archivedir)
        # move files in results to archive dir, except the ones to keep in place
        keep_markers = ('pasa-reannotation', 'WGS_accession', 'ncbi.p2g',
                        '.parameters.json')
        for fname in os.listdir(basedir):
            if any(marker in fname for marker in keep_markers):
                continue
            if os.path.isfile(os.path.join(basedir, fname)):
                os.rename(os.path.join(basedir, fname),
                          os.path.join(archivedir, fname))

    # now we can run tbl2asn
    SBT = os.path.join(parentdir, 'config', 'test.sbt')
    discrep = os.path.join(basedir, organism_name + '.discrepency.txt')
    if not gbk_version:
        gbk_version = 1
    lib.log.info('Converting to GenBank format')
    # have to run as subprocess because of multiprocessing issues
    cmd = [
        sys.executable,
        os.path.join(parentdir, 'aux_scripts', 'tbl2asn_parallel.py'), '-i',
        work_tbl, '-f', work_fasta, '-o', tbl2asn_dir, '--sbt', SBT, '-d',
        discrep, '-s', organism, '-t', args.tbl2asn, '-v',
        str(gbk_version), '-c', '4'
    ]
    if isolate:
        cmd += ['--isolate', isolate]
    if strain:
        cmd += ['--strain', strain]
    lib.log.debug(' '.join(cmd))
    returncode = subprocess.call(cmd)

    # the files below are copied unconditionally further down; fail here with
    # a clear message instead of an opaque copy error if tbl2asn did not run
    work_gbf = os.path.join(tbl2asn_dir, 'genome.gbf')
    work_val = os.path.join(tbl2asn_dir, 'genome.val')
    work_errsum = os.path.join(tbl2asn_dir, 'errorsummary.val')
    not_produced = [
        path for path in (work_gbf, work_val, work_errsum)
        if not os.path.isfile(path)
    ]
    if not_produced:
        lib.log.error(
            'tbl2asn_parallel.py exited with status %s and did not produce: %s'
            % (returncode, ', '.join(not_produced)))
        sys.exit(1)

    # now get GBK files from folder
    lib.log.info('Generating output files.')
    # setup final output files
    final_fasta = os.path.join(basedir, organism_name + '.scaffolds.fa')
    final_gff = os.path.join(basedir, organism_name + '.gff3')
    final_gbk = os.path.join(basedir, organism_name + '.gbk')
    final_tbl = os.path.join(basedir, organism_name + '.tbl')
    final_proteins = os.path.join(basedir, organism_name + '.proteins.fa')
    final_transcripts = os.path.join(basedir,
                                     organism_name + '.mrna-transcripts.fa')
    final_cds_transcripts = os.path.join(basedir,
                                         organism_name + '.cds-transcripts.fa')
    final_validation = os.path.join(basedir, organism_name + '.validation.txt')
    final_error = os.path.join(basedir, organism_name + '.error.summary.txt')
    final_fixes = os.path.join(basedir,
                               organism_name + '.models-need-fixing.txt')

    # retrieve files/reorganize
    shutil.copyfile(work_gbf, final_gbk)
    shutil.copyfile(work_tbl, final_tbl)
    shutil.copyfile(work_val, final_validation)
    shutil.copyfile(work_errsum, final_error)
    lib.tbl2allout(final_tbl, work_fasta, final_gff, final_proteins,
                   final_transcripts, final_cds_transcripts, final_fasta)
    errors = lib.ncbiCheckErrors(final_error, final_validation, locustag,
                                 final_fixes)
    if errors > 0:
        lib.log.info(
            "Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n"
            % (final_tbl, final_gbk, final_tbl))
    else:
        contigs, genes, trnas = lib.countGenBank(final_gbk)
        lib.log.info(
            'Output genome consists of: {:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
            .format(contigs, genes, trnas))

    # clean up
    shutil.rmtree(tbl2asn_dir)
예제 #3
0
def main(args):
    '''
    Command-line entry point for "funannotate remote".

    Prepares protein/GFF/FASTA files from either a funannotate results folder
    (-i) or a GenBank file (-g, requires -o), then runs the requested remote
    methods:

      * phobius   - calls aux_scripts/phobius-multiproc.py, skipped when
                    lib.checkannotations() accepts the existing results file
      * antismash - uploads the GenBank file to the fungiSMASH/plantiSMASH
                    web API, polls once a minute until the job reports
                    "done", downloads and unpacks the result zip, and copies
                    the resulting GenBank file into annotate_misc

    Parameters:
        args: list of command-line argument strings handed to argparse.

    Side effects:
        - sets the module globals parentdir, RUNIPRSCAN and XMLCombine
        - removes/recreates funannotate-remote.log in the current directory
          and moves it into <output>/logfiles at the end
        - creates annotate_misc, annotate_results and logfiles folders

    Exits with status 1 on invalid input, when the antiSMASH queue is longer
    than 10 without --force, or when the result zip link cannot be found.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-remote.py',
        description=
        '''Script that adds functional annotation to a genome using remote searches.''',
        epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        help='Folder from funannotate predict.')
    parser.add_argument('-g',
                        '--genbank',
                        help='Annotated genome in GenBank format')
    parser.add_argument('-m',
                        '--methods',
                        required=True,
                        nargs='+',
                        choices=['all', 'phobius', 'antismash'],
                        help='Method to run')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('-e',
                        '--email',
                        required=True,
                        help='Email address for IPRSCAN server')
    parser.add_argument('--force',
                        action='store_true',
                        help='Over-write output folder')
    parser.add_argument('-a',
                        '--antismash',
                        default='fungi',
                        choices=['fungi', 'plants'],
                        help='antiSMASH server')
    args = parser.parse_args(args)

    global parentdir, RUNIPRSCAN, XMLCombine
    parentdir = os.path.join(os.path.dirname(__file__))
    RUNIPRSCAN = os.path.join(parentdir, 'aux_scripts', 'runIPRscan.py')
    XMLCombine = os.path.join(parentdir, 'aux_scripts', 'xmlcombine.py')

    # create log file
    log_name = 'funannotate-remote.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    # parenthesized form is valid as both a Python 2 statement and a
    # Python 3 function call
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # need to do some checks here of the input
    genbank = ''
    Proteins = ''
    tablefile = ''
    Fastafile = ''
    subdirs = ('annotate_misc', 'annotate_results', 'logfiles')
    if not args.input:
        # did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
        if not args.out:
            lib.log.error(
                "If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)"
            )
            sys.exit(1)
        outputdir = args.out
        # remember whether the folder was already there, so the "already
        # exists" message is only reported for a genuinely pre-existing one
        preexisting = os.path.isdir(outputdir)
        if not preexisting:
            # create outputdir and subdirs
            os.makedirs(outputdir)
            for sub in subdirs:
                os.makedirs(os.path.join(outputdir, sub))
        if not args.genbank:
            lib.log.error(
                "You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank"
            )
            sys.exit(1)
        if preexisting:
            lib.log.error("Output directory %s already exists" % (outputdir))
            for sub in subdirs:
                if not os.path.isdir(os.path.join(outputdir, sub)):
                    os.makedirs(os.path.join(outputdir, sub))
        genbank = args.genbank
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        lib.log.info("Checking GenBank file for annotation")
        if not lib.checkGenBank(genbank):
            lib.log.error("Found no annotation in GenBank file, exiting")
            sys.exit(1)
        lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    else:
        # should be a folder, with funannotate files, thus store results there, no need to create output folder
        if not os.path.isdir(args.input):
            lib.log.error("%s directory does not exist" % args.input)
            sys.exit(1)
        # funannotate results should be here
        if os.path.isdir(os.path.join(args.input, 'update_results')):
            inputdir = os.path.join(args.input, 'update_results')
        elif os.path.isdir(os.path.join(args.input, 'predict_results')):
            inputdir = os.path.join(args.input, 'predict_results')
        else:
            # here user specified the predict_results folder, or it is a custom folder
            inputdir = os.path.join(args.input)

        # get files that you need
        for fname in os.listdir(inputdir):
            if fname.endswith('.gbk'):
                genbank = os.path.join(inputdir, fname)
            elif fname.endswith('.tbl'):
                tablefile = os.path.join(inputdir, fname)
            elif fname.endswith('.scaffolds.fa'):
                Fastafile = os.path.join(inputdir, fname)
        # now create the files from genbank input file for consistency in gene naming, etc
        if not genbank:
            lib.log.error(
                "Properly formatted 'funannotate predict' files do no exist in this directory"
            )
            sys.exit(1)
        # if user gave predict_results folder, then set output to up one directory
        if 'predict_results' in inputdir or 'update_results' in inputdir:
            outputdir = lib.get_parent_dir(inputdir)
        elif not args.out:
            outputdir = inputdir  # output the results in the input directory
        else:
            outputdir = args.out
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
        # create output directories; each one is checked on its own so a
        # partially populated output folder does not make makedirs fail
        if os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
            lib.log.error(
                "Output directory %s already exists, will use any existing data.  If this is not what you want, exit, and provide a unique name for output folder"
                % (outputdir))
        else:
            os.makedirs(os.path.join(outputdir, 'annotate_misc'))
        if not os.path.isdir(os.path.join(outputdir, 'annotate_results')):
            os.makedirs(os.path.join(outputdir, 'annotate_results'))
        lib.log.info("Parsing input files")
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.mrna-transcripts.fasta')
        CDSTranscripts = os.path.join(outputdir, 'annotate_misc',
                                      'genome.cds-transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        if tablefile and Fastafile:
            lib.log.debug("Generating files from %s" % tablefile)
            lib.tbl2allout(tablefile, Fastafile, GFF, Proteins,
                           Transcripts, CDSTranscripts, Scaffolds)
        else:
            lib.log.debug("Generating files from %s" % genbank)
            lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    # make sure logfiles directory is present, will need later
    if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
        os.makedirs(os.path.join(outputdir, 'logfiles'))

    # get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
    Proteins = os.path.abspath(Proteins)
    genbank = os.path.abspath(genbank)

    if 'phobius' in args.methods or 'all' in args.methods:
        # run Phobius to predict secreted proteins and membrane, default is local if installed, otherwise remote
        phobius_out = os.path.join(outputdir, 'annotate_misc',
                                   'phobius.results.txt')
        phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
        lib.log.info(
            "Predicting secreted and transmembrane proteins using Phobius")
        if not lib.checkannotations(phobius_out):
            phobius_cmd = [
                os.path.join(parentdir, 'aux_scripts', 'phobius-multiproc.py'),
                '-i', Proteins, '-o', phobius_out
            ]
            # the email flag is only passed when an address is available
            if args.email:
                phobius_cmd += ['-e', str(args.email)]
            phobius_cmd += ['-l', phobiusLog]
            subprocess.call(phobius_cmd)

    if 'antismash' in args.methods or 'all' in args.methods:
        if args.antismash == 'fungi':
            base_address = "https://fungismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'ncbi': '',
                'smcogs': 'on',
                'knownclusterblast': 'on',
                'activesitefinder': 'on',
                'subclusterblast': 'on',
                'jobtype': 'antismash5',
                'hmmdetection_strictness': 'relaxed'
            }
        elif args.antismash == 'plants':
            base_address = "https://plantismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'knownclusterblast': 'on',
                'subclusterblast': 'on'
            }
        server_info = requests.get(base_address + "/api/v1.0/version").json()
        as_vers = server_info['antismash_generation']
        tax = server_info['taxon']
        server_stats = requests.get(base_address + "/api/v1.0/stats").json()
        queue = server_stats['queue_length']
        running = server_stats['running']
        lib.log.info("Connecting to antiSMASH %s v%s webserver" %
                     (tax, as_vers))
        lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
        lib.log.info("PLEASE to not abuse the webserver, be considerate!")
        if int(queue) > 10 and not args.force:
            lib.log.error(
                "There are more than 10 antiSMASH jobs in queue, use --force to submit anyway"
            )
            sys.exit(1)

        lib.log.info("Uploading %s to webserver" % genbank)
        # the context manager closes the upload handle once the POST returns
        with open(genbank, 'rb') as seq_handle:
            postjob = requests.post(base_address + "/api/v1.0/submit",
                                    files={'seq': seq_handle},
                                    data=job_parameters)
        jobid = postjob.json()['id']
        # now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
        lib.log.info("Waiting for results from job: %s" % jobid)
        # NOTE(review): the loop only ends on status "done"; a job that ends
        # in any other terminal state would be polled forever - confirm the
        # possible status values against the antiSMASH API and handle them.
        while True:
            job_status = requests.get(base_address + "/api/v1.0/status/" +
                                      jobid)
            status_info = job_status.json()
            if status_info['status'] == 'done':
                break
            time.sleep(60)  # check every minute
        result_url = status_info['result_url']
        base_url = result_url.replace('index.html', '')
        lib.log.info("antiSMASH v%s job finished" % (as_vers))
        lib.log.debug("%s" % status_info)
        # need to retrieve results, have to find link, seems like this might be first scaffold name?
        # after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
        job_html = requests.get(base_address + result_url)
        link = None
        cols = []  # stays empty when the page has no download line
        # use the decoded text so the substring tests compare text with text
        # (iter_lines() yields bytes on Python 3)
        for line in job_html.text.splitlines():
            if 'Download all results' in line:
                cols = line.split('a href="')
        for x in cols:
            if '.zip' in x:
                link = x.split('"')[0]
        if not link:
            lib.log.error('Error parsing output zip file from antismash')
            sys.exit(1)
        download_url = base_address + base_url + link
        download(download_url, 'antiSMASH.zip')
        # now unzip and move folder
        with zipfile.ZipFile('antiSMASH.zip', 'r') as zipref:
            zipref.extractall(os.path.join(outputdir, jobid))
        os.remove('antiSMASH.zip')
        lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
        # now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
        anti_GBK = os.path.join(outputdir, jobid, os.path.basename(genbank))
        final = os.path.join(outputdir, 'annotate_misc',
                             'antiSMASH.results.gbk')
        shutil.copyfile(anti_GBK, final)
        lib.log.info("Results GBK: %s" % final)

    lib.log.info("Remote searches complete")
    # move logfile
    if os.path.isfile(log_name):
        shutil.copyfile(log_name, os.path.join(outputdir, 'logfiles',
                                               log_name))
        os.remove(log_name)
예제 #4
0
def main(args):
    '''
    Print a report of which funannotate dependencies are available.

    Checks, in order: Python packages (checkPyModule), Perl modules
    (checkPerlModule), a fixed set of environment variables, and external
    programs (check_version1 .. check_version7, chosen per program group).
    Missing items are printed as ERROR lines; found versions are printed only
    when "--show-versions" is present in sys.argv.

    Parameters:
        args: accepted for interface compatibility; the function reads
            sys.argv directly and does not use it.

    Side effects:
        sets the module global "show".
    '''
    perl_modules = [
        'Getopt::Long', 'Pod::Usage', 'File::Basename', 'threads',
        'threads::shared', 'Thread::Queue', 'Carp', 'Data::Dumper', 'YAML',
        'Hash::Merge', 'Logger::Simple', 'Parallel::ForkManager', 'DBI',
        'Text::Soundex', 'Scalar::Util::Numeric', 'Tie::File', 'POSIX',
        'Storable', 'Clone', 'Bio::Perl', 'DBD::mysql', 'JSON',
        'LWP::UserAgent', 'DB_File', 'URI::Escape', 'File::Which',
        'DBD::SQLite'
    ]

    python_packages = [
        'numpy', 'pandas', 'matplotlib', 'scipy', 'scikit-learn', 'psutil',
        'natsort', 'goatools', 'seaborn', 'biopython', 'requests'
    ]

    # external programs, grouped by the original's per-group comment on how
    # each one reports its version
    group_dash_version = ['tblastn', 'makeblastdb', 'java', 'trimmomatic']  # -version
    group_ddash_version = [
        'exonerate', 'bedtools', 'bamtools', 'augustus', 'samtools', 'gmap',
        'hisat2', 'Trinity', 'tbl2asn', 'emapper.py', 'minimap2', 'mafft',
        'trimal', 'stringtie', 'salmon', 'proteinortho', 'tantan'
    ]  # --version
    group_dash_v = []  # -v
    group_version_word = ['diamond', 'ete3', 'kallisto']  # version
    group_no_version = [
        'gmes_petap.pl', 'blat', 'pslCDnaFilter', 'fasta', 'CodingQuarry',
        'snap', 'glimmerhmm'
    ]  # no version option at all
    group_dash_h = ['hmmsearch', 'hmmscan', 'tRNAscan-SE']  # -h
    group_dash_cap_v = ['signalp']  # -V

    python_version = sys.version.split(' ')[0]
    perl_version = perlVersion()
    py_results = {}
    perl_results = {}
    ext_results = {}

    # loop through lists and build dictionary of results so you can print out later
    rule = "-------------------------------------------------------"
    print(rule)
    print("Checking dependencies for %s" % lib.get_version())
    print(rule)
    global show
    show = '--show-versions' in sys.argv
    if not show:
        print(
            "To print all dependencies and versions: funannotate check --show-versions\n"
        )

    def report_found(results):
        # print "name: version" for found entries (only in show mode) and
        # return the names whose check result was falsy
        absent = []
        for name, found in natsorted(results.items()):
            if not found:
                absent.append(name)
            elif show:
                print(name + ': ' + found)
        return absent

    # ---- python packages ----
    print('You are running Python v %s. Now checking python packages...' %
          python_version)
    for pkg in python_packages:
        if pkg not in py_results:
            py_results[pkg] = checkPyModule(pkg)
    missing = report_found(py_results)
    if missing:
        for name in missing:
            print(
                '   ERROR: %s not installed, pip install %s or conda install %s'
                % (name, name, name))
    else:
        print("All %i python packages installed" % len(python_packages))
    print("\n")

    # ---- perl modules (probed before the banner, as in the original) ----
    for module in perl_modules:
        if module not in perl_results:
            perl_results[module] = checkPerlModule(module)

    print('You are running Perl v %s. Now checking perl modules...' %
          perl_version)
    missing = report_found(perl_results)
    if missing:
        for name in missing:
            print('   ERROR: %s not installed, install with cpanm %s ' %
                  (name, name))
    else:
        print("All %i Perl modules installed" % len(perl_modules))
    print("\n")

    # check ENV variables
    variables = [
        'FUNANNOTATE_DB', 'PASAHOME', 'TRINITYHOME', 'EVM_HOME',
        'AUGUSTUS_CONFIG_PATH', 'GENEMARK_PATH'
    ]
    print('Checking Environmental Variables...')
    missing = []
    for var in variables:
        shown_name = var
        value = os.environ.get(var)
        if value is None and var == 'TRINITYHOME':
            # TRINITY_HOME is accepted as an alternative spelling
            shown_name = 'TRINITY_HOME'
            value = os.environ.get(shown_name)
        if value is None:
            missing.append(var)
        elif show:
            print('$%s=%s' % (shown_name, value))
    if missing:
        for name in missing:
            print('\tERROR: %s not set. export %s=/path/to/dir' % (name, name))
    else:
        print("All %i environmental variables are set" % (len(variables)))
    print(rule)

    # ---- external programs ----
    if 'PASAHOME' not in missing:
        group_ddash_version.append(
            os.path.join(os.environ['PASAHOME'], 'Launch_PASA_pipeline.pl'))
    print('Checking external dependencies...')
    # lambdas keep the check_versionN lookups lazy: a checker name is only
    # resolved when its group actually contains a program to probe
    probe_table = (
        (group_dash_version, lambda prog: check_version1(prog)),
        (group_ddash_version, lambda prog: check_version2(prog)),
        (group_dash_v, lambda prog: check_version3(prog)),
        (group_version_word, lambda prog: check_version4(prog)),
        (group_no_version, lambda prog: check_version5(prog)),
        (group_dash_h, lambda prog: check_version6(prog)),
        (group_dash_cap_v, lambda prog: check_version7(prog)),
    )
    for group, probe in probe_table:
        for prog in group:
            if prog not in ext_results:
                ext_results[prog] = probe(prog)

    missing = []
    for name, found in natsorted(ext_results.items()):
        # a result starting with 'dyld:' is counted as missing as well
        if not found or found.startswith('dyld:'):
            missing.append(name)
        elif show:
            label = 'PASA' if 'Launch_PASA_pipeline.pl' in name else name
            print(label + ': ' + found)
    if missing:
        for name in missing:
            print('\tERROR: %s not installed' % (name))
    else:
        print("All %i external dependencies are installed\n" %
              (len(ext_results)))
예제 #5
0
파일: mask.py 프로젝트: kfuku52/funannotate
def main(args):
    '''
    Command-line entry point for "funannotate mask".

    Soft-masks repeats in a genome assembly with the selected method and then
    logs summary statistics (number of scaffolds, assembly size, number and
    percentage of lower-case bases) computed from the output FASTA file.

    Method selection:
        tantan (default)  -> runTanTan()
        otherwise:
            --repeatmodeler_lib given  -> RepeatMask() with that library,
                                          after lib.checkannotations()
                                          accepts the file
            --repeatmasker_species     -> RepeatMaskSpecies()
            neither                    -> RepeatModelMask()

    Parameters:
        args: list of command-line argument strings handed to argparse.

    Side effects:
        - removes and recreates funannotate-mask.log in the current directory
        - for the non-tantan methods, creates a mask_<uuid> working folder,
          removed at the end unless --debug is given

    Exits with status 1 when the given repeat library is rejected by
    lib.checkannotations().
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m',
                        '--method',
                        default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s',
                        '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l',
        '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats(capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))

    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        # NOTE(review): BuildDatabase/RepeatModeler are only checked for
        # --method repeatmodeler, yet RepeatModelMask() is also reached with
        # --method repeatmasker when neither a species nor a library is
        # given - confirm whether that path needs the same dependency check.
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # create tmpdir
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # parse options which dictates how repeatmodeler/masker are run
        if not args.repeatmodeler_lib:  # no fasta file given, so
            if not args.repeatmasker_species:  # no species given, so run entire repeatmodler + repeat masker
                repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
                RepeatModelMask(args.input, args.cpus, tmpdir, args.out,
                                repeats, args.repeatmasker_species, log_name)
            else:
                RepeatMaskSpecies(args.input, args.repeatmasker_species,
                                  args.cpus, tmpdir, args.out, log_name)
        else:
            if lib.checkannotations(args.repeatmodeler_lib):
                RepeatMask(args.input, args.repeatmodeler_lib, args.cpus,
                           tmpdir, args.out, log_name)
            else:
                lib.log.error(
                    'ERROR: repeat library is not a valid file: {:}'.format(
                        args.repeatmodeler_lib))
                sys.exit(1)

    # output some stats on %reads masked.
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as masked_fasta:
        for rec, Seq in SimpleFastaParser(masked_fasta):
            scaffolds += 1
            GenomeLength += len(Seq)
            maskedSize += lib.n_lower_chars(Seq)

    # an output file without any sequence would otherwise divide by zero
    if GenomeLength > 0:
        percentMask = maskedSize / float(GenomeLength)
    else:
        percentMask = 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength, maskedSize,
                percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))
    # clean up
    if not args.debug:
        if tmpdir:
            lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
예제 #6
0
def runtbl2asn_parallel(folder, template, discrepency, organism, isolate,
                        strain, parameters, version, cpus):
    '''
    Run NCBI tbl2asn in parallel and merge the per-run outputs into ``folder``.

    If ``folder`` contains sub-directories, one tbl2asn job is queued per
    sub-directory; otherwise a single job is queued for ``folder`` itself.
    After all jobs finish, the outputs found in the leaf directories are
    combined: ``*.val`` -> ``folder/genome.val``, ``errorsummary.val`` ->
    ``folder/errorsummary.val``, ``*.gbf`` -> ``folder/genome.gbf``,
    ``discrepency.report.txt`` -> ``discrepency``; ``*.tbl`` and ``*.sqn``
    files are copied up into ``folder``.

    Parameters:
        folder      -- input directory (must exist)
        template    -- value passed to tbl2asn ``-t``
        discrepency -- path of the merged discrepancy report
        organism    -- organism name for the ``-j`` meta string (required)
        isolate     -- optional isolate for the ``-j`` meta string
        strain      -- optional strain for the ``-j`` meta string
        parameters  -- optional string of extra, whitespace-separated
                       tbl2asn arguments
        version     -- value passed to tbl2asn ``-N``
        cpus        -- number of worker processes in the pool

    Returns None. Calls ``sys.exit(1)`` if ``folder`` is not a directory or
    ``organism`` is empty.
    '''
    genome_val = os.path.join(folder, 'genome.val')
    summary_val = os.path.join(folder, 'errorsummary.val')
    genome_gbf = os.path.join(folder, 'genome.gbf')

    def _same_path(first, second):
        # a merged output must never be read back into itself
        return os.path.realpath(first) == os.path.realpath(second)

    def _append(source, target_path, target_handle):
        # stream ``source`` onto the end of the already opened merge target
        if _same_path(source, target_path):
            return
        with open(source) as infile:
            shutil.copyfileobj(infile, target_handle)

    def _copy_up(source, name):
        # copy a per-run file into the top-level folder; copying a file onto
        # itself would make shutil.copyfile raise, so skip that case
        target = os.path.join(folder, name)
        if not _same_path(source, target):
            shutil.copyfile(source, target)

    # make sure output that will be appended to is not there
    for stale in [genome_val, summary_val, genome_gbf, discrepency]:
        lib.SafeRemove(stale)
    # get funannotate version
    fun_version = lib.get_version()
    # input should be a folder
    if not os.path.isdir(folder):
        lib.log.error("tbl2asn error: %s is not a directory, exiting" % folder)
        sys.exit(1)
    # based on organism, isolate, strain, construct meta info for -j flag
    if not organism:
        lib.log.error("tbl2asn error: organism not specified")
        sys.exit(1)
    meta = "[organism=" + organism + "]"
    if isolate:
        meta = meta + " " + "[isolate=" + isolate + "]"
    if strain:
        meta = meta + " " + "[strain=" + strain + "]"
    cmd = [
        'tbl2asn', '-y', '"Annotated using ' + fun_version + '"', '-N',
        str(version), '-t', template, '-M', 'n', '-j', '"' + meta + '"', '-V',
        'b', '-c', 'f', '-T', '-a', 'r10u'
    ]
    # check for custom parameters; split() (not split(' ')) so repeated
    # spaces do not inject empty arguments into the command line
    if parameters:
        cmd = cmd + parameters.split()
    # check for folders in the input folder, if present, run tbl2asn on each
    # folder and then combine
    multiple = [
        os.path.join(folder, entry) for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    ]
    if not multiple:
        multiple.append(folder)
    # tbl2asn_safe_run is defined elsewhere in this file; it receives the
    # shared command plus the directory to process
    pool = multiprocessing.Pool(cpus)
    jobs = [
        pool.apply_async(tbl2asn_safe_run, (cmd, run_dir))
        for run_dir in multiple
    ]
    pool.close()
    pool.join()
    # surface anything a worker raised instead of dropping it silently
    for run_dir, job in zip(multiple, jobs):
        try:
            job.get()
        except Exception as err:
            lib.log.error("tbl2asn error: run in %s failed: %s" %
                          (run_dir, err))
    # now collect the results and make them in the main folder
    with open(genome_val, 'a') as validation, \
            open(discrepency, 'a') as discrep, \
            open(summary_val, 'a') as summary, \
            open(genome_gbf, 'a') as genbank:
        for dirName, subdirList, fileList in os.walk(folder, topdown=False):
            # only leaf directories hold per-run output
            if subdirList:
                continue
            for f in fileList:
                source = os.path.join(dirName, f)
                # errorsummary.val must be tested before the generic .val case
                if f == 'errorsummary.val':
                    _append(source, summary_val, summary)
                elif f.endswith('.val'):
                    _append(source, genome_val, validation)
                elif f.endswith('.gbf'):
                    _append(source, genome_gbf, genbank)
                elif f.endswith('.tbl') or f.endswith('.sqn'):
                    _copy_up(source, f)
                elif f == 'discrepency.report.txt':
                    _append(source, discrepency, discrep)
예제 #7
0
def main(args):
    '''
    Add a proteome to the funannotate outgroups.

    Runs the bundled BUSCO script in protein mode on the input FASTA, keeps
    the sequences reported as 'Complete' that map to exactly one BUSCO
    model, and writes them (renamed to the BUSCO ID) to
    ``<FUNDB>/outgroups/<species>.<busco_db>_buscos.fa``.

    Parameters:
        args -- list of command line arguments (without the program name)

    Returns None. Calls ``sys.exit(1)`` when the database location is not
    configured, the BUSCO lineage folder is missing, or BUSCO produced no
    results table.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    # NOTE(review): prog says 'funannotate-predict.py' although this script
    # adds outgroups -- looks like a copy/paste leftover, confirm before
    # changing the user-facing name.
    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s',
                        '--species',
                        required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b',
        '--busco_db',
        default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c',
                        '--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d',
                        '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    # database location: command line wins, environment is the fallback
    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            # NOTE(review): this runs before lib.setupLogging(); confirm
            # lib.log is usable at this point.
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)

    parentdir = os.path.dirname(__file__)

    # get base name
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # create log file
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # check buscos, download if necessary
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    if not os.path.isdir(BUSCODB):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)

    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # check that it ran correctly
    busco_run_dir = 'run_' + species
    busco_results = os.path.join(busco_run_dir,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # map sequence ID -> BUSCO ID for sequences hit by exactly one model
    nameChange = {}
    duplicates = set()  # IDs already rejected, so a 3rd hit cannot re-add them
    # plain 'r': the legacy 'U' flag is rejected by Python 3.11+, and text
    # mode already translates newlines on Python 3
    with open(busco_results, 'r') as table:
        for line in table:
            if line.startswith('#'):
                continue
            cols = line.rstrip('\n').split('\t')
            # blank/short lines and non-Complete rows carry nothing to keep
            if len(cols) < 3 or cols[1] != 'Complete':
                continue
            busco_id, seq_id = cols[0], cols[2]
            if seq_id in nameChange or seq_id in duplicates:
                lib.log.error(
                    "Duplicate ID found: %s %s. Removing from results" %
                    (seq_id, busco_id))
                nameChange.pop(seq_id, None)
                duplicates.add(seq_id)
            else:
                nameChange[seq_id] = busco_id

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # setup output proteome
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for seq_id, busco_id in nameChange.items():
            rec = SeqRecords[seq_id]
            output.write('>%s\n%s\n' % (busco_id, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up your mess; a missing scratch folder must not turn a
    # successful run into a crash
    for leftover in (busco_run_dir, 'tmp'):
        if os.path.isdir(leftover):
            shutil.rmtree(leftover)