Exemplo n.º 1
0
def main(args):
    """Command-line entry point of sort_rename.py.

    Parses ``args`` (a list of command-line tokens), reports how many records
    the input FASTA holds, hands the real work to ``SortRenameHeaders`` and
    finally reports how many records ended up in the output file.

    Options:
        -i/--input   multi-FASTA genome file (required)
        -o/--out     output FASTA path (required)
        -b/--base    basename used for the new contig headers
        -m/--minlen  length threshold, forwarded unchanged to
                     ``SortRenameHeaders`` (argparse leaves it as a string)
    """
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        # widen the option column so long flags and their help share a line
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='sort_rename.py',
        usage="%(prog)s [options] -i genome.fa -o sorted.fa",
        description='''Script that sorts input by length and then renames contig headers.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    # (flags, keyword settings) in the order they should appear in --help
    option_table = (
        (('-i', '--input'),
         dict(required=True, help='Multi-fasta genome file')),
        (('-o', '--out'),
         dict(required=True, help='Cleaned output (FASTA)')),
        (('-b', '--base'),
         dict(default='scaffold', help='Basename of contig header')),
        (('-m', '--minlen'),
         dict(help='Contigs shorter than threshold are discarded')),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    opts = parser.parse_args(args)

    loaded = countfasta(opts.input)
    print('{:,} contigs records loaded'.format(loaded))
    print("Sorting and renaming contig headers")
    if opts.minlen:
        print("Removing contigs less than {:} bp".format(opts.minlen))
    SortRenameHeaders(opts.input, opts.base, opts.out, minlen=opts.minlen)
    saved = countfasta(opts.out)
    print('{:,} contigs saved to file'.format(saved))
Exemplo n.º 2
0
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
        # check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)


# count number of proteins to look for
total = lib.countfasta(args.proteins)
lib.log.info('Mapping {:,} proteins to genome using {:} and exonerate'.format(
    total, args.filter))

# make tmpdir (named after this process id) together with its 'failed' and
# 'scaffolds' subdirectories. Each directory is checked on its own so that a
# left-over parent directory does not leave the subdirectories missing.
tmpdir = 'p2g_' + str(os.getpid())
for _p2g_dir in (tmpdir,
                 os.path.join(tmpdir, 'failed'),
                 os.path.join(tmpdir, 'scaffolds')):
    if not os.path.isdir(_p2g_dir):
        os.makedirs(_p2g_dir)

if args.filter == 'tblastn':
    lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version))
    # check for tblastn input
    if args.tblastn:
        lib.log.info("Using pre-calculated tBLASTN result")
Exemplo n.º 3
0
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    Run genome-guided Trinity and write the aggregated transcripts to output.

    Unless shortBAM already passes lib.checkannotations, a Hisat2 index is
    built for the genome and the reads are aligned into shortBAM through the
    sam2bam.sh wrapper. Trinity is then launched on that BAM with
    --no_distributed_trinity_exec so that it only produces its list of
    per-cluster commands; these commands are run through
    lib.runMultiProgress, and the resulting FASTA files are merged with
    Trinity's GG_partitioned_trinity_aggregator.pl script.

    Relies on the module-level names tmpdir, parentdir, TRINITY, safe_run
    and find_files.

    genome    -- genome FASTA handed to hisat2-build
    readTuple -- indexable with three entries: [0] and [1] are passed to
                 hisat2 as -1/-2 when both are truthy, [2] as -U when truthy
    longReads -- optional path; added as --long_reads when it is truthy and
                 passes lib.checkannotations
    shortBAM  -- path of the BAM file to create (or reuse if present)
    output    -- path that receives the aggregated Trinity transcripts
    args      -- options object; cpus, stranded, max_intronlen, memory and
                 jaccard_clip are read from it. The False default is kept for
                 interface compatibility only: the function cannot work
                 without a real options object.
    '''
    hisat2_index = os.path.join(tmpdir, 'hisat2.genome')
    trinity_dir = os.path.join(tmpdir, 'trinity_gg')

    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        cmd = ['hisat2-build', '-p', str(args.cpus), genome, hisat2_index]
        lib.runSubprocess4(cmd, '.', lib.log)
        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
        # use half the number of threads (rounded up) for bam compression;
        # this is the value the former `(args.cpus + 2 // 2) // 2` produced
        bamthreads = (args.cpus + 1) // 2
        hisat2cmd = ['hisat2', '-p', str(args.cpus),
                     '--max-intronlen', str(args.max_intronlen),
                     '--dta', '-x', hisat2_index]
        # strandness is only passed when there are no single-end reads
        if args.stranded != 'no' and not readTuple[2]:
            hisat2cmd += ['--rna-strandness', args.stranded]
        if readTuple[0] and readTuple[1]:
            hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            hisat2cmd += ['-U', readTuple[2]]
        # the wrapper receives the whole hisat2 command as one string
        cmd = [os.path.join(parentdir, 'sam2bam.sh'), " ".join(hisat2cmd),
               str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        lib.log.info('Existig Hisat2 alignments found: {:}'.format(shortBAM))

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    cmd = ['Trinity']
    if args.stranded != 'no':
        cmd += ['--SS_lib_type', args.stranded]
    cmd += ['--no_distributed_trinity_exec',
            '--genome_guided_bam', shortBAM,
            '--genome_guided_max_intron', str(args.max_intronlen),
            '--CPU', str(args.cpus),
            '--max_memory', args.memory,
            '--output', trinity_dir]
    if args.jaccard_clip:
        cmd += ['--jaccard_clip']
    if longReads and lib.checkannotations(longReads):
        cmd += ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(trinity_dir, 'trinity_GG.cmds')

    # this will create all the Trinity commands, will now run these in parallel using multiprocessing
    # in Python (seems to be much faster than Parafly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # don't think this should be appended to every command....
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    # keep one CPU free, but never ask for fewer than one worker: with
    # cpus == 1 the old `args.cpus-1` requested a pool of zero workers
    workers = max(1, args.cpus - 1)
    lib.log.info("Assembling "+"{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % workers)
    lib.runMultiProgress(safe_run, file_list, workers)

    # collected output files and clean
    outputfiles = os.path.join(trinity_dir, 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(trinity_dir, '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    aggregator = os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts',
        'GG_partitioned_trinity_aggregator.pl'))
    cmd = ['perl', aggregator, 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))
Exemplo n.º 4
0
def main(args):
    """Command-line entry point of contig_cleaner.py.

    Parses ``args`` (a list of command-line tokens), publishes the settings
    through module-level globals read by the alignment helpers, checks each
    candidate contig for duplication via ``multithread_aligning`` and writes
    every contig that is kept to the output FASTA.

    Unless ``--exhaustive`` is given, the assembly N50 is passed to
    ``Sortbysize`` as the size cut-off for contigs to check.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='contig_cleaner.py',
        usage="%(prog)s [options] -i genome.fa -o cleaned.fa",
        description=
        '''Script that removes short scaffolds that are duplicated elsewhere.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Multi-fasta genome file')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Cleaned output (FASTA)')
    parser.add_argument('-p',
                        '--pident',
                        type=int,
                        default=95,
                        help='percent identity of contig')
    parser.add_argument('-c',
                        '--cov',
                        type=int,
                        default=95,
                        help='coverage of contig')
    parser.add_argument('-m',
                        '--minlen',
                        type=int,
                        default=500,
                        help='Minimum length of contig')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('--exhaustive',
                        action='store_true',
                        help='Compute every contig, else stop at N50')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Debug the output')
    args = parser.parse_args(args)

    # setup some global variables used in functions above
    global GENOME, CPUS, PIDENT, COV, keepers, repeats
    GENOME = args.input
    CPUS = args.cpus
    PIDENT = args.pident
    COV = args.cov
    # two independent lists; `([], ) * 2` would bind both names to one list
    keepers, repeats = [], []

    # run some checks of dependencies first
    programs = ['minimap2']
    CheckDependencies(programs)

    # calculate N50 of assembly
    n50 = calcN50(args.input)

    # now get list of scaffolds, shortest->largest
    # (False as cut-off means every contig is checked)
    size_cutoff = False if args.exhaustive else n50
    scaffolds, keepers = Sortbysize(args.input, size_cutoff,
                                    minlen=args.minlen)

    print("-----------------------------------------------")
    pass_size = len(scaffolds) + len(keepers)
    print("{:,} input contigs, {:,} larger than {:,} bp, N50 is {:,} bp".format(
        countfasta(args.input), pass_size, args.minlen, n50))
    if args.exhaustive:
        print("Checking duplication of {:,} contigs".format(len(scaffolds)))
    else:
        print("Checking duplication of {:,} contigs shorter than N50".format(
            len(scaffolds)))
    print("-----------------------------------------------")

    # now generate pool and parallel process the list
    mp_output = multithread_aligning(scaffolds)

    # each result is (contig name, flag); a truthy flag marks a duplicate
    for contig, is_duplicate in mp_output:
        if is_duplicate:
            repeats.append(contig)
        else:
            keepers.append(contig)

    print("-----------------------------------------------")
    print(
        "{:,} input contigs; {:,} larger than {:} bp; {:,} duplicated; {:,} written to file"
        .format(countfasta(args.input), pass_size, args.minlen, len(repeats),
                len(keepers)))
    if args.debug:
        print("\nDuplicated contigs are:\n{:}\n".format(', '.join(repeats)))
        print("Contigs to keep are:\n{:}\n".format(', '.join(keepers)))

    # finally write a new reference based on list of keepers; a set gives
    # constant-time membership tests instead of scanning the list per record
    keep_titles = set(keepers)
    with open(args.out, 'w') as fasta_out:
        with open(args.input, 'r') as fasta_in:
            for title, sequence in SimpleFastaParser(fasta_in):
                if title in keep_titles:
                    fasta_out.write('>{}\n{}\n'.format(
                        title, softwrap(sequence)))
Exemplo n.º 5
0
def main(args):
    """Add the BUSCO-complete proteins of a proteome to the outgroups folder.

    Parses ``args`` (a list of command-line tokens), locates the funannotate
    database (``--database`` or ``$FUNANNOTATE_DB``), runs the bundled BUSCO
    script on the input proteome, and writes every protein reported as
    'Complete' exactly once to ``<db>/outgroups/<species>.<busco_db>_buscos.fa``
    using the BUSCO model name as the FASTA header. The BUSCO run directory
    and the ``tmp`` directory in the working directory are removed afterwards.

    Exits with status 1 when the database is not configured, the BUSCO
    lineage folder is missing, or BUSCO produced no result table.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    # NOTE(review): prog names the predict script although the description
    # and log file name refer to adding outgroups - confirm before changing
    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s',
                        '--species',
                        required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b',
        '--busco_db',
        default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c',
                        '--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d',
                        '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            # NOTE(review): this runs before lib.setupLogging below -
            # confirm that lib.log is usable at this point
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)

    parentdir = os.path.dirname(__file__)

    # get base name
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # create log file
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # check buscos, download if necessary
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    if not os.path.isdir(BUSCODB):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)

    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # check that it ran correctly
    run_dir = 'run_' + species
    busco_results = os.path.join(run_dir, 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # map protein ID -> BUSCO model for every protein that is 'Complete' for
    # exactly one model; IDs seen more than once are dropped for good
    nameChange = {}
    duplicated = set()
    # plain 'r': the 'U' mode flag is rejected by Python 3.11 and later
    with open(busco_results, 'r') as busco_table:
        for line in busco_table:
            if line.startswith('#'):
                continue
            cols = line.rstrip('\n').split('\t')
            # blank rows and rows without a sequence column carry no hit
            if len(cols) < 3 or cols[1] != 'Complete':
                continue
            busco_id, seq_id = cols[0], cols[2]
            if seq_id not in nameChange and seq_id not in duplicated:
                nameChange[seq_id] = busco_id
                continue
            lib.log.error(
                "Duplicate ID found: %s %s. Removing from results" %
                (seq_id, busco_id))
            # remember the ID so a third occurrence cannot re-add it
            duplicated.add(seq_id)
            nameChange.pop(seq_id, None)

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # setup output proteome
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for seq_id, busco_id in nameChange.items():
            rec = SeqRecords[seq_id]
            output.write('>%s\n%s\n' % (busco_id, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up your mess; skip anything that is not there so a missing
    # scratch directory does not turn a successful run into a crash
    for leftover in (run_dir, 'tmp'):
        if os.path.isdir(leftover):
            shutil.rmtree(leftover)