Example #1
0
# create tmpdir to store fasta files and output files
# unique UUID suffix avoids collisions when several runs share a working dir
TMPDIR = 'phobius_' + str(uuid.uuid4())

# split fasta
# lib.splitFASTA writes one .fa file per input record into TMPDIR
lib.splitFASTA(args.input, TMPDIR)

# now get list of files in tmpdir
proteins = []
for file in os.listdir(TMPDIR):
    if file.endswith('.fa'):
        proteins.append(file)

# now run the script
# prefer a local phobius.pl on PATH; otherwise fall back to the remote
# web service with a lower job cap
if lib.which('phobius.pl'):
    lib.runMultiProgress(runPhobiusLocal, proteins,
                         multiprocessing.cpu_count())
else:
    lib.runMultiProgress(runPhobiusRemote, proteins,
                         29)  # max is 30 jobs at a time

# collect all results
phobius = []
for file in os.listdir(TMPDIR):
    if file.endswith('.phobius'):
        phobius.append(os.path.join(TMPDIR, file))

# write output
# counters for transmembrane-domain and signal-peptide predictions
# (incremented in the per-protein loop that follows this chunk)
TMdomain = 0
SigPep = 0
with open(args.out, 'w') as output:
    # tab-delimited header row; per-protein rows are written below this chunk
    output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction'))
Example #2
0
                    # (continuation of a per-record loop whose start is above this chunk)
                    SeqIO.write(record, output, 'fasta')
        else:
            # write each remaining scaffold to its own FASTA file and record
            # its name for the per-scaffold Augustus runs below
            name = str(record.id)
            scaffolds.append(name)
            outputfile = os.path.join(tmpdir, name + '.fa')
            with open(outputfile, 'w') as output:
                SeqIO.write(record, output, 'fasta')

# now loop through each scaffold running augustus
# never request more workers than there are scaffolds
if args.cpus > len(scaffolds):
    num = len(scaffolds)
else:
    num = args.cpus
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)

lib.log.debug("Augustus prediction is finished, now concatenating results")
# concatenate the per-scaffold GFF3 outputs into one combined file,
# in the same order the scaffolds were collected
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())

if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')):
    lib.log.debug('Augustus finished, now joining results')
# locate AUGUSTUS's join_aug_pred.pl: PATH first, then AUGUSTUS_BASE/scripts
if lib.which_path('join_aug_pred.pl'):
    join_script = 'join_aug_pred.pl'
else:
    join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
Example #3
0
            '-e',
            os.path.join(outputDir, os.path.basename(args.transcripts))
        ]
    if args.repeats:
        # optional repeat annotations, localized into the partition directory
        cmd += [
            '--repeats',
            os.path.join(outputDir, os.path.basename(args.repeats))
        ]
    # per-partition output file and log appended last
    cmd += [
        os.path.join(outputDir, 'evm.out'),
        os.path.join(outputDir, 'evm.out.log')
    ]
    file_list.append(cmd)

# run each partition's EVM command in parallel
lib.runMultiProgress(safe_run, file_list, num_workers, progress=args.progress)

# now combine the partitions
cmd4 = [
    perl, Combine, '--partitions',
    os.path.basename(partitions), '--output_file_name', 'evm.out'
]
lib.runSubprocess(cmd4, tmpdir, lib.log)

# now convert to GFF3
cmd5 = [
    perl, Convert, '--partitions',
    os.path.basename(partitions), '--output', 'evm.out', '--genome',
    os.path.abspath(args.fasta)
]
lib.runSubprocess(cmd5, tmpdir, lib.log)
Example #4
0
lib.log.info('Found {0:,}'.format(len(Hits)) +
             ' preliminary alignments --> aligning with exonerate')

# index the genome and proteins
# do index here in case memory problems?
# SeqIO.index keeps records on disk and loads sequences lazily by ID
protein_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta')

# split genome fasta into individual scaffolds
# NOTE(review): 'rU' mode is deprecated (removed in Python 3.11); plain 'r'
# would be equivalent here
with open(os.path.abspath(args.genome), 'rU') as input:
    for record in SeqIO.parse(input, "fasta"):
        SeqIO.write(record, os.path.join(tmpdir, 'scaffolds',
                                         record.id + ".fa"), "fasta")

# run multiprocessing exonerate
lib.runMultiProgress(runExonerate, Hits, args.cpus)

# now need to loop through and offset exonerate predictions back to whole scaffolds
exonerate_raw = os.path.join(tmpdir, 'exonerate.out.combined')
with open(exonerate_raw, 'w') as output:
    for file in os.listdir(tmpdir):
        if file.endswith('.out'):
            with open(os.path.join(tmpdir, file), 'rU') as exoresult:
                # filenames encode the scaffold offset after '__'
                # (presumably written by runExonerate; confirm against it)
                offset = int(file.split('__')[1])
                # skip the first 3 header lines of each exonerate result
                for line in itertools.islice(exoresult, 3, None):
                    if line.startswith('#') or line.startswith(
                            'Average') or line.startswith('-- completed'):
                        # pass comment/summary lines through unchanged
                        output.write(line)
                    else:
                        cols = line.split('\t')
                        # shift GFF start coordinate back to whole-scaffold space
                        # (loop body continues past this chunk)
                        cols[3] = str(int(cols[3]) + offset)
Example #5
0
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    Run genome-guided Trinity.

    First aligns reads to the genome with hisat2 (skipped when shortBAM
    already exists), then hands the BAM to Trinity to generate per-cluster
    assembly commands, runs those commands in parallel with multiprocessing,
    and finally aggregates all cluster FASTAs into a single transcript file.

    genome    -- genome FASTA path used to build the hisat2 index
    readTuple -- (forward, reverse, single) read files; elements may be falsy
    longReads -- optional long-read file passed to Trinity via --long_reads
    shortBAM  -- sorted BAM produced by (or reused from) the hisat2 step
    output    -- destination for the aggregated Trinity transcripts
    args      -- parsed CLI options (cpus, memory, stranded, max_intronlen,
                 jaccard_clip); required in practice despite the False default
    '''
    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        cmd = ['hisat2-build', '-p',
               str(args.cpus), genome, os.path.join(tmpdir, 'hisat2.genome')]
        lib.runSubprocess4(cmd, '.', lib.log)
        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
        # use half number of threads for bam compression threads
        # NOTE(review): 2 // 2 binds first, so this is (args.cpus + 1) // 2,
        # i.e. ceil(cpus / 2) -- confirm whether (args.cpus + 2) // 2 was meant
        bamthreads = (args.cpus + 2 // 2) // 2
        if args.stranded != 'no' and not readTuple[2]:
            # stranded library with no single-end reads: pass strandness
            hisat2cmd = ['hisat2', '-p', str(args.cpus), '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome'), '--rna-strandness', args.stranded]
        else:
            hisat2cmd = ['hisat2', '-p', str(args.cpus), '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome')]
        if readTuple[0] and readTuple[1]:
            hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
        cmd = [os.path.join(parentdir, 'sam2bam.sh'), " ".join(
            hisat2cmd), str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        # typo fix: 'Existig' -> 'Existing'
        lib.log.info('Existing Hisat2 alignments found: {:}'.format(shortBAM))

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    # --no_distributed_trinity_exec: only emit the per-cluster commands;
    # they are executed below with multiprocessing instead of by Trinity
    if args.stranded != 'no':
        cmd = ['Trinity', '--SS_lib_type', args.stranded, '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM, '--genome_guided_max_intron', str(
                   args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory, '--output', os.path.join(tmpdir, 'trinity_gg')]
    else:
        cmd = ['Trinity', '--no_distributed_trinity_exec', '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(
                   args.max_intronlen), '--CPU', str(args.cpus),
               '--max_memory', args.memory, '--output', os.path.join(tmpdir, 'trinity_gg')]
    cmd = cmd + jaccard_clip
    if longReads and lib.checkannotations(longReads):
        cmd = cmd + ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')

    # this will create all the Trinity commands, will now run these in parallel using multiprocessing
    # in Python (seems to be much faster than Parafly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # don't think this should be appended to every command....
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling "+"{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus-1))
    # reserve one CPU for the parent/progress process
    lib.runMultiProgress(safe_run, file_list, args.cpus-1)

    # collect the per-cluster output files for the aggregator script
    outputfiles = os.path.join(
        tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'), '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    cmd = ['perl', os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts', 'GG_partitioned_trinity_aggregator.pl')), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))
Example #6
0
                    # (continuation of a per-record loop whose start is above this chunk)
                    SeqIO.write(record, output, 'fasta')
        else:
            # write each remaining scaffold to its own FASTA file and record
            # its name for the per-scaffold Augustus runs below
            name = str(record.id)
            scaffolds.append(name)
            outputfile = os.path.join(tmpdir, name + '.fa')
            with open(outputfile, 'w') as output:
                SeqIO.write(record, output, 'fasta')

# now loop through each scaffold running augustus
# never request more workers than there are scaffolds
if args.cpus > len(scaffolds):
    num = len(scaffolds)
else:
    num = args.cpus
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num, progress=args.progress)

lib.log.debug("Augustus prediction is finished, now concatenating results")
# concatenate the per-scaffold GFF3 outputs into one combined file,
# in the same order the scaffolds were collected
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())

if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')):
    lib.log.debug('Augustus finished, now joining results')
# locate AUGUSTUS's join_aug_pred.pl: PATH first, then AUGUSTUS_BASE/scripts
if lib.which_path('join_aug_pred.pl'):
    join_script = 'join_aug_pred.pl'
else:
    join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
            '-e',
            os.path.join(outputDir, os.path.basename(args.transcripts))
        ]
    if args.repeats:
        # optional repeat annotations, localized into the partition directory
        cmd += [
            '--repeats',
            os.path.join(outputDir, os.path.basename(args.repeats))
        ]
    # per-partition output file and log appended last
    cmd += [
        os.path.join(outputDir, 'evm.out'),
        os.path.join(outputDir, 'evm.out.log')
    ]
    file_list.append(cmd)

# run each partition's EVM command in parallel
lib.runMultiProgress(safe_run, file_list, num_workers)

# now combine the partitions
cmd4 = [
    perl, Combine, '--partitions',
    os.path.basename(partitions), '--output_file_name', 'evm.out'
]
lib.runSubprocess(cmd4, tmpdir, lib.log)

# now convert to GFF3
cmd5 = [
    perl, Convert, '--partitions',
    os.path.basename(partitions), '--output', 'evm.out', '--genome',
    os.path.abspath(args.fasta)
]
lib.runSubprocess(cmd5, tmpdir, lib.log)