Пример #1
0
def Fzip_inplace(input, cpus):
    '''
    compress input with the first available tool, in order of preference:
    pigz -> bgzip -> gzip. cpus is passed as the thread count to pigz/bgzip.
    '''
    #plain gzip (no thread option) is used when neither parallel tool is found
    zip_cmd = ['gzip', '-f', input]
    #(executable, thread-count flag) pairs, most preferred first
    for tool, thread_flag in (('pigz', '-p'), ('bgzip', '-@')):
        if lib.which(tool):
            zip_cmd = [tool, '-f', thread_flag, str(cpus), input]
            break
    try:
        lib.runSubprocess(zip_cmd, '.', lib.log)
    except NameError:
        #a NameError from the logged runner falls back to a bare subprocess call
        subprocess.call(zip_cmd)
Пример #2
0
def choose_aligner():
    '''
    return the name of the aligner used to map reads to transcripts when
    determining the orientation of the trinity transcripts.
    preference order is rapmap -> bowtie2 -> hisat2; hisat2 is returned
    without being looked up when neither of the first two is found
    (it is probably not ideal for this job, but should work okay).
    '''
    for candidate in ('rapmap', 'bowtie2'):
        if lib.which(candidate):
            return candidate
    return 'hisat2'
Пример #3
0
#Phobius predicts secreted and membrane proteins; the helper script is launched only when
#lib.checkannotations(phobius_out) is false (presumably: no existing results -- confirm in lib)
annotate_misc = os.path.join(outputdir, 'annotate_misc')
phobius_out = os.path.join(annotate_misc, 'phobius.results.txt')
phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
lib.log.info("Predicting secreted and transmembrane proteins using Phobius")
if not lib.checkannotations(phobius_out):
    phobius_script = os.path.join(parentdir, 'util', 'phobius-multiproc.py')
    subprocess.call([phobius_script, '-i', Proteins, '-o', phobius_out,
                     '-e', args.email, '-l', phobiusLog])
#SignalP has to be installed manually, so look for it first; the Phobius results are then
#parsed together with the SignalP results, or with False in their place when it is missing
signalp_out = os.path.join(annotate_misc, 'signalp.results.txt')
secreted_out = os.path.join(annotate_misc, 'annotations.secretome.txt')
membrane_out = os.path.join(annotate_misc, 'annotations.transmembrane.txt')
if not lib.which('signalp'):
    lib.log.info(
        "SignalP not installed, secretome prediction less accurate using only Phobius"
    )
    signalp_results = False
else:
    lib.log.info("Predicting secreted proteins with SignalP")
    if not lib.checkannotations(signalp_out):
        lib.signalP(Proteins, annotate_misc, signalp_out)
    signalp_results = signalp_out
lib.parsePhobiusSignalP(phobius_out, signalp_results, membrane_out,
                        secreted_out)
#report how many lines each annotation table holds
num_secreted = lib.line_count(secreted_out)
num_mem = lib.line_count(membrane_out)
secretome_msg = ''.join([
    '{0:,}'.format(num_secreted), ' secretome and ',
    '{0:,}'.format(num_mem), ' transmembane annotations added'
])
lib.log.info(secretome_msg)
#BUSCO OGS search against the model set named by args.busco_db
busco_out = os.path.join(annotate_misc, 'annotations.busco.txt')
lib.log.info("Annotating proteins with BUSCO %s models" % args.busco_db)
buscoDB = os.path.join(parentdir, 'DB', args.busco_db)
if not lib.checkannotations(busco_out):
    lib.runBUSCO(Proteins, buscoDB, args.cpus, annotate_misc, busco_out)
num_annotations = lib.line_count(busco_out)
busco_msg = '{0:,}'.format(num_annotations) + ' annotations added'
lib.log.info(busco_msg)

#Phobius results come from one of three places, checked in this order:
#  1. a file supplied by the user (args.phobius)
#  2. a local run, when phobius.pl is found and lib.checkannotations(phobius_out) is false
#  3. a results file already sitting at the default location
#when none applies the step is skipped and the user is pointed to funannotate remote
phobius_out = os.path.join(outputdir, 'annotate_misc', 'phobius.results.txt')
phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
if args.phobius:
    phobius_out = args.phobius
elif lib.which('phobius.pl'):
    if not lib.checkannotations(phobius_out):
        lib.log.info("Predicting secreted and transmembrane proteins using Phobius")
        phobius_script = os.path.join(parentdir, 'util', 'phobius-multiproc.py')
        subprocess.call([phobius_script, '-i', Proteins,
                         '-o', phobius_out, '-l', phobiusLog])
elif lib.checkannotations(phobius_out):
    lib.log.info("Found phobius pre-computed results")
else:
    lib.log.info("Skipping phobius predictions, try funannotate remote -m phobius")
#SignalP has to be installed manually, so its presence is tested with lib.which before it is used
#output paths under <outputdir>/annotate_misc: raw SignalP results plus the
#secretome and transmembrane annotation tables
signalp_out = os.path.join(outputdir, 'annotate_misc', 'signalp.results.txt')
secreted_out = os.path.join(outputdir, 'annotate_misc', 'annotations.secretome.txt')
membrane_out = os.path.join(outputdir, 'annotate_misc', 'annotations.transmembrane.txt')
if lib.which('signalp'):
    lib.log.info("Predicting secreted proteins with SignalP")
    #NOTE(review): the body of the following check is not visible in this excerpt
    if not lib.checkannotations(signalp_out):
#log cmd_args at debug level (cmd_args is defined outside this excerpt)
lib.log.debug(cmd_args)

#name of the working directory for the split fasta files and output files; the process id keeps it unique per run
#NOTE(review): nothing here creates the directory -- presumably lib.splitFASTA does; confirm
TMPDIR = 'phobius_' + str(os.getpid())

#split the input fasta (args.input) into TMPDIR
lib.splitFASTA(args.input, TMPDIR)

#now get list of files in tmpdir: bare file names (no directory part) ending in '.fa'
proteins = []
for file in os.listdir(TMPDIR):
    if file.endswith('.fa'):
        proteins.append(file)

#now run the script: the local worker when lib.which finds phobius.pl, otherwise the remote worker
#NOTE(review): the third argument is presumably the number of parallel jobs -- confirm in lib.runMultiProgress
if lib.which('phobius.pl'):
    lib.runMultiProgress(runPhobiusLocal, proteins, multiprocessing.cpu_count())
else:
    lib.runMultiProgress(runPhobiusRemote, proteins, 29) #max is 30 jobs at a time

#collect all results: paths (TMPDIR joined with the name) of the files ending in '.phobius'
phobius = []
for file in os.listdir(TMPDIR):
    if file.endswith('.phobius'):
        phobius.append(os.path.join(TMPDIR,file))

#write output: both counters start at zero and the tab-separated header row goes to args.out
#NOTE(review): the rest of the writing code is not visible in this excerpt
TMdomain = 0
SigPep = 0
with open(args.out, 'w') as output:
    output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction'))
#EggNog 4.5 search: the HMM database and its annotation table are both named after args.eggnog_db
#(eggnog_out is defined outside this excerpt)
lib.log.info("Annotating proteins with EggNog 4.5 database")
if not lib.checkannotations(eggnog_out):
    eggnog_hmm = os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')
    eggnog_tsv = os.path.join(parentdir, 'DB', args.eggnog_db+'.annotations.tsv')
    lib.runEggNog(Proteins, eggnog_hmm, eggnog_tsv, args.cpus, 1e-10,
                  os.path.join(outputdir, 'annotate_misc'), eggnog_out)
num_annotations = lib.line_count(eggnog_out)
eggnog_msg = '{0:,}'.format(num_annotations) + ' annotations added'
lib.log.info(eggnog_msg)
#BUSCO OGS search against the model set named by args.busco_db
busco_out = os.path.join(outputdir, 'annotate_misc', 'annotations.busco.txt')
lib.log.info("Annotating proteins with BUSCO %s models" % args.busco_db)
buscoDB = os.path.join(parentdir, 'DB', args.busco_db)
if not lib.checkannotations(busco_out):
    lib.runBUSCO(Proteins, buscoDB, args.cpus,
                 os.path.join(outputdir, 'annotate_misc'), busco_out)
num_annotations = lib.line_count(busco_out)
busco_msg = '{0:,}'.format(num_annotations) + ' annotations added'
lib.log.info(busco_msg)
#SignalP has to be installed manually, so it is only run when lib.which finds it
signalp_out = os.path.join(outputdir, 'annotate_misc', 'annotations.signalp.txt')
if not lib.which('signalp'):
    lib.log.info("SignalP not installed, skipping")
else:
    lib.log.info("Predicting secreted proteins with SignalP")
    if not lib.checkannotations(signalp_out):
        lib.signalP(Proteins, os.path.join(outputdir, 'annotate_misc'),
                    signalp_out)
    num_annotations = lib.line_count(signalp_out)
    signalp_msg = '{0:,}'.format(num_annotations) + ' annotations added'
    lib.log.info(signalp_msg)

#InterProScan set-up is entered only when args.skip_iprscan and args.iprscan are both false
#NOTE(review): args.iprscan presumably carries pre-computed results -- confirm against the argument parser
if not args.skip_iprscan:
    if not args.iprscan:
        #run interpro scan: folders for its output and for temporary protein files, both under annotate_misc
        IPROUT = os.path.join(outputdir, 'annotate_misc', 'iprscan')
        PROTS = os.path.join(outputdir, 'annotate_misc', 'protein_tmp')
        #NOTE(review): excerpt ends inside this loop; presumably a missing folder is created here -- confirm
        for i in IPROUT,PROTS:
            if not os.path.exists(i):
Пример #7
0
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname,
                 output):
    '''
    function will run PASA align assembly and then choose best gene models for training

    Arguments:
        genome      -- genome FASTA, handed to PASA as an absolute path (-g)
        transcripts -- transcript FASTA, handed to PASA as an absolute path (-t)
        stranded    -- any value other than 'no' adds --transcribed_is_aligned_orient
        intronlen   -- value for PASA --MAX_INTRON_LENGTH
        cpus        -- total CPUs available; PASA gets half of them (2 when cpus <= 2)
        dbname      -- PASA database name; every '-' is replaced with '_'
        output      -- path the final training GFF3 is copied to

    Relies on the module-level names tmpdir, PASA and args
    (args.pasa_alignment_overlap). Nothing is returned; the result is the
    file written to output.
    '''
    #floor division keeps the CPU count a whole number; true division would
    #put a float such as '4.0' on the PASA command line under Python 3
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    #file names used below, all derived from the database name
    pasaDBname = dbname.replace('-', '_')
    assemblies_fasta = pasaDBname + '.assemblies.fasta'
    assemblies_gff3 = pasaDBname + '.pasa_assemblies.gff3'
    assemblies_path = os.path.join(folder, assemblies_fasta)

    #get config files and edit: copy the template, filling in the database name
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    template_path = os.path.join(PASA, 'pasa_conf',
                                 'pasa.alignAssembly.Template.txt')
    #plain 'r' mode: the 'U' flag is deprecated and rejected by Python 3.11+
    with open(alignConfig, 'w') as config1, \
            open(template_path, 'r') as template1:
        for line in template1:
            config1.write(line.replace('<__MYSQLDB__>', pasaDBname))

    if not os.path.isfile(assemblies_path):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            #every argv entry must be a string, whatever type args holds
            str(args.pasa_alignment_overlap), '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        #an assemblies file from an earlier run is reused instead of re-running PASA
        lib.log.info('Existing PASA assemblies found {:}'.format(
            assemblies_path))
    #generate TSV gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, assemblies_gff3),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(assemblies_path)
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, assemblies_fasta + '.transdecoder.genome.gff3')
    #the commands below use bare file names because they are run with folder
    #passed to lib.runSubprocess (presumably as the working directory)
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        cmd = [
            'TransDecoder.LongOrfs', '-t', assemblies_fasta,
            '--gene_trans_map', 'pasa.gene2transcripts.tsv'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', assemblies_fasta,
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        #convert the transcript-level ORFs to genome coordinates; the output
        #path is handed to runSubprocess2 as its fourth argument
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            assemblies_fasta + '.transdecoder.gff3',
            assemblies_gff3,
            assemblies_fasta
        ]
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        #without a standalone TransDecoder, use the training-set script shipped with PASA
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', assemblies_fasta,
            '--pasa_transcripts_gff3', assemblies_gff3
        ]
        lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)