Example #1
def runTrimmomaticSE(reads):
    '''
    function is a wrapper for the Trimmomatic bundled with Trinity
    '''
    #create tmpdir
    folder = os.path.join(tmpdir, 'trimmomatic')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    lib.log.info("Adapter and Quality trimming SE reads with Trimmomatic")
    output = os.path.join(folder, 'trimmed_single.fastq')
    TRIMMOMATIC_DIR = os.path.join(TRINITY, 'trinity-plugins',
                                   'Trimmomatic-0.36')
    cmd = [
        'java', '-jar',
        os.path.join(TRIMMOMATIC_DIR, 'trimmomatic.jar'), 'SE', '-threads',
        str(args.cpus), '-phred33', reads, output, 'ILLUMINACLIP:' +
        os.path.join(TRIMMOMATIC_DIR, 'adapters', 'TruSeq3-SE.fa') +
        ':2:30:10', 'SLIDINGWINDOW:4:5', 'LEADING:5', 'TRAILING:5', 'MINLEN:25'
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    Fzip_inplace(output, args.cpus)
    trim_single = os.path.join(folder, 'trimmed_single.fastq.gz')
    return trim_single
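
Both trimming wrappers compress their outputs with Fzip_inplace, a helper not shown on this page. A minimal sketch, assuming pigz is preferred for parallel compression with plain gzip as the fallback:

import subprocess
from distutils.spawn import find_executable

def Fzip_inplace(input, cpus):
    #compress the file in place; pigz uses multiple threads, gzip is serial
    if find_executable('pigz'):
        cmd = ['pigz', '-f', '-p', str(cpus), input]
    else:
        cmd = ['gzip', '-f', input]
    subprocess.call(cmd)
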
Example #2
def runTrimmomaticPE(left, right):
    '''
    function is a wrapper for the Trimmomatic bundled with Trinity
    '''
    #create tmpdir
    folder = os.path.join(tmpdir, 'trimmomatic')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    lib.log.info("Adapter and Quality trimming PE reads with Trimmomatic")
    left_paired = os.path.join(folder, 'trimmed_left.fastq')
    left_single = os.path.join(folder, 'trimmed_left.unpaired.fastq')
    right_paired = os.path.join(folder, 'trimmed_right.fastq')
    right_single = os.path.join(folder, 'trimmed_right.unpaired.fastq')
    TRIMMOMATIC_DIR = os.path.join(TRINITY, 'trinity-plugins', 'Trimmomatic-0.36')
    cmd = ['java', '-jar', os.path.join(TRIMMOMATIC_DIR, 'trimmomatic.jar'), 'PE', '-threads', str(args.cpus), '-phred33', 
            left, right, left_paired, left_single, right_paired, right_single,
            'ILLUMINACLIP:'+os.path.join(TRIMMOMATIC_DIR,'adapters','TruSeq3-PE.fa')+':2:30:10', 'SLIDINGWINDOW:4:5', 'LEADING:5', 'TRAILING:5', 'MINLEN:25']
    lib.runSubprocess(cmd, '.', lib.log)
    for x in [left_paired, left_single, right_paired, right_single]:
        Fzip_inplace(x, args.cpus)
    trim_left = os.path.join(folder, 'trimmed_left.fastq.gz')
    trim_right = os.path.join(folder, 'trimmed_right.fastq.gz')
    return trim_left, trim_right
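
A hypothetical invocation of the two wrappers, assuming the module-level tmpdir, args, and TRINITY globals they rely on are already configured:

#hypothetical input files; each wrapper writes into <tmpdir>/trimmomatic
#and returns the path(s) to the gzipped, trimmed reads
trim_single = runTrimmomaticSE('raw_single.fastq')
trim_left, trim_right = runTrimmomaticPE('raw_R1.fastq', 'raw_R2.fastq')
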
Example #3
def pfamDB(info, force=False):
    hmm = os.path.join(FUNDB, 'Pfam-A.hmm')
    familyinfo = os.path.join(FUNDB, 'Pfam-A.clans.tsv')
    versionfile = os.path.join(FUNDB, 'Pfam.version')
    if os.path.isfile(hmm) and args.update and not force:
        if check4newDB('pfam-log', info):
            force = True
    if not os.path.isfile(hmm) or force:
        lib.log.info('Downloading Pfam database')
        download(lib.DBURL.get('pfam'), hmm + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam-A.hmm.gz'],
                        cwd=os.path.join(FUNDB))
        download(lib.DBURL.get('pfam-tsv'), familyinfo + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam-A.clans.tsv.gz'],
                        cwd=os.path.join(FUNDB))
        download(lib.DBURL.get('pfam-log'), versionfile + '.gz')
        md5 = calcmd5(versionfile + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam.version.gz'],
                        cwd=os.path.join(FUNDB))
        num_records = 0
        pfamdate = ''
        pfamvers = ''
        with open(versionfile, 'rU') as input:
            for line in input:
                if line.startswith('Pfam release'):
                    pfamvers = line.split(': ')[-1].rstrip()
                if line.startswith('Pfam-A families'):
                    num_records = int(line.split(': ')[-1].rstrip())
                if line.startswith('Date'):
                    pfamdate = line.split(': ')[-1].rstrip()
        lib.log.info('Creating Pfam HMM database')
        cmd = ['hmmpress', 'Pfam-A.hmm']
        lib.runSubprocess(cmd, os.path.join(FUNDB), lib.log)
        info['pfam'] = ('hmmer3', hmm, pfamvers, pfamdate, num_records, md5)
    type, name, version, date, records, checksum = info.get('pfam')
    lib.log.info('Pfam Database: version={:} date={:} records={:,}'.format(
        version, date, records))
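
calcmd5 is referenced above but not shown. A minimal sketch that hashes the file in fixed-size chunks so large downloads are never read into memory at once:

import hashlib

def calcmd5(file):
    #stream the file in 8 kb blocks and return the hex md5 digest
    md5local = hashlib.md5()
    with open(file, 'rb') as infile:
        for chunk in iter(lambda: infile.read(8192), b''):
            md5local.update(chunk)
    return md5local.hexdigest()
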
Example #4
def dbCANDB(info, force=False):
    hmm = os.path.join(FUNDB, 'dbCAN.hmm')
    familyinfo = os.path.join(FUNDB, 'dbCAN-fam-HMMs.txt')
    versionfile = os.path.join(FUNDB, 'dbCAN.changelog.txt')
    if os.path.isfile(hmm) and args.update and not force:
        if check4newDB('dbCAN', info):
            force = True
    if not os.path.isfile(hmm) or force:
        lib.log.info('Downloading dbCAN database')
        download(lib.DBURL.get('dbCAN'), os.path.join(FUNDB, 'dbCAN.tmp'))
        md5 = calcmd5(os.path.join(FUNDB, 'dbCAN.tmp'))
        download(lib.DBURL.get('dbCAN-tsv'), familyinfo)
        download(lib.DBURL.get('dbCAN-log'), versionfile)
        num_records = 0
        dbdate = ''
        dbvers = ''
        with open(hmm, 'w') as out:
            with open(os.path.join(FUNDB, 'dbCAN.tmp'), 'rU') as input:
                for line in input:
                    if line.startswith('NAME'):
                        num_records += 1
                        line = line.replace('.hmm\n', '\n')
                    out.write(line)
        with open(versionfile, 'rU') as infile:
            head = [next(infile) for x in xrange(2)]
        dbdate = head[1].replace('# ', '').rstrip()
        dbvers = head[0].split(' ')[-1].rstrip()
        dbdate = datetime.datetime.strptime(dbdate,
                                            "%m/%d/%Y").strftime("%Y-%m-%d")
        lib.log.info('Creating dbCAN HMM database')
        cmd = ['hmmpress', 'dbCAN.hmm']
        lib.runSubprocess(cmd, os.path.join(FUNDB), lib.log)
        info['dbCAN'] = ('hmmer3', hmm, dbvers, dbdate, num_records, md5)
        os.remove(os.path.join(FUNDB, 'dbCAN.tmp'))
    type, name, version, date, records, checksum = info.get('dbCAN')
    lib.log.info('dbCAN Database: version={:} date={:} records={:,}'.format(
        version, date, records))
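
The download helper used by both database functions is likewise assumed. A minimal Python 2 sketch (matching the xrange/'rU' idioms elsewhere on this page) that streams the URL to disk:

import urllib2  #Python 2; use urllib.request under Python 3

def download(url, name):
    #fetch the remote file in 1 MB blocks and write it to name
    handle = urllib2.urlopen(url)
    with open(name, 'wb') as out:
        while True:
            block = handle.read(1024 * 1024)
            if not block:
                break
            out.write(block)
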
Example #5
def runtblastn(input, query, cpus, output, maxhits):
    #start by formatting blast db/dustmasker filtered format
    cmd = [
        'dustmasker', '-in', input, '-infmt', 'fasta', '-parse_seqids',
        '-outfmt', 'maskinfo_asn1_bin', '-out', 'genome_dust.asnb'
    ]
    lib.runSubprocess(cmd, output, lib.log)
    cmd = [
        'makeblastdb', '-in', input, '-dbtype', 'nucl', '-parse_seqids',
        '-mask_data', 'genome_dust.asnb', '-out', 'genome'
    ]
    lib.runSubprocess(cmd, output, lib.log)
    cmd = [
        'tblastn', '-num_threads',
        str(cpus), '-db', 'genome', '-query', query, '-max_target_seqs',
        str(maxhits), '-db_soft_mask', '11', '-threshold', '999',
        '-max_intron_length',
        str(args.maxintron), '-evalue', '1e-10', '-outfmt', '6', '-out',
        'filter.tblastn.tab'
    ]
    lib.runSubprocess(cmd, output, lib.log)
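
Every example shells out through lib.runSubprocess(cmd, dir, logger). A plausible minimal sketch, assuming it logs the command and captures stdout/stderr into the debug log; judging from the call sites, the numbered variants (runSubprocess2 through runSubprocess5) presumably differ only in whether stdin/stdout are redirected to files:

import subprocess

def runSubprocess(cmd, dir, logfile):
    #log the command, run it in dir, and send its output to the debug log
    logfile.debug(' '.join(cmd))
    proc = subprocess.Popen(cmd, cwd=dir, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if stdout:
        logfile.debug(stdout)
    if stderr:
        logfile.debug(stderr)
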
Example #6
def worker(input):
    #run one command file, logging stdout/stderr; perl and Execute are
    #module-level settings defined outside this excerpt
    logfile = input + '.log'
    with open(logfile, 'w') as output:
        subprocess.call([perl, Execute, input], stdout=output, stderr=output)


def safe_run(*args, **kwargs):
    """Call run(), catch exceptions."""
    try:
        worker(*args, **kwargs)
    except Exception as e:
        print("error: %s run(*%r, **%r)" % (e, args, kwargs))


#split partitions
lib.log.info("Setting up EVM partitions")
lib.runSubprocess(cmd1, tmpdir, lib.log)
#subprocess.call(cmd1, cwd = tmpdir, stdout = FNULL, stderr = FNULL)
#check output
lib.checkinputs(os.path.join(tmpdir, 'partitions_list.out'))

#generate commands
lib.log.info("Generating EVM command list")
commands = os.path.join(tmpdir, 'commands.list')
with open(commands, 'w') as output:
    subprocess.call(cmd2, cwd=tmpdir, stdout=output, stderr=FNULL)

#count total lines
num_lines = sum(1 for line in open(commands))
#strange thing happens if you try to run with more cpus than commands
if num_lines < cpus:
    x = num_lines
Example #7
def runKallisto(input, fasta, readTuple, stranded, cpus, output):
    '''
    function takes GFF3 output from PASA compare, extracts transcripts, and then calculates TPM
    using Kallisto to identify the best scoring gene model for each locus; the left and right
    reads should be the adapter-cleaned, non-normalized Illumina reads
    '''
    lib.log.info(
        "Using Kallisto TPM data to determine which PASA gene models to select at each locus"
    )
    #convert GFF to transcripts
    folder = os.path.join(tmpdir, 'getBestModel')
    if not os.path.exists(folder):
        os.makedirs(
            folder
        )  # handle already existing folder okay? could also delete it
    PASAtranscripts = os.path.join(folder, 'transcripts.fa')
    cmd = [
        os.path.join(PASA, 'misc_utilities', 'gff3_file_to_proteins.pl'),
        input, fasta, 'cDNA'
    ]
    lib.log.info("Building Kallisto index")
    lib.runSubprocess2(cmd, '.', lib.log, PASAtranscripts)
    #generate kallisto index
    cmd = [
        'kallisto', 'index', '-i',
        os.path.join(folder, 'bestModel'), PASAtranscripts
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    #use kallisto to map reads to index
    #base command
    cmd = [
        'kallisto', 'quant', '-i',
        os.path.join(folder, 'bestModel'), '-o',
        os.path.join(folder, 'kallisto'), '--plaintext', '-t',
        str(cpus)
    ]
    #parse the strand information
    if stranded == 'RF':
        strandcmd = ['--rf-stranded']
    elif stranded == 'FR':
        strandcmd = ['--fr-stranded']
    else:
        strandcmd = []
    #adapt command for input, i.e. single or PE ends -> what do you do if you have both?
    if readTuple[2] and not readTuple[0] and not readTuple[1]:
        #single-end only; just use an estimated fragment length and SD for now, could make this an option
        cmd = cmd + ['--single', '-l', '200', '-s', '20', readTuple[2]]
    elif readTuple[0] and readTuple[1]:
        cmd = cmd + strandcmd + [readTuple[0], readTuple[1]]
    lib.log.info("Mapping reads using pseudoalignment in Kallisto")
    lib.runSubprocess(cmd, '.', lib.log)

    #modify kallisto output to map gene names to each mRNA ID so you know which locus they come from
    mRNADict = {}
    #since mRNA is unique, parse the transcript file which has mRNAID geneID in header
    with open(PASAtranscripts, 'rU') as transin:
        for line in transin:
            if line.startswith('>'):
                line = line.rstrip()
                line = line.replace('>', '')
                cols = line.split(' ')
                mRNAID = cols[0]
                geneID = cols[1]
                location = cols[-1]
                if not mRNAID in mRNADict:
                    mRNADict[mRNAID] = (geneID, location)

    #some PASA models can have incomplete CDS and are wrong, get list of incompletes to ignore list
    ignore = []
    with open(input, 'rU') as infile:
        for line in infile:
            if line.startswith('#PROT'):
                if line.endswith('\t\n'):
                    ID = line.split(' ')[1]
                    ignore.append(ID)
    if len(ignore) > 0:
        lib.log.debug("Ignoring %i incomplete PASA models: %s" %
                      (len(ignore), ','.join(ignore)))

    #now make new tsv file with #mRNAID geneID location TPM
    with open(output, 'w') as outfile:
        outfile.write("#mRNA-ID\tgene-ID\tLocation\tTPM\n")
        with open(os.path.join(folder, 'kallisto', 'abundance.tsv'),
                  'rU') as infile:
            for line in infile:
                if line.startswith('target_id'):  #skip the kallisto header row
                    continue
                line = line.rstrip()
                cols = line.split('\t')
                if cols[0] in ignore:
                    continue
                if cols[0] in mRNADict:
                    geneHit = mRNADict.get(cols[0])
                    geneID = geneHit[0]
                    location = geneHit[1]
                    outfile.write('%s\t%s\t%s\t%s\n' %
                                  (cols[0], geneID, location, cols[4]))
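
The readTuple convention used throughout these examples is (forward reads, reverse reads, single-end reads). A hypothetical paired-end call, reusing the trimmed reads from Example #2:

#hypothetical file names; None in position 2 means no single-end reads
runKallisto('pasa_compare.gff3', 'genome.fasta',
            ('trimmed_left.fastq.gz', 'trimmed_right.fastq.gz', None),
            'RF', 8, 'kallisto.tsv')
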
Example #8
def runPASAtrain(genome, transcripts, cleaned_transcripts, stranded, intronlen,
                 cpus, dbname, output):
    '''
    function will run PASA alignment assembly and then choose the best gene models for training
    '''
    if cpus > 2:
        pasa_cpus = cpus // 2  #integer division so PASA gets half the threads
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    if args.pasa_db == 'sqlite':
        pasaDBname_path = os.path.abspath(os.path.join(folder, pasaDBname))
    else:
        pasaDBname_path = pasaDBname
    with open(alignConfig, 'w') as config1:
        with open(
                os.path.join(PASA, 'pasa_conf',
                             'pasa.alignAssembly.Template.txt'),
                'rU') as template1:
            for line in template1:
                line = line.replace('<__DATABASE__>', pasaDBname_path)
                line = line.replace('<__MYSQLDB__>', pasaDBname_path)
                config1.write(line)
    if not os.path.isfile(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(cleaned_transcripts)))
        cmd = [
            LAUNCHPASA, '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-T', '-t',
            os.path.abspath(cleaned_transcripts), '-u',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER', '--ALT_SPLICE',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found: {:}'.format(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')))
    #generate TSV gene-transcripts
    Loci = []
    numTranscripts = 0
    with open(os.path.join(folder, 'pasa.gene2transcripts.tsv'),
              'w') as gene2transcripts:
        with open(
                os.path.join(folder,
                             pasaDBname + '.pasa_assemblies_described.txt'),
                'rU') as description:
            for line in description:
                if not line.startswith('#'):
                    cols = line.split('\t')
                    gene2transcripts.write('g_%s\t%s\n' % (cols[1], cols[2]))
                    numTranscripts += 1
                    if not cols[1] in Loci:
                        Loci.append(cols[1])
    lib.log.info("PASA assigned {:,} transcipts to {:,} loci (genes)".format(
        numTranscripts, len(Loci)))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    cmd = [
        os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
        '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
        '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3'
    ]
    lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
    lib.log.info(
        'PASA finished. PASAweb accessible via: localhost:port/cgi-bin/index.cgi?db=%s'
        % pasaDBname_path)
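
LAUNCHPASA is a module-level constant not defined in this excerpt. Example #14 below calls scripts/Launch_PASA_pipeline.pl directly, so LAUNCHPASA presumably points at the same launcher, which moved out of scripts/ in newer PASA releases. A sketch of the assumed setup:

import os

#assumed setup; PASAHOME here is a hypothetical environment variable
PASA = os.environ.get('PASAHOME', '/usr/local/opt/pasa')
LAUNCHPASA = os.path.join(PASA, 'Launch_PASA_pipeline.pl')
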
Example #9
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. The idea here is that the
    antisense transcripts, while potentially valid, aren't going to help update the gene models
    and could perhaps hurt the annotation effort.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    bamthreads = (args.cpus + 1) // 2  #use half the threads (rounded up) for bam compression
    aligner = choose_aligner()
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'hisat2-build', input,
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x',
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            if readTuple[2]:
                hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd = hisat2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(hisat2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'bowtie2':
        #using bowtie2
        bowtie2bam = os.path.join(tmpdir,
                                  'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'bowtie2-build', input,
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            if readTuple[2]:
                bowtie2cmd = bowtie2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd = bowtie2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(bowtie2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'rapmap':
        #using rapmap
        bowtie2bam = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'rapmap', 'quasiindex', '-t', input, '-i',
                os.path.join(tmpdir, 'rapmap_index')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i',
                os.path.join(tmpdir, 'rapmap_index'), '-1', readTuple[0], '-2',
                readTuple[1]
            ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(rapmapcmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess(cmd, '.', lib.log)

    #now run Trinity's examine_strand_specificity tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc', 'examine_strand_specificity.pl'),
        bowtie2bam,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    #parse output dat file and get list of transcripts to remove
    removeList = []
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'rU') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if args.stranded == 'RF':  #then we want to keep negative ratios in cols[4]
                if not cols[4].startswith('-'):
                    removeList.append(cols[0])
            elif args.stranded == 'FR':  #keep + values
                if cols[4].startswith('-'):
                    removeList.append(cols[0])

    #now parse the input fasta file removing records in list
    with open(output, 'w') as outfile:
        with open(input, 'rU') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if not record.id in removeList:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
Example #10
def runTrinityGG(genome, readTuple, output):
    '''
    function will run genome-guided Trinity. The first step is to run hisat2 to align reads
    to the genome, then pass that BAM file to Trinity to generate assemblies
    '''
    #build hisat2 index, using exons and splice sites
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    cmd = ['hisat2-build', genome, os.path.join(tmpdir, 'hisat2.genome')]
    lib.runSubprocess4(cmd, '.', lib.log)
    #align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    #use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
    bamthreads = (args.cpus + 1) // 2  #use half the threads (rounded up) for bam compression
    if args.stranded != 'no' and not readTuple[2]:
        hisat2cmd = [
            'hisat2', '-p',
            str(args.cpus), '--max-intronlen',
            str(args.max_intronlen), '--dta', '-x',
            os.path.join(tmpdir, 'hisat2.genome'), '--rna-strandness',
            args.stranded
        ]
    else:
        hisat2cmd = [
            'hisat2', '-p',
            str(args.cpus), '--max-intronlen',
            str(args.max_intronlen), '--dta', '-x',
            os.path.join(tmpdir, 'hisat2.genome')
        ]
    if readTuple[0] and readTuple[1]:
        hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd = hisat2cmd + ['-U', readTuple[2]]

    cmd = [
        os.path.join(parentdir, 'util', 'sam2bam.sh'), " ".join(hisat2cmd),
        str(bamthreads), hisat2bam
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    #now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    if args.stranded != 'no' and not readTuple[2]:
        cmd = [
            'Trinity', '--SS_lib_type', args.stranded,
            '--no_distributed_trinity_exec', '--genome_guided_bam', hisat2bam,
            '--genome_guided_max_intron',
            str(args.max_intronlen), '--CPU',
            str(args.cpus), '--max_memory', args.memory, '--output',
            os.path.join(tmpdir, 'trinity_gg')
        ]
    else:
        cmd = [
            'Trinity', '--no_distributed_trinity_exec', '--genome_guided_bam',
            hisat2bam, '--genome_guided_max_intron',
            str(args.max_intronlen), '--CPU',
            str(args.cpus), '--max_memory', args.memory, '--output',
            os.path.join(tmpdir, 'trinity_gg')
        ]
    cmd = cmd + jaccard_clip
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')

    #this will create all the Trinity commands, will now run these in parallel using multiprocessing in Python (seems to be much faster than Parafly on my system)
    file_list = []
    with open(commands, 'rU') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            line = line.replace(
                '--no_distributed_trinity_exec',
                '')  #don't think this should be appended to every command....
            line = line.replace('"', '')  #don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus - 1))
    lib.runMultiProgress(safe_run, file_list, args.cpus - 1)

    #collected output files and clean
    outputfiles = os.path.join(tmpdir, 'trinity_gg',
                               'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'),
                                   '*inity.fasta'):
            fileout.write('%s\n' % filename)
    #now grab them all using Trinity script
    cmd = [
        os.path.join(TRINITY, 'util', 'support_scripts',
                     'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG'
    ]
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
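
find_files, used above to collect the per-cluster *inity.fasta outputs, is another assumed helper. A minimal fnmatch-based sketch:

import fnmatch
import os

def find_files(directory, pattern):
    #recursively yield paths under directory whose basename matches pattern
    for root, dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                yield os.path.join(root, basename)
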
Example #11
        sys.exit(1)

#check EggNog database, download if necessary.
if not args.eggnog_db in lib.Nogs:
    lib.log.error("%s is not a valid EggNog group, options are:\n%s" %
                  (args.eggnog_db, ', '.join(lib.Nogs)))
    sys.exit(1)
if not os.path.isfile(
        os.path.join(parentdir, 'DB', args.eggnog_db + '_4.5.hmm')):
    lib.log.error("%s EggNog DB not found, trying to download and format..." %
                  args.eggnog_db)
    cmd = [
        os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db,
        os.path.join(parentdir, 'DB')
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    if not os.path.isfile(
            os.path.join(parentdir, 'DB', args.eggnog_db + '_4.5.hmm')):
        lib.log.error("Downloading failed, exiting")
        sys.exit(1)
    else:
        lib.log.error("%s downloaded and formatted, moving on." %
                      args.eggnog_db)

#check buscos, download if necessary
if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
    lib.download_buscos(args.busco_db)

#need to do some checks here of the input
genbank = ''
Scaffolds = ''
Example #12
scoCount = 0
if len(args.input) > 1:
    if not args.proteinortho:
        lib.log.info("Running orthologous clustering tool, ProteinOrtho5.  This may take awhile...")
        #setup protein ortho inputs, some are a bit strange in the sense that they use equals signs
    
        #generate list of files based on input order for consistency
        filelist = []
        for i in scinames:
            name = i+'.faa'
            filelist.append(name)
        #setup command
        cmd = ['proteinortho5.pl', '-project=funannotate', '-synteny', '-cpus='+str(args.cpus), '-singles', '-selfblast']
        cmd2 = cmd + filelist
        if not os.path.isfile(os.path.join(args.out, 'protortho', 'funannotate.poff')):
            lib.runSubprocess(cmd2, protortho, lib.log)
    else:
        shutil.copyfile(args.proteinortho, os.path.join(args.out, 'protortho', 'funannotate.poff'))

    #open poff in pandas to parse "easier" for stats, orthologs, etc
    df = pd.read_csv(os.path.join(args.out, 'protortho', 'funannotate.poff'), sep='\t', header=0)
    df.rename(columns=lambda x: x.replace('.faa', ''), inplace=True)
    #reorder table so it matches up with busco list of dicts
    newhead = [df.columns.values[0], df.columns.values[1], df.columns.values[2]]
    newhead += scinames
    try:
        df = df[newhead]
    except KeyError: #columns not found; likely need to drop the isolate name (hopefully that catches them all)
        newhead = [i.rsplit('_',1)[0] for i in newhead]
        for x in newhead:
            if not x in df.columns.values:
Example #13
def runSeqClean(input, folder):
    '''
    wrapper to run PASA seqclean on Trinity transcripts
    '''
    cmd = [os.path.join(PASA, 'bin', 'seqclean'), os.path.basename(input)]
    lib.runSubprocess(cmd, folder, lib.log)
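
Because only os.path.basename(input) is passed, seqclean resolves the file relative to folder, so the transcripts must be staged there first; seqclean then writes its outputs (the .clean and .cln files) next to the input. A hypothetical call:

import os
import shutil

#hypothetical paths; stage the transcripts in the working folder first
staged = os.path.join('seqclean_dir', 'trinity.fasta')
shutil.copyfile('trinity.fasta', staged)
runSeqClean(staged, 'seqclean_dir')
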
Example #14
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname,
                 output):
    '''
    function will run PASA alignment assembly and then choose the best gene models for training
    '''
    if cpus > 2:
        pasa_cpus = cpus // 2  #integer division so PASA gets half the threads
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    #create pasa and transdecoder logfiles
    pasa_log = os.path.join(folder, 'pasa.log')
    transdecoder_log = os.path.join(folder, 'transdecoder.log')

    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    with open(alignConfig, 'w') as config1:
        with open(
                os.path.join(PASA, 'pasa_conf',
                             'pasa.alignAssembly.Template.txt'),
                'rU') as template1:
            for line in template1:
                line = line.replace('<__MYSQLDB__>', pasaDBname)
                config1.write(line)
    if not os.path.isfile(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found: {:}'.format(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')))
    #generate TSV gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, pasaDBname + '.pasa_assemblies.gff3'),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(
        os.path.join(folder, pasaDBname + '.assemblies.fasta'))
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        cmd = [
            'TransDecoder.LongOrfs', '-t', pasaDBname + '.assemblies.fasta',
            '--gene_trans_map', 'pasa.gene2transcripts.tsv'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', pasaDBname + '.assemblies.fasta',
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            pasaDBname + '.assemblies.fasta.transdecoder.gff3',
            pasaDBname + '.pasa_assemblies.gff3',
            pasaDBname + '.assemblies.fasta'
        ]
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
            '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
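
This version of runPASAtrain branches on lib.which to detect a standalone TransDecoder install. A minimal sketch of such a PATH lookup:

import os

def which(program):
    #return the full path to program if it is executable on PATH, else None
    for path in os.environ.get('PATH', '').split(os.pathsep):
        exe = os.path.join(path, program)
        if os.path.isfile(exe) and os.access(exe, os.X_OK):
            return exe
    return None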