def concatenateReads(input, output):
    '''
    Since I can't seem to get comma-separated lists to work with the subprocess
    module, just concatenate the FASTQ files in order and use a single file.
    input should be a list of FASTQ files. Using system cat here so that
    gzipped files are concatenated correctly.
    '''
    cmd = ['cat']
    cmd = cmd + input
    lib.runSubprocess2(cmd, '.', lib.log, output)
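# A minimal usage sketch (hypothetical file names): concatenating gzip files
# with cat yields a valid multi-member gzip stream that downstream tools read
# as a single file, so the same call works for .fq and .fq.gz inputs:
#   concatenateReads(['lib1_R1.fq.gz', 'lib2_R1.fq.gz'], 'all_R1.fq.gz')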
def runAugustus(Input):
    #chunk names look like 'scaffold_1_part2'; recover the parent contig name
    if '_part' in Input:
        contig = Input.split('_part')[0]
    else:
        contig = Input
    species = '--species=' + args.species
    hints_input = '--hintsfile=' + args.hints
    aug_out = os.path.join(tmpdir, Input+'.augustus.gff3')
    core_cmd = ['augustus', species, '--softmasking=1', '--gff3=on',
                '--UTR=off', '--stopCodonExcludedFromCDS=False',
                os.path.join(tmpdir, contig+'.fa')]
    if args.hints:
        core_cmd.insert(2, extrinsic)
        core_cmd.insert(3, hints_input)
    if Input in ranges:
        start = ranges.get(Input)[0]
        end = ranges.get(Input)[1]
        core_cmd.insert(2, '--predictionStart='+str(start))
        core_cmd.insert(3, '--predictionEnd='+str(end))
    #run via the library module so stdout is captured to aug_out
    lib.runSubprocess2(core_cmd, '.', lib.log, aug_out)
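# Illustration of the conventions assumed above: a chunk 'scaffold_1_part2'
# with ranges['scaffold_1_part2'] = (500000, 1000000) produces roughly
#   augustus --species=<sp> --predictionStart=500000 --predictionEnd=1000000 \
#       --softmasking=1 --gff3=on ... tmpdir/scaffold_1.fa
# i.e. Augustus always reads the whole parent contig FASTA but only predicts
# within the chunk's coordinate window.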
def runNormalization(readTuple, memory):
    '''
    Wrapper for Trinity in silico read normalization; normalization has to be
    run separately for PE versus single-end reads.
    '''
    left_norm, right_norm, single_norm = (None,)*3
    SENormalLog = os.path.join(tmpdir, 'trinity_normalization.SE.log')
    PENormalLog = os.path.join(tmpdir, 'trinity_normalization.PE.log')
    cmd = [os.path.join(TRINITY, 'util', 'insilico_read_normalization.pl'),
           '--PARALLEL_STATS', '--JM', memory,
           '--max_cov', str(args.coverage), '--seqType', 'fq',
           '--output', os.path.join(tmpdir, 'normalize'),
           '--CPU', str(args.cpus)]
    if args.stranded != 'no':
        cmd = cmd + ['--SS_lib_type', args.stranded]
    if readTuple[2]:  #single reads present, so run normalization just on those reads
        se_cmd = cmd + ['--single', readTuple[2]]
        lib.runSubprocess2(se_cmd, '.', lib.log, SENormalLog)
        single_norm = os.path.join(tmpdir, 'normalize', 'single.norm.fq')
    if readTuple[0] and readTuple[1]:
        #build the PE command from the shared base so it doesn't inherit the --single args
        pe_cmd = cmd + ['--pairs_together', '--left', readTuple[0],
                        '--right', readTuple[1]]
        left_norm = os.path.join(tmpdir, 'normalize', 'left.norm.fq')
        right_norm = os.path.join(tmpdir, 'normalize', 'right.norm.fq')
        lib.runSubprocess2(pe_cmd, '.', lib.log, PENormalLog)
    return left_norm, right_norm, single_norm
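# The readTuple convention used throughout this script is (left, right, single);
# e.g. for a mixed PE + SE run (hypothetical file names):
#   left, right, single = runNormalization(('R1.fq', 'R2.fq', 'SE.fq'), '50G')
# returns the normalized FASTQ paths, with None for any read type not supplied.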
def runKallisto(input, fasta, readTuple, stranded, cpus, output):
    '''
    Function takes GFF3 output from PASA compare, extracts transcripts, and
    then calculates TPM using Kallisto to identify the best scoring gene model
    for each locus. The left/right reads should be the adapter-cleaned,
    non-normalized Illumina reads.
    '''
    lib.log.info("Using Kallisto TPM data to determine which PASA gene models to select at each locus")
    #convert GFF to transcripts
    folder = os.path.join(tmpdir, 'getBestModel')
    if not os.path.exists(folder):
        os.makedirs(folder)  #handle already existing folder okay? could also delete it
    PASAtranscripts = os.path.join(folder, 'transcripts.fa')
    cmd = [os.path.join(PASA, 'misc_utilities', 'gff3_file_to_proteins.pl'),
           input, fasta, 'cDNA']
    lib.log.info("Building Kallisto index")
    lib.runSubprocess2(cmd, '.', lib.log, PASAtranscripts)
    #generate kallisto index
    cmd = ['kallisto', 'index', '-i', os.path.join(folder, 'bestModel'), PASAtranscripts]
    lib.runSubprocess(cmd, '.', lib.log)
    #use kallisto to map reads to index; base command
    cmd = ['kallisto', 'quant', '-i', os.path.join(folder, 'bestModel'),
           '-o', os.path.join(folder, 'kallisto'), '--plaintext', '-t', str(cpus)]
    #parse the strand information
    if stranded == 'RF':
        strandcmd = ['--rf-stranded']
    elif stranded == 'FR':
        strandcmd = ['--fr-stranded']
    else:
        strandcmd = []
    #adapt command for input, i.e. single or PE reads -> what do you do if you have both?
    if readTuple[2] and not readTuple[0] and not readTuple[1]:
        #single-end only; kallisto requires an estimated fragment length and SD,
        #hard-coded here, could be exposed as options
        cmd = cmd + ['--single', '-l', '200', '-s', '20', readTuple[2]]
    elif readTuple[0] and readTuple[1]:
        cmd = cmd + strandcmd + [readTuple[0], readTuple[1]]
    lib.log.info("Mapping reads using pseudoalignment in Kallisto")
    lib.runSubprocess(cmd, '.', lib.log)
    #modify kallisto output to map gene names to each mRNA ID so you know which locus they came from
    mRNADict = {}
    #since mRNA is unique, parse the transcript file which has mRNAID geneID in header
    with open(PASAtranscripts, 'rU') as transin:
        for line in transin:
            if line.startswith('>'):
                line = line.rstrip()
                line = line.replace('>', '')
                cols = line.split(' ')
                mRNAID = cols[0]
                geneID = cols[1]
                location = cols[-1]
                if not mRNAID in mRNADict:
                    mRNADict[mRNAID] = (geneID, location)
    #some PASA models can have incomplete CDS and are wrong, get list of incompletes to ignore
    ignore = []
    with open(input, 'rU') as infile:
        for line in infile:
            if line.startswith('#PROT'):
                if line.endswith('\t\n'):
                    ID = line.split(' ')[1]
                    ignore.append(ID)
    if len(ignore) > 0:
        lib.log.debug("Ignoring %i incomplete PASA models: %s" % (len(ignore), ','.join(ignore)))
    #now make new tsv file with #mRNAID geneID location TPM
    with open(output, 'w') as outfile:
        outfile.write("#mRNA-ID\tgene-ID\tLocation\tTPM\n")
        with open(os.path.join(folder, 'kallisto', 'abundance.tsv'), 'rU') as infile:
            for line in infile:
                if line.startswith('target_id'):  #skip the kallisto header line
                    continue
                line = line.rstrip()
                cols = line.split('\t')
                if cols[0] in ignore:
                    continue
                if cols[0] in mRNADict:
                    geneHit = mRNADict.get(cols[0])
                    geneID = geneHit[0]
                    location = geneHit[1]
                    outfile.write('%s\t%s\t%s\t%s\n' % (cols[0], geneID, location, cols[4]))
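# For reference, kallisto's plaintext abundance.tsv columns are
# target_id, length, eff_length, est_counts, tpm; so cols[0] above is the
# mRNA ID and cols[4] is the TPM value written out, yielding rows like:
#   mRNA-1   gene-1   scaffold_1:1000-2000   12.37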
def runTrinityGG(genome, readTuple, output):
    '''
    Function will run genome-guided Trinity. The first step is to run hisat2 to
    align reads to the genome, then pass that BAM file to Trinity to generate
    assemblies.
    '''
    #build hisat2 index
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    cmd = ['hisat2-build', genome, os.path.join(tmpdir, 'hisat2.genome')]
    lib.runSubprocess4(cmd, '.', lib.log)
    #align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    #use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
    bamthreads = (args.cpus + 1) // 2  #use half the threads for BAM compression
    hisat2cmd = ['hisat2', '-p', str(args.cpus),
                 '--max-intronlen', str(args.max_intronlen), '--dta',
                 '-x', os.path.join(tmpdir, 'hisat2.genome')]
    if args.stranded != 'no' and not readTuple[2]:
        hisat2cmd = hisat2cmd + ['--rna-strandness', args.stranded]
    if readTuple[0] and readTuple[1]:
        hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
    cmd = [os.path.join(parentdir, 'util', 'sam2bam.sh'),
           " ".join(hisat2cmd), str(bamthreads), hisat2bam]
    lib.runSubprocess(cmd, '.', lib.log)
    #now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info("Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    cmd = ['Trinity', '--no_distributed_trinity_exec',
           '--genome_guided_bam', hisat2bam,
           '--genome_guided_max_intron', str(args.max_intronlen),
           '--CPU', str(args.cpus), '--max_memory', args.memory,
           '--output', os.path.join(tmpdir, 'trinity_gg')]
    if args.stranded != 'no' and not readTuple[2]:
        cmd = cmd + ['--SS_lib_type', args.stranded]
    cmd = cmd + jaccard_clip
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')
    #this creates all the Trinity commands; run them in parallel using
    #multiprocessing in Python (seems to be much faster than ParaFly on my system)
    file_list = []
    with open(commands, 'rU') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            #--no_distributed_trinity_exec should not be appended to every command
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  #don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling {0:,} Trinity clusters using {1:} CPUs".format(
        len(file_list), args.cpus - 1))
    lib.runMultiProgress(safe_run, file_list, args.cpus - 1)
    #collect output files and clean
    outputfiles = os.path.join(tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'), '*inity.fasta'):
            fileout.write('%s\n' % filename)
    #now grab them all using Trinity script
    cmd = [os.path.join(TRINITY, 'util', 'support_scripts',
                        'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
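# safe_run (passed to lib.runMultiProgress above) is defined elsewhere in this
# script; a minimal sketch of what such a worker would look like, assuming it
# just shells out one Trinity cluster command and logs failures instead of
# raising, so a single bad cluster does not kill the multiprocessing pool:
#   def safe_run(cmd_string):
#       try:
#           subprocess.call(cmd_string, shell=True)
#       except Exception as e:
#           lib.log.debug('Trinity cluster failed: %s (%s)' % (cmd_string, e))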
                      'iprscan' + str(os.getpid()))
os.makedirs(IPROUT)
#now split XML file
splitter = os.path.join(parentdir, 'util', 'prepare_ind_xml.pl')
cmd = [splitter, args.iprscan, IPROUT]
lib.runSubprocess(cmd, '.', lib.log)
#now collect the results from InterProScan, then start to reformat results
lib.log.info("InterProScan has finished, now pulling out annotations from results")
IPR_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.iprscan.txt')
if not os.path.isfile(IPR_terms):
    IPR2TSV = os.path.join(parentdir, 'util', 'ipr2tsv.py')
    cmd = [sys.executable, IPR2TSV, IPROUT]
    lib.runSubprocess2(cmd, '.', lib.log, IPR_terms)
GO_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.GO.txt')
if not os.path.isfile(GO_terms):
    IPR2GO = os.path.join(parentdir, 'util', 'ipr2go.py')
    OBO = os.path.join(parentdir, 'DB', 'go.obo')
    cmd = [sys.executable, IPR2GO, OBO, IPROUT]
    lib.runSubprocess2(cmd, '.', lib.log, GO_terms)
#check if antiSMASH data is given; if so, parse and reformat for annotations and cluster textual output
if args.antismash:
    AntiSmashFolder = os.path.join(outputdir, 'annotate_misc', 'antismash')
    AntiSmashBed = os.path.join(AntiSmashFolder, 'clusters.bed')
    GFF2clusters = os.path.join(AntiSmashFolder, 'secmet.clusters.txt')
    AntiSmash_annotations = os.path.join(outputdir, 'annotate_misc', 'annotations.antismash.txt')
    Cluster_annotations = os.path.join(outputdir, 'annotate_misc',
        file = os.path.join(go_folder, file)
        with open(file) as input:
            pop.write(input.read())
#now loop through each genome comparing to population
for f in os.listdir(go_folder):
    if f.startswith('associations'):
        continue
    if f.startswith('population'):
        continue
    file = os.path.join(go_folder, f)
    base = f.replace('.txt', '')
    goa_out = os.path.join(args.out, 'go_enrichment', base+'.go.enrichment.txt')
    if not lib.checkannotations(goa_out):
        cmd = ['find_enrichment.py',
               '--obo', os.path.join(parentdir, 'DB', 'go.obo'),
               '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr',
               file,
               os.path.join(go_folder, 'population.txt'),
               os.path.join(go_folder, 'associations.txt')]
        lib.runSubprocess2(cmd, '.', lib.log, goa_out)
#load into pandas and write to html
with open(os.path.join(args.out, 'go.html'), 'w') as output:
    pd.set_option('display.max_colwidth', -1)
    pd.options.mode.chained_assignment = None  #turn off warning
    output.write(lib.HEADER)
    output.write(lib.GO)
    for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
        if f.endswith('go.enrichment.txt'):
            file = os.path.join(args.out, 'go_enrichment', f)
            base = os.path.basename(file)
            name = base.split('.go.enrichment.txt')[0]  #match the suffix used for goa_out above
            #check goatools output, return is a tuple with True/False and header line
            # goresult = lib.checkgoatools(file)
            output.write('<h4 class="sub-header" align="left">GO Enrichment: '+name+'</h4>')
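# find_enrichment.py (from goatools) is invoked above with the three positional
# arguments study, population, association: the per-genome gene list, the
# pooled population.txt written earlier, and the gene->GO associations.txt;
# enriched terms passing the FDR cutoff end up in <base>.go.enrichment.txt.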
final_proteins = os.path.join(ResultsFolder, baseOUTPUT+'.proteins.fa')
final_transcripts = os.path.join(ResultsFolder, baseOUTPUT+'.transcripts.fa')
final_fasta = os.path.join(ResultsFolder, baseOUTPUT+'.scaffolds.fa')
final_annotation = os.path.join(ResultsFolder, baseOUTPUT+'.annotations.txt')
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.gbf'), final_gbk)
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.gff'),
          os.path.join(ResultsFolder, baseOUTPUT+'.gff3'))
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.tbl'),
          os.path.join(ResultsFolder, baseOUTPUT+'.tbl'))
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.sqn'),
          os.path.join(ResultsFolder, baseOUTPUT+'.sqn'))
lib.gb2output(final_gbk, final_proteins, final_transcripts, final_fasta)
#write AGP output so all files are in the correct directory
lib.log.info("Creating AGP file and corresponding contigs file")
fasta2agp = os.path.join(parentdir, 'util', 'fasta2agp.pl')
AGP = os.path.join(ResultsFolder, baseOUTPUT+'.agp')
cmd = ['perl', fasta2agp, baseOUTPUT+'.scaffolds.fa']
lib.runSubprocess2(cmd, ResultsFolder, lib.log, AGP)
#write secondary metabolite clusters output using the final genome in gbk format
if lib.checkannotations(antismash_input):
    lib.log.info("Cross referencing SM cluster hits with MIBiG database")
    #do a blast best-hit search against the MIBiG database for cluster annotation,
    #looping through the gene cluster hits
    AllProts = []
    for k, v in lib.dictClusters.items():
        for i in v:
            if not i in AllProts:
                AllProts.append(i)
    AllProts = set(AllProts)
    mibig_fasta = os.path.join(AntiSmashFolder, 'smcluster.proteins.fasta')
    mibig_blast = os.path.join(AntiSmashFolder, 'smcluster.MIBiG.blast.txt')
    mibig_db = os.path.join(parentdir, 'DB', 'MIBiG')
    with open(mibig_fasta, 'w') as output:
def runPhobiusLocal(Input):
    base = os.path.basename(Input).split('.fa')[0]
    OUTPATH = os.path.join(TMPDIR, base+'.phobius')
    cmd = ['phobius.pl', '-short', Input]
    lib.runSubprocess2(cmd, TMPDIR, lib.log, OUTPATH)
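# This worker is shaped to be mapped over a directory of split protein FASTA
# chunks, e.g. (hypothetical chunk names):
#   lib.runMultiProgress(runPhobiusLocal, fasta_chunks, args.cpus)
# where each chunk 'proteins_0001.fa' yields TMPDIR/proteins_0001.phobius.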
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname, output):
    '''
    Function will run PASA alignment assembly and then choose the best gene
    models for training.
    '''
    if cpus > 2:
        pasa_cpus = cpus // 2  #integer division; PASA --CPU expects an int
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    #create pasa and transdecoder logfiles
    pasa_log = os.path.join(folder, 'pasa.log')
    transdecoder_log = os.path.join(folder, 'transdecoder.log')
    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    with open(alignConfig, 'w') as config1:
        with open(os.path.join(PASA, 'pasa_conf', 'pasa.alignAssembly.Template.txt'), 'rU') as template1:
            for line in template1:
                line = line.replace('<__MYSQLDB__>', pasaDBname)
                config1.write(line)
    if not os.path.isfile(os.path.join(folder, pasaDBname + '.assemblies.fasta')):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info("Running PASA alignment step using {:,} transcripts".format(
            lib.countfasta(transcripts)))
        cmd = [os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'),
               '-c', os.path.abspath(alignConfig), '-r', '-C', '-R',
               '-g', os.path.abspath(genome),
               '--ALIGNERS', 'blat,gmap', '-t', os.path.abspath(transcripts),
               '--stringent_alignment_overlap', args.pasa_alignment_overlap,
               '--TRANSDECODER', '--MAX_INTRON_LENGTH', str(intronlen),
               '--CPU', str(pasa_cpus)]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(
            os.path.join(folder, pasaDBname + '.assemblies.fasta')))
    #generate TSV of gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, pasaDBname + '.pasa_assemblies.gff3'),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(os.path.join(folder, pasaDBname + '.assemblies.fasta'))
    lib.log.info("Assigned {:,} transcripts to {:,} loci using {:}% overlap threshold".format(
        numTranscripts, numLoci, args.pasa_alignment_overlap))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which('TransDecoder.Predict'):
        cmd = ['TransDecoder.LongOrfs', '-t', pasaDBname + '.assemblies.fasta',
               '--gene_trans_map', 'pasa.gene2transcripts.tsv']
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = ['TransDecoder.Predict', '-t', pasaDBname + '.assemblies.fasta',
               '--single_best_only']
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                            'cdna_alignment_orf_to_genome_orf.pl'),
               pasaDBname + '.assemblies.fasta.transdecoder.gff3',
               pasaDBname + '.pasa_assemblies.gff3',
               pasaDBname + '.assemblies.fasta']
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        cmd = [os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
               '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
               '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3']
        lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
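# A hypothetical invocation of the training wrapper above; dbname becomes the
# PASA database name (dashes replaced with underscores) and output receives the
# TransDecoder genome GFF3 used downstream for ab initio training:
#   runPASAtrain('genome.fa', 'trinity.fasta', 'RF', 3000, 8,
#                'awesome-db', 'pasa_training.gff3')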