Hits = parseDiamond(BlastResult)
lib.log.info('Found {0:,}'.format(len(Hits)) + ' preliminary alignments')

# index the genome and proteins
protein_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta')  # do index here in case memory problems?

# split genome fasta into individual scaffolds
with open(os.path.abspath(args.genome), 'r') as input:
    for record in SeqIO.parse(input, "fasta"):
        SeqIO.write(record, os.path.join(tmpdir, 'scaffolds', record.id + ".fa"), "fasta")

# run multiprocessing exonerate
lib.runMultiProgress(runExonerate, Hits, args.cpus)

# now need to loop through and offset exonerate predictions back to whole scaffolds
exonerate_raw = os.path.join(tmpdir, 'exonerate.out.combined')
with open(exonerate_raw, 'w') as output:
    for file in os.listdir(tmpdir):
        if file.endswith('.out'):
            with open(os.path.join(tmpdir, file), 'r') as exoresult:
                offset = int(file.split('__')[1])
                for line in itertools.islice(exoresult, 3, None):
                    if line.startswith('#') or line.startswith('Average') or line.startswith('-- completed'):
                        output.write(line)
                    else:
                        cols = line.split('\t')
                        cols[3] = str(int(cols[3]) + offset)
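# A hedged illustration, not part of the original script: the block above assumes each
# exonerate result file has the chunk's scaffold offset embedded in its name, so that
# int(file.split('__')[1]) recovers it (e.g. something like "scaffold_1__25000__exonerate.out").
# The helpers below are hypothetical and only spell out that convention and the coordinate
# shift; the real runExonerate() and its naming scheme are defined elsewhere in the codebase.
def chunk_result_name(scaffold_id, chunk_start):
    # the offset sits between double-underscore separators so split('__')[1] is purely numeric
    return '{}__{}__exonerate.out'.format(scaffold_id, chunk_start)

def shift_to_scaffold_coords(cols, offset):
    # exonerate GFF columns 4/5 (list indexes 3/4) are start/end relative to the chunk;
    # adding the chunk offset converts them back to whole-scaffold coordinates
    cols = list(cols)
    cols[3] = str(int(cols[3]) + offset)
    cols[4] = str(int(cols[4]) + offset)
    return cols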
def runTrinityGG(genome, readTuple, output):
    '''
    function will run genome-guided Trinity. First step is to run hisat2 to align reads
    to the genome, then pass that BAM file to Trinity to generate assemblies
    '''
    # build hisat2 index
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    cmd = ['hisat2-build', genome, os.path.join(tmpdir, 'hisat2.genome')]
    lib.runSubprocess4(cmd, '.', lib.log)

    # align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
    bamthreads = (args.cpus + 1) // 2  # use half of the threads for BAM compression
    if args.stranded != 'no' and not readTuple[2]:
        hisat2cmd = ['hisat2', '-p', str(args.cpus),
                     '--max-intronlen', str(args.max_intronlen), '--dta',
                     '-x', os.path.join(tmpdir, 'hisat2.genome'),
                     '--rna-strandness', args.stranded]
    else:
        hisat2cmd = ['hisat2', '-p', str(args.cpus),
                     '--max-intronlen', str(args.max_intronlen), '--dta',
                     '-x', os.path.join(tmpdir, 'hisat2.genome')]
    if readTuple[0] and readTuple[1]:
        hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
    cmd = [os.path.join(parentdir, 'util', 'sam2bam.sh'),
           " ".join(hisat2cmd), str(bamthreads), hisat2bam]
    lib.runSubprocess(cmd, '.', lib.log)

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info("Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    if args.stranded != 'no' and not readTuple[2]:
        cmd = ['Trinity', '--SS_lib_type', args.stranded,
               '--no_distributed_trinity_exec',
               '--genome_guided_bam', hisat2bam,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    else:
        cmd = ['Trinity', '--no_distributed_trinity_exec',
               '--genome_guided_bam', hisat2bam,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    cmd = cmd + jaccard_clip
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')
    # the Trinity run above has now written all the per-cluster commands; run these in parallel
    # using multiprocessing in Python (seems to be much faster than ParaFly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            line = line.replace('--no_distributed_trinity_exec', '')  # don't think this should be appended to every command...
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus - 1))
    lib.runMultiProgress(safe_run, file_list, args.cpus - 1)

    # collect output files and clean up
    outputfiles = os.path.join(tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'), '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    cmd = [os.path.join(TRINITY, 'util', 'support_scripts',
                        'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
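# Hedged sketch, assumed behavior rather than the project's actual definitions: safe_run()
# and find_files() are called above but defined elsewhere. Given how they are used, minimal
# stand-ins could look like the following -- safe_run takes one per-cluster Trinity command
# line (a string) and runs it without letting a single failed cluster kill the worker pool,
# and find_files yields paths under a directory matching a glob such as '*inity.fasta'.
import fnmatch
import os
import subprocess

def safe_run(cmd_string):
    # run one per-cluster Trinity command; swallow errors so the pool keeps going
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.call(cmd_string, shell=True, stdout=devnull, stderr=devnull)
    except Exception:
        pass

def find_files(directory, pattern):
    # walk a directory tree and yield files whose name matches the glob pattern
    for root, dirs, files in os.walk(directory):
        for name in files:
            if fnmatch.fnmatch(name, pattern):
                yield os.path.join(root, name)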
             '{0:,}'.format(num_prots) + ' proteins')
# build in a check before running (in case script gets stopped and needs to restart)
finished = []
for file in os.listdir(IPROUT):
    if file.endswith('.xml'):
        base = file.split('.xml')[0]
        fasta_file = os.path.join(PROTS, base + '.fa')
        finished.append(fasta_file)
finished = set(finished)  # make sure no duplicates
runlist = [x for x in proteins if x not in finished]
if len(runlist) < num_prots:
    lib.log.info("Results found, querying remaining %i proteins" % len(runlist))

# start up the list, max 25 at a time
lib.runMultiProgress(runIPRpython, runlist, 25)

# clean up protein fasta files
shutil.rmtree(PROTS)

# now convert to single file and then clean up
with open(IPRCombined, 'w') as output:
    subprocess.call([sys.executable, XMLCombine, IPROUT], stdout=output)
if lib.checkannotations(IPRCombined):
    shutil.rmtree(IPROUT)

if 'antismash' in args.methods or 'all' in args.methods:
    if args.antismash == 'fungi':
        base_address = "https://fungismash.secondarymetabolites.org"
        job_parameters = {
            'email': args.email,
            'smcogs': 'on',
            'knownclusterblast': 'on',
        with open(os.path.join(tmpdir, i+'.hints.gff'), 'w') as output:
            with open(args.hints, 'rU') as hintsfile:
                for line in hintsfile:
                    cols = line.split('\t')
                    if cols[0] == i:
                        output.write(line)
'''
# now loop through each scaffold running augustus
if args.cpus > len(scaffolds):
    num = len(scaffolds)
else:
    num = args.cpus
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" % (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)

lib.log.debug("Augustus prediction is finished, now concatenating results")
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())

join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
with open(args.out, 'w') as finalout:
    with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as input:
        subprocess.call([join_script], stdin=input, stdout=finalout)

if not args.debug:
    shutil.rmtree(tmpdir)
lib.log.info("Found %i gene models" % countGFFgenes(args.out))
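# Hedged sketch, an assumption rather than the project's actual helper: countGFFgenes()
# is used in the log message above but defined elsewhere. A minimal version consistent
# with that message would simply count GFF3 records whose feature column is "gene".
def countGFFgenes(gff_file):
    count = 0
    with open(gff_file) as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if len(cols) > 2 and cols[2] == 'gene':
                count += 1
    return count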
            else:
                eggs = 'None'
            if len(buscos) > 0:
                buscos = set(buscos)
                buscos = ', '.join(str(v) for v in buscos)
            else:
                buscos = 'None'
            # write now to file
            output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(proteins)))

if args.run_dnds:
    # multiprocessing dN/dS on list of folders
    dNdSList = lib.get_subdirs(ortho_folder)
    if args.run_dnds == 'estimate':
        lib.log.debug("Running simple dN/dS estimate")
        lib.runMultiProgress(lib.rundNdSestimate, dNdSList, args.cpus)
    else:
        lib.log.debug("Running exhaustive dN/dS ratio with Likelihood Ratio Tests")
        lib.runMultiProgress(lib.rundNdSexhaustive, dNdSList, args.cpus)
    # after all data is run, parse the result log files and return a dictionary
    dNdSresults = lib.parsedNdS(ortho_folder)

if len(args.input) > 1:
    orthologs = os.path.join(args.out, 'orthology', 'orthology_groups.txt')
    with open(orthologs, 'w') as output:
        with open(orthologstmp, 'r') as input:
            for line in input:
                line = line.replace('\n', '')
                cols = line.split('\t')
                if args.run_dnds:
                    if cols[0] in dNdSresults:
# create tmpdir to store fasta files and output files
TMPDIR = 'phobius_' + str(os.getpid())

# split fasta
lib.splitFASTA(args.input, TMPDIR)

# now get list of files in tmpdir
proteins = []
for file in os.listdir(TMPDIR):
    if file.endswith('.fa'):
        proteins.append(file)

# now run the script
if lib.which('phobius.pl'):
    lib.runMultiProgress(runPhobiusLocal, proteins, multiprocessing.cpu_count())
else:
    lib.runMultiProgress(runPhobiusRemote, proteins, 29)  # max is 30 jobs at a time

# collect all results
phobius = []
for file in os.listdir(TMPDIR):
    if file.endswith('.phobius'):
        phobius.append(os.path.join(TMPDIR, file))

# write output
TMdomain = 0
SigPep = 0
with open(args.out, 'w') as output:
    output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction'))
    for x in phobius: