def runExonerate(input):
    """Run exonerate (protein2genome) for one protein against one hit region.

    ``input`` is a ``:::``-delimited string whose first four fields are
    ``ProtID``, ``ScaffID``, ``ScaffStart`` and ``ScaffEnd``.

    The protein record is written to a temporary FASTA file, the hit region
    plus a 3 kb cushion on either side is cut out of
    ``<tmpdir>/scaffolds/<ScaffID>.fa``, and exonerate is run on the pair.
    Results are written to
    ``<tmpdir>/exonerate.<ProtID>.<ScaffID>__<start>__.out`` where ``start``
    is the slice offset of the extracted region.  Outputs smaller than 500
    bytes are deleted.  If exonerate prints a WARNING on stderr the two input
    files are moved to ``<tmpdir>/failed``; otherwise they are removed.

    Uses the module-level names ``tmpdir``, ``protein_dict``, ``args`` and
    ``lib``.  Returns None.
    """
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])

    #get the protein model; the pid keeps file names unique per worker process
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')

    #now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir,
        ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' + str(ScaffEnd) + '.fa')
    #pre-bind so an empty scaffold file is detected instead of raising NameError
    header, Sequence, start, end = None, '', None, None
    with open(scaffold, 'w') as output2:
        #plain 'r': the 'U' flag was removed in Python 3.11 and text mode
        #already translates newlines on Python 3
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'), 'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                #grab a 3 kb cushion on either side of hit region, careful of scaffold ends
                start = max(1, ScaffStart - 3000)
                end = min(len(Sequence), ScaffEnd + 3000)
                #NOTE: start is used as a 0-based slice index and is also the
                #offset embedded in the output file name below; the two must
                #stay in sync, so the slice is intentionally left as-is
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))

    if start is None:
        #scaffold FASTA contained no records, nothing to align against
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
        return

    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no',
        '--showalignment', 'no', '--showquerygff', 'no',
        '--showtargetgff', 'yes', '--maxintron', str(args.maxintron),
        '--percent', '80', '--ryo', ryo, query, scaffold
    ]

    #check that input files are created and valid
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        #run exonerate, capture errors
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd, stdout=output3, stderr=subprocess.PIPE)
            errors = proc.communicate()[1]
            #communicate() returns bytes on Python 3; decode before the
            #substring test so it does not raise TypeError
            if isinstance(errors, bytes):
                errors = errors.decode('utf-8', 'replace')
            if 'WARNING' in errors:
                lib.log.debug('Error in input:{:}'.format(input))
                lib.log.debug(
                    '%s, Len=%i, %i-%i; %i-%i' %
                    (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
                #keep the offending inputs around for inspection
                os.rename(query,
                          os.path.join(tmpdir, 'failed', os.path.basename(query)))
                os.rename(scaffold,
                          os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
            else:
                for y in [query, scaffold]:
                    try:
                        lib.SafeRemove(y)
                    except OSError:
                        lib.log.debug("Error removing %s" % (y))
        #check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
output.write('\t'.join(cols)) #convert to GFF3 using ExoConverter from EVM with open(args.out, 'w') as output: subprocess.call([ExoConverter, exonerate_raw], stdout=output, stderr=FNULL) #output some quick summary of exonerate alignments that you found Found = lib.countGFFgenes(exonerate_raw) lib.log.info('Exonerate finished: found {:,} alignments'.format(Found)) #check for saving output of tblastn if args.tblastn_out: shutil.copyfile(BlastResult, args.tblastn_out) #finally clean-up your mess if failed is empty if args.debug: try: os.rmdir(os.path.join(tmpdir, 'failed')) empty = True except OSError: empty = False if empty: lib.SafeRemove(tmpdir) else: lib.log.error("Failed exonerate alignments found, see files in %s" % os.path.join(tmpdir, 'failed')) else: if os.path.isdir(tmpdir): lib.SafeRemove(tmpdir) sys.exit(1)
#check the inputs first, so a bad path does not leave a stray tmp folder behind
if not lib.checkannotations(args.fasta):
    print('FASTA genome file not found: {:}'.format(args.fasta))
    sys.exit(1)
if not lib.checkannotations(args.tbl):
    print('TBL annotations file not found: {:}'.format(args.tbl))
    sys.exit(1)
#create tmp folder to run tbl2asn from
tmp = outputname + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)
#now move files into proper location
shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa'))
shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl'))
#now we can run tbl2asn, falling back to the bundled template when no SBT is given
if args.sbt:
    SBT = args.sbt
else:
    SBT = os.path.join(parentdir, 'lib', 'test.sbt')
discrep = outputname + '.discrepency.txt'
version = 1
tbl2asn_cmd = runtbl2asn(tmp, SBT, discrep, organism, args.isolate, args.strain, args.tbl2asn, version)
#get output files
gbkout = outputname + '.gbk'
gbf = os.path.join(tmp, 'genome.gbf')
if not os.path.isfile(gbf):
    #report a missing result explicitly instead of a bare copyfile traceback;
    #the tmp folder is left in place so the run can be inspected
    print('tbl2asn did not produce a GenBank file: {:}'.format(gbf))
    sys.exit(1)
shutil.copyfile(gbf, gbkout)
lib.SafeRemove(tmp)