alignCutOff = 0.45 print "alignCutOff wasn't given, assuming 0.5" try: coveCutOff = int(sys.argv[6]) print('coveCutOff: %s' % coveCutOff) except: coveCutOff = 7 print "coveCutOff wasn't given, assuming 7" x = geneCheck(genBankReference, resultFile, 0.50, True, blastFolder, organismType, alignCutOff) print 'Features found: %s' % len(x[0]) print 'Total features: %s' % len(x[1]) print '' print('Running tRNAscan-SE...') presentFeatures = x[0] assemblyCheck = tRNAscanChecker.tRNAscanCheck(resultFile, True, False, organismType, coveCutOff) #returns a Assembly object with statistics and alignment info tRNAs = assemblyCheck.tRNAs listOfFeaturesToOutput = [] listOfFoundTRNAs = [] for foundFeature in presentFeatures: thisFeatureFound = presentFeatures[foundFeature][1] #comparing tRNAscan-SE results with this, in case tRNAscan-SE was run if "trn" in thisFeatureFound.seq2.lower(): for tRNAFound in tRNAs: #down here we update the start and end positions of tRNAs found with Needle, with the #results outputted by tRNAScan-SE #tRNAconver = guarantees all tRNA names are in tRNA-Phe format if 'trna-' + tRNAFound.tRNAtype.lower() == tRNAconvert(thisFeatureFound.seq2.lower()): thisFeatureFound.startBase = min(tRNAFound.tRNAcoordinates[0], tRNAFound.tRNAcoordinates[1])
destFile = pathOfFinalResults + args.processName + '.unordered.maf' shutil.copyfile(pathOfMafResult, destFile) destFile = pathOfFinalResults + args.processName + '.unordered.caf' shutil.copyfile(pathOfCafResult, destFile) print '## Final sequence saved to %s' % pathOfFinalResults #from now on, just checking how the build went to output to user and then annotate print '## Now running tRNAscan-SE to check the final build...' if args.skipTrnaScan == True: print '' print '## --skiptrna is turned on, going to ignore this check and move on...' fifthStep = tRNAscanChecker.tRNAscanCheck(resultFile, fourthStep[0], args.skipTrnaScan, args.organismType, args.coveCutOff, args.buildBacteria, args.buildArchea) #returns a Assembly object #with statistics and alignment info if args.skipTrnaScan == False: print '## %s tRNAs were found.' % len(fifthStep) if args.ignoreFirstBuildChecks == False: print '' print '## Procceding to genomic check and annotation...' print '' #time to look for genomic features, searching according to the -o flag #or with the genbank reference that was given organismType = args.organismType module_dir = os.path.dirname(__file__) module_dir = os.path.abspath(module_dir) '''
def checkResults(processName, pathToWork, sizeToLook, refSeqFile = None, cutoffValue = (2500,18000), blasteVal = 10.0, blastHitSizePercentage = 0.625,
                usingSOAP = True, numberOfReadGroups = 1, buildCloroplast = False, skipTrnaScan = False, circularSize = 40, circularOffSet = 220,
                cutoffEquality = 0.625, organismType = 2, blastFolder = 'installed', noExtension = False, ignoreFirstBuildChecks = False, coveCutOff = 8,
                buildBacteria = False, buildArchea = False, validContigs = 1):
    '''
    Run the post-build checks on "best_query.fasta" and report them on stdout.

    Three checks are performed, in this order:
      1. circularizationCheck.circularizationCheck() on the build;
      2. tRNAscanChecker.tRNAscanCheck(), whose result object is what this
         function hands back when the build is not considered finished;
      3. geneChecker.geneCheck() against a reference GenBank file, unless
         ignoreFirstBuildChecks is set.

    Parameters actually read by this function:
      refSeqFile             -- optional user reference; when its name does not end
                                in '.fasta' it is read as GenBank and used as the
                                feature reference instead of the bundled one.
      buildCloroplast        -- selects references/cloroplast.gb and the 37-tRNA target.
      buildBacteria,
      buildArchea            -- only looked at when organismType == 11; also forwarded
                                to tRNAscanCheck().
      organismType           -- genetic code number used to pick the bundled reference.
      skipTrnaScan           -- forwarded to tRNAscanCheck(); also changes the progress
                                message and one early return (see below).
      circularSize,
      circularOffSet,
      blastFolder            -- forwarded to circularizationCheck().
      cutoffEquality         -- forwarded to geneCheck().
      coveCutOff             -- forwarded to tRNAscanCheck().
      ignoreFirstBuildChecks -- when set, geneCheck() is not run and the feature
                                lists are empty.
      validContigs           -- stored on the returned object as .validContigs.

    processName, pathToWork, sizeToLook, cutoffValue, blasteVal,
    blastHitSizePercentage, usingSOAP, numberOfReadGroups and noExtension are
    accepted but not used inside this function.

    Returns True when the tRNA count matches the target, the build is circular
    and no expected feature is missing; otherwise returns the object produced by
    tRNAscanCheck() (with .validContigs and .checkFeatures attached).
    '''
    genBankReference = False

    #Let's set the default gene checking genbank file according to the organismType flag
    #and if a genbank reference was given in the -r flag, it will be changed
    module_dir = os.path.abspath(os.path.dirname(__file__))

    #Genetic code numbering the organismType values below refer to:
    #   1. The Standard Code
    #   2. The Vertebrate Mitochondrial Code
    #   3. The Yeast Mitochondrial Code
    #   4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code
    #   5. The Invertebrate Mitochondrial Code
    #   6. The Ciliate, Dasycladacean and Hexamita Nuclear Code
    #   9. The Echinoderm and Flatworm Mitochondrial Code
    #   10. The Euplotid Nuclear Code
    #   11. The Bacterial, Archaeal and Plant Plastid Code
    #   12. The Alternative Yeast Nuclear Code
    #   13. The Ascidian Mitochondrial Code
    #   14. The Alternative Flatworm Mitochondrial Code
    #   16. Chlorophycean Mitochondrial Code
    #   21. Trematode Mitochondrial Code
    #   22. Scenedesmus obliquus Mitochondrial Code
    #   23. Thraustochytrium Mitochondrial Code
    #   24. Pterobranchia Mitochondrial Code
    #   25. Candidate Division SR1 and Gracilibacteria Code
    if buildCloroplast:
        referenceName = 'references/cloroplast.gb'
    elif organismType == 11: #plant plastid or bacterial or archea
        if buildBacteria:
            referenceName = 'references/bacteria.gb' #some bacterial DNA
        elif buildArchea:
            referenceName = 'references/archea.gb' #some archea DNA
        else:
            referenceName = 'references/magnolia.gb'
    else:
        #Every other code falls back to the human reference (2, vertebrate, is the default).
        #This used to be "elif organismType != 1", which left the reference unbound for
        #organismType == 1 and made the geneCheck() call below fail with UnboundLocalError.
        bundledReferences = {
            3: 'references/yeast.gb',       #yeast
            5: 'references/beetle.gb',      #invertebrate, insect
            6: 'references/paramecium.gb',  #ciliate
        }
        referenceName = bundledReferences.get(organismType, 'references/human.gb')
    refSeqFileForGenes = os.path.join(module_dir, referenceName)

    #we don't need a sequence reference for this check
    #if user gave a reference file not in fasta, let's consider it to look for features
    if refSeqFile is not None and not refSeqFile.endswith('.fasta'):
        genBankReference = True
        #The parsed record is not used afterwards; the read is kept so that a reference
        #which cannot be read as a GenBank record still fails here, as it did before.
        SeqIO.read(refSeqFile, "genbank", generic_dna)
        refSeqFileForGenes = refSeqFile

    #returns a tuple with True or False and coordinates
    circularCheck = circularizationCheck.circularizationCheck("best_query.fasta", circularSize, circularOffSet, blastFolder)
    isCircular = bool(circularCheck[0])
    if isCircular:
        print('Circularization was found.')
    else:
        print('Circularization was not found.')

    if not skipTrnaScan: #user did not want to skip tRNAscanChecker
        print("Checking for tRNAs...")
    else:
        print("Checking for genomic features...")

    checktRNA = tRNAscanChecker.tRNAscanCheck("best_query.fasta", circularCheck, skipTrnaScan, organismType, coveCutOff,
                                              buildBacteria, buildArchea)
    #add to the assembly object the number of contigs concatenated to create this super contig
    checktRNA.validContigs = validContigs

    #let's check for it's features to see if everything was built, if a .gb reference was given
    #we check against that, if not, we check against our own database inside references/ folder
    #according to -o flag
    if not ignoreFirstBuildChecks:
        checktRNA.checkFeatures = geneChecker.geneCheck(refSeqFileForGenes, "best_query.fasta", cutoffEquality, genBankReference,
                                                        blastFolder, organismType = organismType)
        presentFeatures = checktRNA.checkFeatures[0]
        importantFeatures = checktRNA.checkFeatures[1]
        splitFeatures = checktRNA.checkFeatures[2]
        print('Features found: %s / %s' % (len(presentFeatures) - len(splitFeatures), len(importantFeatures)))
    else:
        #NOTE(review): this is a 2-tuple while geneCheck() results are indexed up to [2] above;
        #confirm no caller reads checkFeatures[2] in relaxed mode.
        checktRNA.checkFeatures = ([],[])
        presentFeatures = checktRNA.checkFeatures[0]
        importantFeatures = checktRNA.checkFeatures[1]
        print('Ignoring genomic checks, since --relaxed is on.')

    print("Assessing checks...")
    foundCount = len(presentFeatures)
    expectedCount = len(importantFeatures)
    tRNACount = len(checktRNA)
    #NOTE(review): 21 / 37 are the tRNA totals the original code treats as "complete";
    #confirm 21 (rather than another count) is the intended non-chloroplast target.
    targetTRNACount = 37 if buildCloroplast else 21

    if tRNACount != targetTRNACount:
        print(str(tRNACount) + ' tRNAs were built.')
        if foundCount == expectedCount:
            print('And all genes and rRNAs were found.')
        elif foundCount > expectedCount:
            print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
        else:
            print('But not all genes and rRNAs were found, storing info...')
        print('')
        return checktRNA

    print('All ' + str(tRNACount) + ' tRNAs were found!')

    if not isCircular:
        print('But circularization could not be found yet, storing this info...')
        if foundCount == expectedCount:
            print('And all genes, tRNAs and rRNAs were found.')
        elif foundCount > expectedCount:
            print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
        else:
            print('But not all genes, tRNAs and rRNAs were found, storing info...')
        print('Rebuilding...')
        print('')
        return checktRNA

    print('And this build is circular.')
    if foundCount < expectedCount:
        print('But not all genes, tRNAs and rRNAs were found, storing info and rebuilding...')
        print('')
        return checktRNA

    if foundCount == expectedCount:
        print('And all genes, tRNAs and rRNAs were found.')
        if skipTrnaScan:
            #kept from the original: this path returns before the trailing blank line is printed
            return True
    else:
        print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
    print('')
    #if all tRNAs were found, all features were found and it's circular, just return true and stop the recursive part
    return True