예제 #1
0
			alignCutOff = 0.45
			print "alignCutOff wasn't given, assuming 0.5"
		try:
			coveCutOff = int(sys.argv[6])
			print('coveCutOff: %s' % coveCutOff)
		except:
			coveCutOff = 7
			print "coveCutOff wasn't given, assuming 7"
		x = geneCheck(genBankReference, resultFile, 0.50, True, blastFolder, organismType, alignCutOff)
		print 'Features found: %s' % len(x[0])
		print 'Total features: %s' % len(x[1])
		print ''
		print('Running tRNAscan-SE...')
		presentFeatures = x[0]

		assemblyCheck = tRNAscanChecker.tRNAscanCheck(resultFile, True, False, organismType, coveCutOff) #returns a Assembly object with statistics and alignment info 
		tRNAs = assemblyCheck.tRNAs
		
		listOfFeaturesToOutput = []
		listOfFoundTRNAs = []
		for foundFeature in presentFeatures:
			thisFeatureFound = presentFeatures[foundFeature][1]
			#comparing tRNAscan-SE results with this, in case tRNAscan-SE was run
			if "trn" in thisFeatureFound.seq2.lower():
				for tRNAFound in tRNAs:
				#down here we update the start and end positions of tRNAs found with Needle, with the
				#results outputted by tRNAScan-SE
				#tRNAconver = guarantees all tRNA names are in tRNA-Phe format
					if 'trna-' + tRNAFound.tRNAtype.lower() == tRNAconvert(thisFeatureFound.seq2.lower()):
						thisFeatureFound.startBase = min(tRNAFound.tRNAcoordinates[0],
										tRNAFound.tRNAcoordinates[1])
예제 #2
0
			destFile = pathOfFinalResults + args.processName + '.unordered.maf'
			shutil.copyfile(pathOfMafResult, destFile)
			
			destFile = pathOfFinalResults + args.processName + '.unordered.caf'
			shutil.copyfile(pathOfCafResult, destFile)
				
			print '## Final sequence saved to %s' % pathOfFinalResults 

			#from now on, just checking how the build went to output to user and then annotate
			print '## Now running tRNAscan-SE to check the final build...'
			
			if args.skipTrnaScan == True:
				print ''
				print '## --skiptrna is turned on, going to ignore this check and move on...'

			fifthStep = tRNAscanChecker.tRNAscanCheck(resultFile, fourthStep[0], args.skipTrnaScan, args.organismType, args.coveCutOff,
								args.buildBacteria, args.buildArchea) #returns a Assembly object 
												      #with statistics and alignment info
			if args.skipTrnaScan == False:
				print '## %s tRNAs were found.' % len(fifthStep)

			if args.ignoreFirstBuildChecks == False:
				print ''
				print '## Procceding to genomic check and annotation...'
				print ''

				#time to look for genomic features, searching according to the -o flag
				#or with the genbank reference that was given
				organismType = args.organismType
				module_dir = os.path.dirname(__file__)
				module_dir = os.path.abspath(module_dir)
				'''
예제 #3
0
def _defaultGeneReference(moduleDir, organismType, buildCloroplast, buildBacteria, buildArchea):
	'''
	Pick the bundled GenBank file (inside the references/ folder next to this module)
	that is used to look for genomic features when the user gave no GenBank reference.

	moduleDir        -- absolute path of the folder holding the references/ directory
	organismType     -- genetic code number given by the -o flag
	buildCloroplast  -- if set, the chloroplast reference wins over organismType
	buildBacteria    -- only looked at when organismType is 11
	buildArchea      -- only looked at when organismType is 11 and buildBacteria is not set

	Returns the path of the chosen reference file. Every organismType without a
	dedicated reference falls back to the human one.
	'''
	if buildCloroplast:
		referenceName = 'references/cloroplast.gb'
	elif organismType == 11: #plant plastid or bacterial or archea
		if buildBacteria:
			referenceName = 'references/bacteria.gb' #some bacterial DNA
		elif buildArchea:
			referenceName = 'references/archea.gb' #some archea DNA
		else:
			referenceName = 'references/magnolia.gb'
	elif organismType == 3: #yeast
		referenceName = 'references/yeast.gb'
	elif organismType == 5: #invertebrate, insect
		referenceName = 'references/beetle.gb'
	elif organismType == 6: #ciliate
		referenceName = 'references/paramecium.gb'
	else: #human, 2(vertebrate), also the default option
		#this must be a plain else: with a condition such as "organismType != 1" the
		#standard code (1) would leave no reference selected at all
		referenceName = 'references/human.gb'
	return os.path.join(moduleDir, referenceName)

def checkResults(processName, pathToWork, sizeToLook, refSeqFile = None, cutoffValue = (2500,18000), blasteVal = 10.0,
                    blastHitSizePercentage = 0.625, usingSOAP = True, numberOfReadGroups = 1, buildCloroplast = False,
                    skipTrnaScan = False, circularSize = 40, circularOffSet = 220, cutoffEquality = 0.625, organismType = 2,
                    blastFolder = 'installed', noExtension = False, ignoreFirstBuildChecks = False, coveCutOff = 8,
                    buildBacteria = False, buildArchea = False, validContigs = 1):
	'''
	Do checks for main checker function...
	Runs, on "best_query.fasta", the circularization check, the tRNAscanChecker check and
	(unless ignoreFirstBuildChecks is set) the geneChecker feature check, printing a
	summary of what was found along the way.

	Parameters used here:
	refSeqFile              -- optional reference; when given and its name does not end in
	                           '.fasta' it is read as GenBank and used to look for features
	buildCloroplast         -- use the chloroplast reference and expect 37 tRNAs instead of 21
	skipTrnaScan            -- forwarded to tRNAscanChecker.tRNAscanCheck
	circularSize,
	circularOffSet          -- forwarded to circularizationCheck.circularizationCheck
	cutoffEquality          -- forwarded to geneChecker.geneCheck
	organismType            -- genetic code number (-o flag), see list below
	blastFolder             -- forwarded to the circularization and gene checks
	ignoreFirstBuildChecks  -- skip the gene check (--relaxed)
	coveCutOff              -- forwarded to tRNAscanChecker.tRNAscanCheck
	buildBacteria,
	buildArchea             -- choose the bacterial/archeal reference when organismType is 11;
	                           also forwarded to tRNAscanChecker.tRNAscanCheck
	validContigs            -- number of contigs concatenated to create this super contig,
	                           stored on the returned object

	processName, pathToWork, sizeToLook, cutoffValue, blasteVal, blastHitSizePercentage,
	usingSOAP, numberOfReadGroups and noExtension are accepted but not used by this function.

	Genetic codes accepted as organismType:
		1. The Standard Code
		2. The Vertebrate Mitochondrial Code
		3. The Yeast Mitochondrial Code
		4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code
		5. The Invertebrate Mitochondrial Code
		6. The Ciliate, Dasycladacean and Hexamita Nuclear Code
		9. The Echinoderm and Flatworm Mitochondrial Code
		10. The Euplotid Nuclear Code
		11. The Bacterial, Archaeal and Plant Plastid Code
		12. The Alternative Yeast Nuclear Code
		13. The Ascidian Mitochondrial Code
		14. The Alternative Flatworm Mitochondrial Code
		16. Chlorophycean Mitochondrial Code
		21. Trematode Mitochondrial Code
		22. Scenedesmus obliquus Mitochondrial Code
		23. Thraustochytrium Mitochondrial Code
		24. Pterobranchia Mitochondrial Code
		25. Candidate Division SR1 and Gracilibacteria Code

	Returns True when the expected number of tRNAs was found, the build is circular and
	at least as many features were found as were looked for. In every other case returns
	the object produced by tRNAscanChecker.tRNAscanCheck, with .validContigs and
	.checkFeatures set on it.
	'''

	genBankReference = False

	#Let's set the default gene checking genbank file according to the organismType flag
	#and if a genbank reference was given in the -r flag, it will be changed
	module_dir = os.path.abspath(os.path.dirname(__file__))
	refSeqFileForGenes = _defaultGeneReference(module_dir, organismType, buildCloroplast, buildBacteria, buildArchea)

	#we don't need a sequence reference for this check
	#if user gave a reference file not in fasta, let's consider it to look for features
	if refSeqFile is not None and not refSeqFile.endswith('.fasta'):
		genBankReference = True
		#the parsed record itself is not used here; the read is kept so the call still happens
		#(presumably so an unreadable reference fails early -- confirm)
		SeqIO.read(refSeqFile, "genbank", generic_dna)
		refSeqFileForGenes = refSeqFile

	#returns a tuple with True or False and coordinates
	circularCheck = circularizationCheck.circularizationCheck("best_query.fasta", circularSize, circularOffSet, blastFolder)
	isCircular = circularCheck[0]
	if isCircular:
		print('Circularization was found.')
	else:
		print('Circularization was not found.')

	if skipTrnaScan:
		print("Checking for genomic features...")
	else: #user did not want to skip tRNAscanChecker
		print("Checking for tRNAs...")
	checktRNA = tRNAscanChecker.tRNAscanCheck("best_query.fasta", circularCheck, skipTrnaScan, organismType, coveCutOff, buildBacteria,
						buildArchea)
	#add to the assembly object the number of contigs concatenated to create this super contig
	checktRNA.validContigs = validContigs

	#let's check for it's features to see if everything was built,
	#if a .gb reference was given we check against that, if not, we check against
	#our own database inside references/ folder according to -o flag
	if ignoreFirstBuildChecks:
		#both lists are empty, so the assessment below reports every feature as found
		checktRNA.checkFeatures = ([],[])
		presentFeatures = checktRNA.checkFeatures[0]
		importantFeatures = checktRNA.checkFeatures[1]
		print('Ignoring genomic checks, since --relaxed is on.')
	else:
		checktRNA.checkFeatures = geneChecker.geneCheck(refSeqFileForGenes, "best_query.fasta", cutoffEquality, genBankReference, blastFolder,
                                                                organismType = organismType)
		presentFeatures = checktRNA.checkFeatures[0]
		importantFeatures = checktRNA.checkFeatures[1]
		splitFeatures = checktRNA.checkFeatures[2]
		print('Features found: %s / %s' % (len(presentFeatures) - len(splitFeatures), len(importantFeatures)))

	print("Assessing checks...")
	numberOfPresent = len(presentFeatures)
	numberOfImportant = len(importantFeatures)
	numberOfTRNAs = len(checktRNA)
	#NOTE(review): 21 is kept from the original code; animal mitochondrial genomes are
	#usually described as carrying 22 tRNA genes -- confirm this threshold is intended
	expectedTRNAs = 37 if buildCloroplast else 21

	if numberOfTRNAs != expectedTRNAs:
		print(str(numberOfTRNAs) + ' tRNAs were built.')
		if numberOfPresent == numberOfImportant:
			print('And all genes and rRNAs were found.')
		elif numberOfPresent > numberOfImportant:
			print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
		else:
			print('But not all genes and rRNAs were found, storing info...')
		print('')
		return checktRNA

	print('All ' + str(numberOfTRNAs) + ' tRNAs were found!')

	if not isCircular:
		print('But circularization could not be found yet, storing this info...')
		if numberOfPresent == numberOfImportant:
			print('And all genes, tRNAs and rRNAs were found.')
		elif numberOfPresent > numberOfImportant:
			print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
		else:
			print('But not all genes, tRNAs and rRNAs were found, storing info...')
		print('Rebuilding...')
		print('')
		return checktRNA

	print('And this build is circular.')
	if numberOfPresent == numberOfImportant:
		print('And all genes, tRNAs and rRNAs were found.')
		if skipTrnaScan:
			return True
	elif numberOfPresent > numberOfImportant:
		print('And all genes, tRNAs and rRNAs were found, but some were split or duplicated.')
	else:
		print('But not all genes, tRNAs and rRNAs were found, storing info and rebuilding...')
		print('')
		return checktRNA
	print('')
	#if all tRNAs were found, all features were found and it's circular, just return true and stop the recursive part
	return True