Пример #1
0
def dataGen(percHold,allIn,ncIn,typeIn):
	"""Build the shuffled positive and negative datasets for the models.

	The positive set is read from ``typeIn`` via ``fastaU.read``.  The
	negative set comes from ``negativeDataGen.compileData``, which receives
	the number of positive sequences together with ``ncIn`` and ``allIn``.
	Both sets are then passed through ``randomizeSeqs``.

	NOTE(review): ``percHold`` is accepted but never used in this body.
	NOTE(review): the literal arguments (40, 20 and the boolean flags) are
	interpreted by the helpers -- confirm their meaning there.

	Returns a 4-tuple: the two values for the positive set followed by the
	two values for the negative set (presumably headers and sequences,
	judging by the helper names -- TODO confirm).
	"""
	# 1. positive set, read from the input given in typeIn
	posHead, posSeq = fastaU.read(typeIn, True, False)

	# 2. negative set; the positive count is the first argument
	posCount = len(posSeq)
	negHead, negSeq = negativeDataGen.compileData(
		posCount, ncIn, 40, allIn, 40, True, 20, True, False)

	# 3. randomize each set, positives first (call order is kept in case
	#    the helper draws from shared random state)
	posHead, posSeq = randomizeSeqs(posHead, posSeq)
	negHead, negSeq = randomizeSeqs(negHead, negSeq)

	return (posHead, posSeq, negHead, negSeq)
Пример #2
0
	mFile.close()
	mTemplate.close()
		
		

# Run configuration and data preparation for the "haca-1" run.
# Progress messages use print("...") with a single string argument: the
# output is identical on Python 2 and the form is also valid Python 3,
# unlike the bare print statement.
maxWindow = 4
prefix = "haca-1"
foldN = 10  # number of folds handed to foldSplit below
directory = prefix  # the working directory is named after the prefix
makeDirectory(directory)
masterName = "run_" + prefix

print("reading in fastas...")
# Read the positive set, then generate a negative set; compileData is
# given the number of positive sequences as its first argument.
# NOTE(review): the meaning of the remaining literal arguments (40, 20 and
# the boolean flags) is defined by the helpers -- confirm there.
pHead,pSeq = fastaU.read(hacaFastaGB,True,False)
nHead,nSeq = negativeDataGen.compileData(len(pSeq),ncRNAGB,40,allDNAGB,40,True,20,True,False)

print("randomizing data...")
# Randomize each set independently (positives first).
pHead,pSeq = randomizeSeqs(pHead,pSeq)
nHead,nSeq = randomizeSeqs(nHead,nSeq)

print("making type lists...")
# One type entry per record: 1 for the positive set, 0 for the negative set.
pType = [1]*len(pHead)
nType = [0]*len(nHead)

# Split each set with foldSplit, passing foldN as the first argument.
print("spliting for fold validation...")
pHeadL,pSeqL,pTypeL = foldSplit(foldN,pHead,pSeq,pType)
nHeadL,nSeqL,nTypeL = foldSplit(foldN,nHead,nSeq,nType)