def dataGen(percHold,allIn,ncIn,typeIn):
    """Build the shuffled positive and negative datasets for the models.

    The positive set is read with ``fastaU.read`` from ``typeIn``; the
    negative set is produced by ``negativeDataGen.compileData``, which is
    given the number of positive sequences together with ``ncIn`` and
    ``allIn``.  Each set is then passed through ``randomizeSeqs``.

    ``percHold`` is accepted but is not used inside this function.

    Returns a 4-tuple: (positive headers, positive sequences,
    negative headers, negative sequences).
    """
    # 1. create positive and negative datasets
    posHeaders, posSeqs = fastaU.read(typeIn, True, False)
    negHeaders, negSeqs = negativeDataGen.compileData(
        len(posSeqs), ncIn, 40, allIn, 40, True, 20, True, False
    )

    # 2. randomize the data (positives first, then negatives)
    posHeaders, posSeqs = randomizeSeqs(posHeaders, posSeqs)
    negHeaders, negSeqs = randomizeSeqs(negHeaders, negSeqs)

    return posHeaders, posSeqs, negHeaders, negSeqs
# NOTE(review): mFile / mTemplate are opened outside the visible part of the
# file; these two calls are kept exactly as they appear here.
mFile.close()
mTemplate.close()

# ---- run configuration ----
foldN = 10
maxWindow = 4
prefix = "haca-1"

# The output directory shares its name with the run prefix.
directory = prefix
makeDirectory(directory)
masterName = "run_%s" % prefix

# ---- read and generate ----
print("reading in fastas...")
pHead, pSeq = fastaU.read(hacaFastaGB, True, False)
nHead, nSeq = negativeDataGen.compileData(
    len(pSeq), ncRNAGB, 40, allDNAGB, 40, True, 20, True, False
)

# ---- randomize the data ----
print("randomizing data...")
pHead, pSeq = randomizeSeqs(pHead, pSeq)
nHead, nSeq = randomizeSeqs(nHead, nSeq)

# ---- make type lists: 1 per positive entry, 0 per negative entry ----
print("making type lists...")
pType = [1] * len(pHead)
nType = [0] * len(nHead)

# ---- make holdouts: split each class with foldSplit using foldN ----
print("spliting for fold validation...")
pHeadL, pSeqL, pTypeL = foldSplit(foldN, pHead, pSeq, pType)
nHeadL, nSeqL, nTypeL = foldSplit(foldN, nHead, nSeq, nType)