fname = "".join([str(n),"_Mchains.p"]) Mchains = pickle.load(open(fname,"r")) print "fitting model to test set..." a = 0 for line in fileinput.input(): l = line.strip("\n") l = l.split("\t") l[0] = l[0].split("_")[0] if len(l[1])<=n: continue target = l[0] answer = np.array(Mchains.keys()) estimates = np.zeros( (len(Mchains.keys()),1)) temp = create_chain(len(ix),1) l[1] = l[1].replace("N",randomN()) temp = process_read(l[1],temp,n,ix) temp = temp/float(sum(temp)) for k,val in enumerate(Mchains.keys()): prob = euclid(temp,Mchains[val]) estimates[k] = np.log2(prob) best = np.argmin(estimates) print answer[best], target if answer[best] == target: a+=1 print "total of ",a, "fitted properly"
'''create a dictionnary containing all markov chains (order 6) for each chromosome''' Mchains = {} probchr = {} a = 0 for line in fileinput.input(): l = line.strip("\n") l = l.split("\t") l[0] = l[0].split("_")[0] if len(l[1])<=n: continue if not ( l[0] in Mchains.keys()): Mchains[l[0]] = create_chain(len(ix),1) probchr[l[0]] = 0 temp = create_chain(len(ix),1) l[1] = l[1].replace("N",randomN()) Mchains[l[0]]+=process_read(l[1],temp,n,ix) probchr[l[0]]+=1 sum_prob = float(np.sum(probchr.values())) #calculating Poisson distribution parameters (mu) for each Markov chain for k in Mchains.keys(): Mchains[k] = Mchains[k]/float(sum(Mchains[k])) probchr[k] = probchr[k]/sum_prob ''' For each chromosome, the occurences are converted to probability tables. For each chromosome, a |1-distance|**2 is calculated as a metric''' fname = "".join([str(n),"_Mchains.p"]) pickle.dump(Mchains,open((fname),"w"))