def kFoldEvaluation(k, sessFile, featFile, weightFile, percent, typeFile):
    """Print per-sample and overall averages of added terms and session length.

    Sessions are loaded once from ``sessFile``.  For each of the ``k`` rounds,
    ``sampleSessions(sessions, percent)`` is called and every session in its
    second return value is inspected: the number of added terms reported by
    ``addedAndRemovedTerms`` and ``len(session)`` are accumulated.  One line
    is printed per round (total added terms, total session length, and both
    totals divided by the number of sessions), followed by one final line
    with the mean of the per-round averages.

    Args:
        k: Number of sampling rounds.
        sessFile: Passed to ``loadSessions``.
        featFile: Unused by this function; kept for caller compatibility.
        weightFile: Unused by this function; kept for caller compatibility.
        percent: Passed through to ``sampleSessions``.
        typeFile: Unused by this function; kept for caller compatibility.

    Returns:
        None.  Results are only printed.
    """
    sessions = loadSessions(sessFile)

    addedMeans = []
    lengthMeans = []
    for _ in range(k):
        # Only the second and third return values are used here.
        _x, y, uniqx, _uniqy = sampleSessions(sessions, percent)
        _termList, termDict = getTermList(uniqx)

        acount = 0.0
        ylen = 0.0
        for session in y:
            aTerms, _rTerms = addedAndRemovedTerms(
                session[0], session[1:], termDict)
            acount += len(aTerms)
            ylen += len(session)

        # An empty sample used to raise ZeroDivisionError; report 0.0 instead.
        nSessions = len(y)
        addedPerSession = acount / nSessions if nSessions else 0.0
        lengthPerSession = ylen / nSessions if nSessions else 0.0

        # A single pre-formatted string prints the same text as the Python 2
        # print statement with comma-separated items, and also parses on
        # Python 3.
        print('%s %s %s %s' % (acount, ylen, addedPerSession,
                               lengthPerSession))
        addedMeans.append(addedPerSession)
        lengthMeans.append(lengthPerSession)

    print('%s %s' % (np.mean(addedMeans), np.mean(lengthMeans)))
def predictTerms(queryList, y, qclusters):
    """Score cluster-based term predictions against the terms added in ``y``.

    The function makes two passes over the sessions in ``y``:

    1. For every session with at least one added term (as reported by
       ``addedAndRemovedTerms``), ``getPrecRecall(termList, aTerms)`` is
       evaluated against the full term list built from ``queryList``; the
       averages are printed under the label 'Oracle prec and recall'.
    2. For every such session, ``ScoreClusterTerms.scoreWithCosine`` proposes
       terms from ``qclusters`` (after ``toTerms``), and
       ``getClustPrecRecall`` returns two per-position lists that are
       collected position by position and averaged.

    Args:
        queryList: Passed to ``getTermList`` to build the term list and dict.
        y: Iterable of sessions; ``session[0]`` is used as the query and
            ``session[1:]`` as the remainder of the session.
        qclusters: Passed to ``toTerms`` to obtain clusters and their index.

    Returns:
        A pair of dicts ``(retPrec, retRecall)`` keyed by list position
        (0-based), holding the mean of the first and second list returned by
        ``getClustPrecRecall`` respectively.  Both are empty when no session
        was scored.
    """
    termList, termDict = getTermList(queryList)
    oracle_prec = 0.0
    oracle_mrr = 0.0
    added = 0.0
    cScorer = ScoreClusterTerms()

    # Pass 1: baseline computed against the complete term list.
    for session in y:
        query = session[0]
        aTerms, _rTerms = addedAndRemovedTerms(query, session[1:], termDict)
        if len(aTerms) > 0:
            prec1, mrr1 = getPrecRecall(termList, aTerms)
            added += 1.0
            oracle_prec += prec1
            oracle_mrr += mrr1

    # With no session contributing, the averages used to raise
    # ZeroDivisionError; report 0.0 instead.
    oraclePrecAvg = oracle_prec / added if added else 0.0
    oracleMrrAvg = oracle_mrr / added if added else 0.0
    # Prints use one pre-formatted string so the text matches the Python 2
    # print statement (the label keeps its trailing space, hence the double
    # space) and the line also parses on Python 3.
    print('Oracle prec and recall  %s %s %s'
          % (oraclePrecAvg, oracleMrrAvg, added))

    # Pass 2: predictions taken from the query clusters.
    clusters, clusIndex = toTerms(qclusters)
    lim = 5  # limit handed to scoreWithCosine
    i = 0    # number of sessions that were actually scored
    prec = {}
    mrr = {}
    pf = 0.0  # scored sessions whose first metric list has a positive sum
    pr = 0.0  # scored sessions whose second metric list has a positive sum
    for session in y:
        query = session[0].strip()
        qSet = getQueryTerms(query)
        aTerms, _rTerms = addedAndRemovedTerms(query, session[1:], termDict)
        if len(aTerms) == 0:
            continue
        terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim)
        if len(terms) == 0:
            continue

        # Both results are lists indexed by position.
        prec1, mrr1 = getClustPrecRecall(terms, aTerms)
        if sum(prec1) > 0:
            pf += 1.0
        if sum(mrr1) > 0:
            pr += 1.0
        for topk, precAtK in enumerate(prec1):
            prec.setdefault(topk, []).append(precAtK)
            mrr.setdefault(topk, []).append(mrr1[topk])
        # NOTE(review): the original indentation was lost, so the nesting
        # level of this increment is inferred; it is assumed to count only
        # sessions that produced metrics -- confirm against history.
        i += 1

    retPrec = {}
    retRecall = {}
    for entry, ls in prec.items():
        meanPrec = np.mean(ls)
        print('Prec @ %s %s' % (entry, meanPrec))
        retPrec[entry] = meanPrec
    for entry, ls in mrr.items():
        meanRecall = np.mean(ls)
        print('Recall @ %s %s' % (entry, meanRecall))
        retRecall[entry] = meanRecall

    # Same guard as above: nothing scored means 0.0 rather than a crash.
    pfShare = pf / i if i else 0.0
    prShare = pr / i if i else 0.0
    print('Percentage  %s %s' % (pfShare, prShare))
    return retPrec, retRecall