def findQueryCounts(queryFile):
    """Accumulate frequency-weighted co-occurrence counts of stemmed term pairs.

    queryFile: TSV file with one `query<TAB>frequency` record per line.
    Returns a dict mapping 'stem1 stem2' (unigrams taken in sorted order, so
    each unordered pair gets one canonical key) to the summed query frequency.
    """
    pairs = {}
    porter = stem.porter.PorterStemmer()
    # `with` closes the handle; the original leaked it (open() without close)
    with open(queryFile, 'r') as qfile:
        for line in qfile:
            split = line.strip().lower().split('\t')
            if len(split) < 2:
                continue  # skip malformed lines instead of raising IndexError
            query = split[0].strip()
            freq = float(split[1])
            qTerms = ' '.join(getQueryTerms(query))
            if len(qTerms) > 3:
                # unigrams only; sorted so pair keys are order-independent
                ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
                lngrams = len(ngrams)
                if lngrams > 1:
                    for i in range(lngrams - 1):
                        if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                            # hoisted: the original re-stemmed ngrams[i] on
                            # every inner-loop iteration though it is invariant
                            stemd1 = porter.stem(ngrams[i])
                            for j in range(i + 1, lngrams):
                                if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                                    key = stemd1 + ' ' + porter.stem(ngrams[j])
                                    pairs[key] = pairs.get(key, 0.0) + freq
    return pairs
def findSessionCounts(queryFile, outFile, wordSet):
    """Count within-session co-occurrences of vocabulary term pairs.

    queryFile: input consumed by getSessionWithQuery (one session = list of queries).
    outFile:   destination passed to writeDictToFile when the in-memory dict spills.
    wordSet:   vocabulary filter; only terms in this set are paired.

    Counts are spilled to disk whenever the dict reaches 9M entries; `sess`
    numbers the successive spill files.
    """
    coOccur = {}  #CoOccurrence();
    sess = 0    # index of the next spill file
    qid = 0.0   # running query counter, used only for progress logging
    qSet = set()
    for session in getSessionWithQuery(queryFile):
        qSet.clear()
        for query in session:
            qid += 1
            terms = getQueryTerms(query)
            if len(terms) > 0:
                # update() accepts any iterable; the original `qSet |= getQueryTerms(query)`
                # raises TypeError when getQueryTerms returns a list, and also
                # called the helper a second time for no reason
                qSet.update(terms)
            if qid % 1000000 == 0:
                print(qid)
                print(len(coOccur))
        #print len(session) , len(qSet);
        #for each query get nonEntTerms and update co-occurrence stats
        qTerms = ' '.join(qSet)
        if len(qTerms) > 3 and len(qSet) > 1:
            #print qSet;
            ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
            lngrams = len(ngrams)
            if lngrams > 1:
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(
                            ngrams[i]) > 2 and ngrams[i] in wordSet:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(
                                    ngrams[j]) > 2 and ngrams[j] in wordSet:
                                key = ngrams[i] + ' ' + ngrams[j]
                                coOccur[key] = coOccur.get(key, 0.0) + 1.0
        if len(coOccur) >= 9000000:
            writeDictToFile(outFile, coOccur, sess)
            # rebinding is enough; the original also called clear() first,
            # which was redundant
            coOccur = {}
            sess += 1
    # flush the tail: counts gathered since the last spill were silently
    # dropped by the original implementation
    if coOccur:
        writeDictToFile(outFile, coOccur, sess)
def calWordCount(fileName):
    """Print term frequencies for queries in a TSV file.

    fileName: TSV file whose second column holds the query text.
    Prints `term <TAB> count` for every term that occurs more than 3 times
    and is longer than 2 characters, in descending count order.
    """
    words = {}
    # `with` closes the handle; the original leaked it
    with open(fileName, 'r') as qfile:
        for line in qfile:
            split = line.lower().split('\t')
            if len(split) < 2:
                continue  # skip malformed lines instead of raising IndexError
            query = split[1].strip()
            for term in getQueryTerms(query):
                words[term] = words.get(term, 0.0) + 1
    for term, count in sorted(words.items(), reverse=True, key=lambda x: x[1]):
        if count > 3 and len(term) > 2:
            # same output as the original Py2 `print term, '\t', count`
            print('%s \t %s' % (term, count))
def getURLFeatures(fileName, wordFeat):
    """Update per-term URL statistics from a query/URL log.

    fileName: TSV file; column 0 is the query, column 1 holds "(url, count)"
              tuples matched by the module-level `linkP` regex.
    wordFeat: dict mapping stemmed terms to objects exposing
              updateURLStats(url, count); mutated in place.
    """
    # `with` closes the handle; the original leaked it
    with open(fileName, 'r') as qfile:
        for line in qfile:
            split = line.split('\t')
            queryTerms = getQueryTerms(split[0])
            links = linkP.findall(split[1])  #ast.literal_eval(split[3])
            urlFeatures = []
            for tup in links:
                # parse "(url, count)"; strip the surrounding parens
                try:
                    parts = tup.rsplit(',')
                    url = parts[0][1:].strip()
                    count = int(parts[1][:-1])
                    urlFeatures.append((url, count))
                except (IndexError, ValueError):
                    # narrowed from a bare `except:` that hid every error,
                    # including KeyboardInterrupt; malformed tuples are skipped
                    pass
            #print split[0], len(urlFeatures);
            for oentry in queryTerms:
                stemmed = porter.stem(oentry)
                if len(stemmed) > 2 and stemmed in wordFeat:
                    for urlPair in urlFeatures:
                        wordFeat[stemmed].updateURLStats(urlPair[0], urlPair[1])
def predictTerms(queryList, y, qclusters):
    # Evaluate cluster-based term prediction against sessions in `y`.
    # queryList: source queries used to build the candidate term list.
    # y:         sessions; session[0] is the initial query, session[1:] follow-ups.
    # qclusters: query clusters converted to term clusters via toTerms().
    # Returns (retPrec, retRecall): dicts mapping top-k rank -> mean precision / mean MRR.
    termList, termDict = getTermList(queryList)
    # First pass: oracle upper bound scoring the full term list against the
    # terms users actually added later in the session.
    oracle_prec = 0.0
    oracle_mrr = 0.0
    added = 0
    cScorer = ScoreClusterTerms()
    for session in y:
        query = session[0]
        aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
        if len(aTerms) > 0:
            prec1, mrr1 = getPrecRecall(termList, aTerms)
            added += 1.0
            oracle_prec += prec1
            oracle_mrr += mrr1
    # NOTE(review): divides by `added` — raises ZeroDivisionError if no session
    # has added terms; presumably acceptable for this dataset — confirm.
    print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added, added
    #porter = stem.porter.PorterStemmer();
    clusters, clusIndex = toTerms(qclusters)
    lim = 5  # max candidate terms requested from the cluster scorer
    i = 0    # sessions seen (denominator for the coverage percentages below)
    prec = {}  # top-k rank -> list of per-session precision values
    mrr = {}   # top-k rank -> list of per-session MRR values
    pf = 0.0   # sessions with at least one precision hit
    pr = 0.0   # sessions with at least one MRR hit
    # Second pass: score cosine-ranked cluster terms against the added terms.
    for session in y:
        query = session[0].strip()
        qSet = getQueryTerms(query)  #getQueryTermsStemmed(query, porter);
        aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
        if len(aTerms) > 0:
            terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim)
            if len(terms) > 0:
                #print len(aTerms), len(terms)
                prec1, mrr1 = getClustPrecRecall(terms, aTerms)  # returns a list
                #print 'METRIC',i, prec1, mrr1
                #print topk , prec1, mrr1
                if sum(prec1) > 0:
                    pf += 1.0
                if sum(mrr1) > 0:
                    pr += 1.0
                # accumulate per-rank metrics across sessions
                for topk in range(len(prec1)):
                    if topk not in prec:
                        prec[topk] = []
                        mrr[topk] = []
                    prec[topk].append(prec1[topk])
                    mrr[topk].append(mrr1[topk])
        i += 1
    # Average the per-rank lists into the returned dicts.
    retPrec = {}
    retRecall = {}
    for entry, ls in prec.items():
        print 'Prec @', entry, np.mean(ls)
        retPrec[entry] = np.mean(ls)
    for entry, ls in mrr.items():
        print 'Recall @', entry, np.mean(ls)
        retRecall[entry] = np.mean(ls)
    print 'Percentage ', pf / i, pr / i
    return retPrec, retRecall
def findNonEntTerms(self, query, sdict):
    # Remove every known entity surface form from the query text, then
    # tokenize what is left into non-entity terms.
    stripped = query
    for surface in sdict:
        stripped = stripped.replace(surface, '')
    return getQueryTerms(stripped)
def main(argv): ipaddress = "localhost" # dexter object tagURL = "http://" + ipaddress + ":8080/rest/annotate" catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories" dexter = Dexter(tagURL, catURL, argv[5]) # load the Category co-occurrence bit catCoMan = CoOcManager(argv[4], CoOccurrence(), " ") # category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster) # ranker ranker = Ranker() totalVocab = loadFileInList(argv[6]) # task extraction # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000); qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab) # taskK = argv[5][argv[5].rfind('/')+1:] wordFeatMan = None # WordManager(argv[8],False); # expansion # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan); entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan) # term expansion coOccExp = CoOccurExpansion(catCoMan, None, ranker) # randomWalk # randWalk = RandomWalk(argv[2],argv[3],ranker) prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} """ sess_prec = {}; sess_mrr = {}; """ covered = {} i = 0 porter = stem.porter.PorterStemmer() ttype = argv[10] for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]): query = session[0] qSet = getQueryTerms(query) # print 'Title, Summary clicked ',cTitle[0], cSummary[0]; aTerms = None # cText = normalize(' '.join(cTitle[0]),porter); if ttype == "query": aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) elif ttype == "title": aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, totalVocab, porter, 
range(1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1)) aTerms = aTerms | bTerms print i, "Query", query, aTerms, len(aTerms) if len(aTerms) > 0: # and query not in covered: covered[query] = 1 coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5) # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5); entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5) qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5) # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5) # randExpTerms = randWalk.expandTextWithStep(query,55,105,5) if not entStatus1: print i, "Ent False", query # addLen = getBand(len(aTerms)); # if addLen not in sess_prec: # sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # for noTerms in entExpTerms1.keys(): # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms; # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms); # prec = updateStats(noTerms, 'ent',prec1, prec); # mrr = updateStats(noTerms, 'ent',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1; # for noTerms in entExpTerms2.keys(): print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms) prec = updateStats(noTerms, "entSub", prec1, prec) mrr = updateStats(noTerms, "entSub", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec) ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr) # sess_prec[addLen] = updateStats(noTerms, 
'ent',prec1, sess_prec[addLen]) # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 for noTerms in qccTaskTerms.keys(): print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms) prec = updateStats(noTerms, "qccTask", prec1, prec) mrr = updateStats(noTerms, "qccTask", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec) ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]); """ print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 # for noTerms in htcTaskTerms.keys(): # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms) # prec = updateStats(noTerms, 'htcTask',prec1, prec) # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]); # # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1 for noTerms in coExpTerms.keys(): print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms) prec = updateStats(noTerms, "co", prec1, prec) mrr = updateStats(noTerms, "co", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "co", prec1, ent_prec) ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'co',prec1, 
sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]); """ print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 else: pass # print 'NO ADDED TERMS in', i; i += 1 printMetric(prec, "entSub", "Prec") printMetric(mrr, "entSub", "Mrr") printMetric(prec, "ent", "Prec") printMetric(mrr, "ent", "Mrr") printMetric(prec, "htcTask", "Prec") printMetric(mrr, "htcTask", "Mrr") printMetric(prec, "qccTask", "Prec") printMetric(mrr, "qccTask", "Mrr") printMetric(prec, "co", "Prec") printMetric(mrr, "co", "Mrr") printMetric(ent_prec, "entSub", "EntPrec") printMetric(ent_mrr, "entSub", "EntMrr") printMetric(ent_prec, "ent", "EntPrec") printMetric(ent_mrr, "ent", "EntMrr") printMetric(ent_prec, "htcTask", "EntPrec") printMetric(ent_mrr, "htcTask", "EntMrr") printMetric(ent_prec, "qccTask", "EntPrec") printMetric(ent_mrr, "qccTask", "EntMrr") printMetric(ent_prec, "co", "EntPrec") printMetric(ent_mrr, "co", "EntMrr") plotMultipleSys( prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png", "Term Prediction Prec Plot", ) plotMultipleSys( mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png", "Term Prediction MRR Plot", ) plotMultipleSys( ent_prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png", "Term Prediction Prec Plot (Ent queries)", ) plotMultipleSys( ent_mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png", "Term Prediction MRR Plot (Ent queries)", ) # htcTask.closeIndex(); qccTask.closeIndex() """