def _loadEntityQueries(featFile):
    """Map each query of more than two words in featFile to its entity dict."""
    queries = {}
    # The context manager closes the handle; the original left it open.
    with open(featFile, 'r') as featHandle:
        for line in featHandle:
            fields = line.split('\t')
            # Column 0 is the query text and column 5 a dict literal.
            # Shorter rows (e.g. a trailing blank line) are skipped instead
            # of raising IndexError.
            if len(fields) < 6:
                continue
            if len(fields[0].split()) > 2:
                # Parsed only for queries that are kept.
                queries[fields[0].strip()] = ast.literal_eval(fields[5])
    return queries


def _clusterEntityScores(cluster, queries):
    """Sum, per entity, the values attached to the cluster's known queries."""
    scores = {}
    for query in cluster:
        if query in queries:
            for entity, value in queries[query].items():
                scores[entity] = scores.get(entity, 0.0) + value
    return scores


def _countContextTerms(cluster, queries, bestEnt):
    """Count query tokens co-occurring with bestEnt inside the cluster.

    Tokens are dropped when they have at most two characters, are in
    stopSet, are contained in bestEnt, or are contained in ashleelString.
    """
    counts = {}
    for query in cluster:
        if query in queries and bestEnt in queries[query]:
            for term in query.split():
                if (len(term) > 2 and term not in stopSet
                        and term not in bestEnt
                        and term not in ashleelString):
                    counts[term] = counts.get(term, 0.0) + 1.0
    return counts


def _selectFrequentTerms(counts):
    """Return up to eight terms seen more than once, most frequent first.

    Nothing is selected unless more than three distinct terms were counted.
    """
    selected = set()
    ranked = sorted(counts.items(), reverse=True, key=lambda kv: kv[1])
    if len(ranked) > 3:
        for term, freq in ranked:
            if freq > 1:
                selected.add(term)
            if len(selected) > 7:
                break
    return selected


def sampleEntityQueries(featFile, clusterFile):
    """Write sample term sets per dominant entity to 'EntityQueries.sample'.

    featFile is a tab-separated file whose first column is a query and whose
    sixth column is a dict literal (entity -> numeric value); only queries
    with more than two words are used.  clusterFile is handed to
    loadClusters; each cluster is iterated as a collection of query strings.

    For every cluster with more than four queries, the entity with the
    highest summed value is picked, and the frequent tokens of the cluster's
    queries mentioning that entity form a term set.  A term set is kept when
    it has more than three terms and fewer than 20% of them were already
    seen for that entity.  Entities with more than one kept term set are
    written out, one line per term set: entity, then the terms, all
    tab-separated.

    The output file is opened only once all clusters are processed, so a
    failure while reading the inputs does not truncate an existing file.

    NOTE(review): 'bestEnt.encode' followed by str concatenation relies on
    Python 2 string semantics.
    """
    queries = _loadEntityQueries(featFile)

    globalEnt = {}   # entity -> list of accepted term sets
    entTerms = {}    # entity -> every term already seen for that entity

    for cluster in loadClusters(clusterFile):
        if len(cluster) <= 4:
            continue

        scores = _clusterEntityScores(cluster, queries)
        if not scores:
            continue

        # First maximal entry in iteration order, matching a stable
        # descending sort.
        bestEnt = max(scores.items(), key=lambda kv: kv[1])[0]
        accepted = globalEnt.setdefault(bestEnt, [])
        seen = entTerms.setdefault(bestEnt, set())

        fset = _selectFrequentTerms(
            _countContextTerms(cluster, queries, bestEnt))

        # Terms count as seen even when the set is rejected below.
        covered = len(fset & seen)
        seen.update(fset)

        if len(fset) > 3 and float(covered) / len(fset) < .20:
            accepted.append(fset)

    with open('EntityQueries.sample', 'w') as outFile:
        for bestEnt, sList in globalEnt.items():
            if len(sList) > 1:
                for fset in sList:
                    outFile.write(bestEnt.encode('utf-8') + '\t' +
                                  '\t'.join(fset) + '\n')
def main(argv): #Scorer coSessOccur = CoOccurrence() coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ') tScorer = CoOccurSimScore(coSessOcMan) cScorer = ScoreClusterTerms() #vocab = set() i = 0 prec = {} mrr = {} lim = 55 queryList = loadFileInList(argv[5]) termList, termDict = getTermList(queryList) print len(termList) added = 0 oracle_prec = 0.0 oracle_mrr = 0.0 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): query = session[0].strip() aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) if len(aTerms) > 0: prec1, mrr1 = getPrecRecall(termList, aTerms) added += 1.0 oracle_prec += prec1 oracle_mrr += mrr1 print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added porter = stem.porter.PorterStemmer() ttype = argv[6] print ttype for iFile in os.listdir(argv[3]): qclusters = loadClusters(argv[3] + '/' + iFile) clusters, clusIndex = toTerms(qclusters) print iFile, len(clusters) prec[iFile] = {} mrr[iFile] = {} added = 0.0 i = 1 for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML( argv[1]): i += 1 query = session[0].strip() qSet = getQueryTermsStemmed(query, porter) print 'Query ', query, qSet if ttype == 'query': aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict) elif ttype == 'title': aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, termDict, porter, range( 1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, termDict, porter, range( 1, len(session) - 1)) aTerms = aTerms | bTerms #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None ) if len(aTerms) > 0: terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim) #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim) print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len( terms), terms #for topk in range(1,lim,5): prec1, mrr1 = getClustPrecMrr(terms, aTerms) # returns a list print 'METRIC', iFile, i, 
prec1, mrr1 #print topk , prec1, mrr1 for topk in prec1.keys(): if topk not in prec[iFile]: prec[iFile][topk] = [] mrr[iFile][topk] = [] prec[iFile][topk].append(prec1[topk]) mrr[iFile][topk].append(mrr1[topk]) #prec[iFile][topk] += prec1 #mrr[iFile][topk] += mrr1 added += 1.0 #if i == 3: # break for fName, scoreDict in prec.items(): for pos in scoreDict.keys(): print 'Prec all', fName, pos, len(scoreDict[pos]) total = sum(scoreDict[pos]) prec[fName][pos] = total / added #len(scoreDict[pos]) print 'Prec', fName, pos, prec[fName][pos], total for fName, scoreDict in mrr.items(): for pos in scoreDict.keys(): print 'Mrr all', fName, pos, len(scoreDict[pos]) total = sum(mrr[fName][pos]) mrr[fName][pos] = total / added #len(scoreDict[pos]) print 'MRR', fName, pos, mrr[fName][pos], total #for entry in prec.keys(): #for t in prec[entry].keys(): #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added #prec[entry][t]/=added #for entry in mrr.keys(): #for t in mrr[entry].keys(): #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added #mrr[entry][t]/=added print 'Plotting Precision and MRR' plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png', 'Term Prediction Prec Plot') plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png', 'Term Prediction MRR Plot')