def _writeExpansionRuns(searcher, sessionFile, rel, outFolder, meth, expand):
    """Write one ranked-run file per expansion size for a single method.

    Sessions read from ``sessionFile`` are numbered from 1. A session is
    processed when its number is in ``rel`` and its stripped first query has
    not been processed earlier in this pass. For each processed session
    ``expand`` is called, and for every expansion size it returns the top
    1000 documents are retrieved and appended to
    ``<outFolder>/<meth>_<noTerms>.RL1``.

    Args:
        searcher: object providing ``getTopDocumentsWithExpansion``.
        sessionFile: path handed to ``getSessionWithXML``.
        rel: container of judged session numbers (only tested with ``in``).
        outFolder: folder that receives the run files.
        meth: method name, used as the run-file name prefix.
        expand: callable ``(session, query, cTitle)`` returning a pair
            ``(searchText, termsBySize)`` where ``searchText`` is the query
            text sent to the searcher and ``termsBySize`` maps an expansion
            size to the expansion terms.
    """
    oFile = {}
    covered = set()
    i = 0
    try:
        for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
                sessionFile):
            i += 1
            query = session[0].strip()
            if i not in rel or query in covered:
                continue
            covered.add(query)

            searchText, termsBySize = expand(session, query, cTitle)
            for noTerms, terms in termsBySize.items():
                # Run files are created lazily, so a size that never occurs
                # leaves no empty file behind.
                if noTerms not in oFile:
                    oFile[noTerms] = open(
                        outFolder + '/' + meth + '_' + str(noTerms) + '.RL1',
                        'w')
                docList = searcher.getTopDocumentsWithExpansion(
                    searchText, terms, 1000, 'content', 'id')
                out = oFile[noTerms]
                for k, dtuple in enumerate(docList, 1):
                    # NOTE(review): the last column is 'baseline' for every
                    # method; kept unchanged because the output format may be
                    # consumed elsewhere.
                    out.write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) +
                              ' ' + str(round(dtuple[1], 2)) + ' baseline\n')
    finally:
        # Close whatever was opened, even if expansion or retrieval failed.
        for handle in oFile.values():
            handle.close()


def main(argv):
    """Write retrieval run files for five query-expansion methods.

    The methods are 'co', 'ent', 'entSub', 'qccTask' and 'htcTask'. Each one
    gets its own pass over the sessions and its own set of
    ``<method>_<noTerms>.RL1`` files in the output folder.

    Arguments, as consumed by the code below:
        argv[1]: session file read by ``getSessionWithXML``.
        argv[2]: argument of ``SearchIndex``.
        argv[3]: argument of ``loadCategoryVector``.
        argv[4]: second argument of the ``Category`` manager.
        argv[5]: second argument of the ``CategorySubcluster`` manager.
        argv[6]: first argument of ``CoOcManager``.
        argv[7]: third argument of ``Dexter``.
        argv[8]: argument of ``loadRelJudgements``.
        argv[9]: output folder.

    NOTE(review): the two ``TaskExpansion`` objects are never closed here;
    confirm whether they hold resources that need releasing.
    """
    # open the index
    searcher = SearchIndex(argv[2])
    try:
        searcher.initializeAnalyzer()

        ipaddress = 'localhost'
        # dexter object
        tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
        catURL = ('http://' + ipaddress +
                  ':8080/rest/graph/get-entity-categories')
        dexter = Dexter(tagURL, catURL, argv[7])

        # category vector
        catVect = loadCategoryVector(argv[3])
        catManage1 = CategoryManager(catVect, argv[4], Category)
        catManage2 = CategoryManager(catVect, argv[5], CategorySubcluster)

        # load the Category co-occurrence bit
        catCoMan = CoOcManager(argv[6], CoOccurrence(), ' ')

        # ranker
        ranker = Ranker()

        # task extraction
        htcTask = TaskExpansion('Indexes/htcIndex', ranker, 3000)
        qccTask = TaskExpansion('Indexes/qccIndex', ranker, 3000)

        # entity expansion
        entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan)
        entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan)

        # term expansion
        coOccExp = CoOccurExpansion(catCoMan, None, ranker)

        rel, _ = loadRelJudgements(argv[8])
        outFolder = argv[9]

        # The stemmer is needed by normalize() in the entity passes. Its
        # assignment used to be commented out, which left the name undefined
        # inside this function.
        porter = stem.porter.PorterStemmer()

        # Expansion-size range handed to every expander.
        low, high, step = 50, 55, 5

        def expandCo(session, query, cTitle):
            # The co-occurrence pass searches with the stripped query.
            return query, coOccExp.expandTextWithStep(query, low, high, step)

        def expandEnt(session, query, cTitle):
            # Clicked-title text is only normalised for sessions that are
            # actually expanded.
            cText = normalize(' '.join(cTitle[0]), porter)
            _status, terms = entExp1.expandTextWithStep(
                query, cText, 1, low, high, step)
            return session[0], terms

        def expandEntSub(session, query, cTitle):
            cText = normalize(' '.join(cTitle[0]), porter)
            _status, terms = entExp2.expandTextWithStepAndSubcluster(
                query, cText, 1, low, high, step)
            return session[0], terms

        def expandQcc(session, query, cTitle):
            return session[0], qccTask.expandTextWithStep(
                query, low, high, step)

        def expandHtc(session, query, cTitle):
            return session[0], htcTask.expandTextWithStep(
                query, low, high, step)

        # A plain (unexpanded) baseline run and MAP/NDCG reporting existed
        # only as commented-out code and are not produced here.
        passes = [
            ('co', expandCo),
            ('ent', expandEnt),
            ('entSub', expandEntSub),
            ('qccTask', expandQcc),
            ('htcTask', expandHtc),
        ]
        for meth, expand in passes:
            _writeExpansionRuns(searcher, argv[1], rel, outFolder, meth,
                                expand)
    finally:
        searcher.close()
def main(argv): ipaddress = "localhost" # dexter object tagURL = "http://" + ipaddress + ":8080/rest/annotate" catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories" dexter = Dexter(tagURL, catURL, argv[5]) # load the Category co-occurrence bit catCoMan = CoOcManager(argv[4], CoOccurrence(), " ") # category vector catVect = loadCategoryVector(argv[2]) catManage1 = CategoryManager(catVect, argv[3], Category) catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster) # ranker ranker = Ranker() totalVocab = loadFileInList(argv[6]) # task extraction # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000); qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab) # taskK = argv[5][argv[5].rfind('/')+1:] wordFeatMan = None # WordManager(argv[8],False); # expansion # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan); entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan) # term expansion coOccExp = CoOccurExpansion(catCoMan, None, ranker) # randomWalk # randWalk = RandomWalk(argv[2],argv[3],ranker) prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}} """ sess_prec = {}; sess_mrr = {}; """ covered = {} i = 0 porter = stem.porter.PorterStemmer() ttype = argv[10] for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]): query = session[0] qSet = getQueryTerms(query) # print 'Title, Summary clicked ',cTitle[0], cSummary[0]; aTerms = None # cText = normalize(' '.join(cTitle[0]),porter); if ttype == "query": aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab) elif ttype == "title": aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1)) else: aTerms = getTerms(cTitle, qSet, totalVocab, porter, 
range(1, len(session) - 1)) bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1)) aTerms = aTerms | bTerms print i, "Query", query, aTerms, len(aTerms) if len(aTerms) > 0: # and query not in covered: covered[query] = 1 coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5) # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5); entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5) qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5) # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5) # randExpTerms = randWalk.expandTextWithStep(query,55,105,5) if not entStatus1: print i, "Ent False", query # addLen = getBand(len(aTerms)); # if addLen not in sess_prec: # sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} }; # for noTerms in entExpTerms1.keys(): # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms; # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms); # prec = updateStats(noTerms, 'ent',prec1, prec); # mrr = updateStats(noTerms, 'ent',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1; # for noTerms in entExpTerms2.keys(): print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms) prec = updateStats(noTerms, "entSub", prec1, prec) mrr = updateStats(noTerms, "entSub", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec) ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr) # sess_prec[addLen] = updateStats(noTerms, 
'ent',prec1, sess_prec[addLen]) # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]); print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 for noTerms in qccTaskTerms.keys(): print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms) prec = updateStats(noTerms, "qccTask", prec1, prec) mrr = updateStats(noTerms, "qccTask", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec) ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]); """ print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 # for noTerms in htcTaskTerms.keys(): # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms) # prec = updateStats(noTerms, 'htcTask',prec1, prec) # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr); # if entStatus1: # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec) # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr); ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen]) ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]); # # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1 for noTerms in coExpTerms.keys(): print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms) prec = updateStats(noTerms, "co", prec1, prec) mrr = updateStats(noTerms, "co", mrr1, mrr) if entStatus1: ent_prec = updateStats(noTerms, "co", prec1, ent_prec) ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr) """ sess_prec[addLen] = updateStats(noTerms, 'co',prec1, 
sess_prec[addLen]) sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]); """ print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1 else: pass # print 'NO ADDED TERMS in', i; i += 1 printMetric(prec, "entSub", "Prec") printMetric(mrr, "entSub", "Mrr") printMetric(prec, "ent", "Prec") printMetric(mrr, "ent", "Mrr") printMetric(prec, "htcTask", "Prec") printMetric(mrr, "htcTask", "Mrr") printMetric(prec, "qccTask", "Prec") printMetric(mrr, "qccTask", "Mrr") printMetric(prec, "co", "Prec") printMetric(mrr, "co", "Mrr") printMetric(ent_prec, "entSub", "EntPrec") printMetric(ent_mrr, "entSub", "EntMrr") printMetric(ent_prec, "ent", "EntPrec") printMetric(ent_mrr, "ent", "EntMrr") printMetric(ent_prec, "htcTask", "EntPrec") printMetric(ent_mrr, "htcTask", "EntMrr") printMetric(ent_prec, "qccTask", "EntPrec") printMetric(ent_mrr, "qccTask", "EntMrr") printMetric(ent_prec, "co", "EntPrec") printMetric(ent_mrr, "co", "EntMrr") plotMultipleSys( prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png", "Term Prediction Prec Plot", ) plotMultipleSys( mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png", "Term Prediction MRR Plot", ) plotMultipleSys( ent_prec, "No of Terms", "Prec", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png", "Term Prediction Prec Plot (Ent queries)", ) plotMultipleSys( ent_mrr, "No of Terms", "MRR", argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png", "Term Prediction MRR Plot (Ent queries)", ) # htcTask.closeIndex(); qccTask.closeIndex() """