def getPrecRecall(opt, catList, f1Dict, catVector, queryTerms, aTerms, index):
    """Score candidate categories against query terms and answer terms.

    Parameters:
      opt        -- 'max' returns the single best-scoring category; any other
                    value returns scores averaged over all found categories.
      catList    -- iterable of candidate category names.
      f1Dict     -- maps category name -> argument consumed by
                    loadPhrasesWithScore (phrase file handle/path).
      catVector  -- maps category name -> term vector for cosine scoring.
      queryTerms -- set of query terms.
      aTerms     -- set of answer terms scored against the category's
                    top-`index` phrases.
      index      -- how many of the category's highest-scoring phrases to
                    keep when intersecting with aTerms.

    Returns a 3-tuple (notFound, bestCat, scores):
      notFound -- set of categories absent from f1Dict.
      bestCat  -- best category name in 'max' mode (None if nothing found),
                  always None in averaging mode.
      scores   -- dict with keys 'aP', 'aR', 'qS', 'qInt', 'aInt'
                  (zeroed/empty when no category was found).
    """
    catScore = {}
    maxQs = -1000
    maxCat = ''
    notFound = set()
    for cat in catList:
        if cat not in f1Dict:
            notFound.add(cat)
            continue
        catScore[cat] = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0,
                         'qInt': set(), 'aInt': set()}
        # Phrase-overlap score: share of the category's phrase mass covered
        # by query terms, weighted by query coverage.
        # NOTE(review): unlike getStatsPerQuery, f1Dict[cat] is passed
        # without a directory prefix -- confirm f1Dict holds full paths here.
        phrase1 = loadPhrasesWithScore(f1Dict[cat])
        pTotal = sum(phrase1.values())   # total phrase mass in the category
        pset = set(phrase1.keys())       # unique phrases in the category
        qInt = pset & queryTerms         # query terms the category contains
        score = 0.0
        for iphrase in qInt:
            score += phrase1[iphrase] / pTotal
        if len(queryTerms) > 0:
            score *= (1.0 * len(qInt)) / len(queryTerms)
        # Cosine similarity between the query and the category vector.
        queryDict = getDictFromSet(queryTerms)
        cVector = catVector[cat]
        cscore = get_cosine(queryDict, cVector)
        # BUG FIX: was stored under 'qs' while the initializer, the averaging
        # branch and callers all read 'qS', so the reported query score was
        # always 0.0.
        catScore[cat]['qS'] = cscore + score
        if maxQs < catScore[cat]['qS']:
            maxQs = catScore[cat]['qS']
            maxCat = cat
        # Precision/recall of answer terms vs. the top-`index` phrases.
        sortP = sorted(phrase1.items(), reverse=True, key=lambda x: x[1])
        apset = set(x[0] for x in sortP[0:index])
        aInt = aTerms & apset
        # Guard empty aTerms/apset (previously a ZeroDivisionError).
        catScore[cat]['aP'] = (1.0 * len(aInt)) / len(aTerms) if aTerms else 0.0
        catScore[cat]['aR'] = (1.0 * len(aInt)) / len(apset) if apset else 0.0
        catScore[cat]['aInt'] = aInt
        catScore[cat]['qInt'] = qInt
    if opt == 'max':
        if maxCat in catScore:
            return notFound, maxCat, catScore[maxCat]
        return notFound, None, {'aP': 0.0, 'aR': 0.0, 'qS': 0.0,
                                'qInt': set(), 'aInt': set()}
    avgScore = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
    # Guard the averaging branch: with no found categories the original
    # divided by zero. Return the zeroed score dict instead.
    if not catScore:
        return notFound, None, avgScore
    for entry, cdict in catScore.items():
        avgScore['aP'] += cdict['aP']
        avgScore['aR'] += cdict['aR']
        avgScore['qS'] += cdict['qS']
        avgScore['qInt'] |= cdict['qInt']
        avgScore['aInt'] |= cdict['aInt']
    avgScore['aP'] /= len(catScore)
    avgScore['aR'] /= len(catScore)
    avgScore['qS'] /= len(catScore)
    return notFound, None, avgScore
def getStatsPerQuery(argv):
    """Scan query sessions, keep spots of the dominant entity per session,
    and report how often that entity's category phrases match query terms.

    argv layout (from the visible indexing):
      argv[1] -- session source consumed by getSessionWithNL
      argv[2] -- directory of category phrase files / input to getCats
      argv[3] -- category-vector file consumed by loadCategoryVector

    Writes per-query match lines to 'match_session_dom.txt' and prints
    summary counts. Python 2 code (print statements, iteritems).
    """
    # Dexter service endpoints (only used by the commented-out tagger call).
    tagURL = 'http://localhost:8080/rest/annotate'
    catURL = 'http://localhost:8080/rest/graph/get-entity-categories'
    catVector = loadCategoryVector(argv[3])   # cat -> term vector
    f1Dict = getCats(argv[2])                 # cat -> phrase file name
    sFound = 0.0          # sessions with at least one term match
    sTotal = 0.0          # total sessions processed
    eTotal = set()        # all entities seen
    eRemov = set()        # entities removed as non-dominant
    catFoundNoTerm = set()  # entity/category pairs with no term overlap
    catNotFound = set()     # entity/category pairs missing from f1Dict
    catTermFound = set()    # entity/category pairs with a term overlap
    catEntity = set()       # all entity/category pairs seen
    # NOTE(review): outfile is never closed; consider a with-block.
    outfile = open('match_session_dom.txt', 'w')
    #categoryVectors = {}
    for session in getSessionWithNL(argv[1]):
        catCount = {}        # category -> occurrence count in session
        entCount = {}        # entity -> occurrence count in session
        querySpotList = {}   # query -> spot dict for that query
        for query in session:
            #find the entities in query
            try:
                # NOTE(review): the tagger call is disabled; spotDict is
                # always None, so spotDict.keys() below raises
                # AttributeError every iteration and is swallowed by the
                # except -- no counts are ever accumulated as written.
                spotDict = None  #tagQueryWithDexter(query, tagURL,catURL)
                querySpotList[query] = spotDict
                for text in spotDict.keys():
                    for entry in spotDict[text]['cat'].split():
                        # NOTE(review): setdefault(entry, 1) + 1 makes the
                        # first occurrence count as 2; likely meant
                        # setdefault(entry, 0) + 1. Relative ordering (and
                        # hence the max below) is unaffected.
                        catCount[entry] = catCount.setdefault(entry, 1) + 1
                    entCount[text] = entCount.setdefault(text, 1) + 1
            except Exception as err:
                print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount
        found = False
        if len(catCount) > 0:
            #find the dominant entity
            maxEnt = max(entCount.values())
            #sessionQueryMapping = {}
            for query, spotList in querySpotList.iteritems():
                # keys() copies the key list (Py2), so popping from
                # spotList while looping matchl is safe here.
                matchl = spotList.keys()
                for entry in matchl:
                    eTotal.add(entry)
                    if entCount[entry] < maxEnt:
                        # Drop spots for non-dominant entities.
                        spotList.pop(entry, None)
                        print 'Removing spot', query, entry
                        eRemov.add(entry)
                    else:
                        #get the categories
                        #catTermMatch = {}
                        # Remove the entity surface form, keep the rest of
                        # the query as candidate matching terms.
                        rquery = query.replace(entry, '')
                        queryTerms = set(rquery.split())
                        for cat in spotList[entry]['cat'].lower().split():
                            catEntity.add(entry + '_' + cat)
                            if cat in f1Dict:
                                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                                pVector = catVector[cat]
                                queryDict = getDictFromSet(queryTerms)
                                pTotal = sum(phrase1.values())  # total phrase mass
                                pset = set(phrase1.keys())      # unique phrases
                                sint = pset & queryTerms        # term overlap
                                score = 0.0
                                cscore = get_cosine(queryDict, pVector)
                                # Phrase-overlap score weighted by query
                                # coverage (computed but only cscore is
                                # written to the output file).
                                for iphrase in sint:
                                    score += phrase1[iphrase] / pTotal
                                if len(queryTerms) > 0:
                                    score *= (1.0 * len(sint)) / len(queryTerms)
                                if sint:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t' + str(cscore) + '\t' + ', '.join(sint) + '\n')
                                    found = True
                                    catTermFound.add(entry + '_' + cat)
                                else:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                                    catFoundNoTerm.add(cat + '_' + entry)
                            else:
                                outfile.write(
                                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                                catNotFound.add(cat + '_' + entry)
        #load the terms for category
        #check if these terms match
        if found:
            sFound += 1
        sTotal += 1
        outfile.write('\n')
    # Summary statistics over all sessions.
    print 'Total Sessions ', sTotal
    print 'Sessions with dominant entity in AOL', sFound
    print '# Unique Entities', len(eTotal)
    print '# Removed Entities (non dominant)', len(eRemov)
    print '# no of entity types', len(catEntity)
    print '# no of entity types with terms match ', len(catTermFound)
    print '# no of entity types with no term match', len(catFoundNoTerm)
    print '# no of entity types with no match in AOL', len(catNotFound)