def generatePhraseFeatures(featureFile, spotFile, outFile):
    # load features for queries
    qfeatMan = FeatureManager()
    qfeatMan.readFeatures(featureFile)
    pid = 0
    pfeatMan = FeatureManager()
    # generate features for phrases
    for query, pList in generatePhrases(spotFile):
        qkey, qfeat = qfeatMan.returnFeature(query)
        # print query, qkey
        if qkey:
            # print query, pList
            for phrase in pList:
                qVect = getDictFromSet(phrase.split())
                ngrams = getNGramsAsList(phrase, 2)
                url = qfeat.returnUrl()
                user = qfeat.returnUsers()
                ent = qfeat.returnEntities()
                cat = qfeat.returnCategories()
                typ = qfeat.returnType()
                sess = qfeat.returnSessions()
                if 'tournament' in phrase:
                    print query, phrase
                    print sess
                    print typ
                    print ent
                nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess, ent, cat, typ)
                pfeatMan.addFeature(phrase, pid, nFeature)
                pid += 1
    pfeatMan.writeFeatures(outFile)
def scoreWithCosine(self, qSet, clustList, cIndex, limit):
    toEvaluate = []
    done = set()
    for entry in qSet:
        try:
            clusters = cIndex[entry]
            # print 'CLUSTERS', entry, clusters
            for cind in clusters:
                if cind not in done:
                    toEvaluate.append(clustList[cind])
                    done.add(cind)
        except KeyError:
            pass
    # for each cluster find cosine similarity
    clustScore = {}
    i = 0
    qDict = getDictFromSet(qSet)
    for clust in toEvaluate:
        cos = get_cosine(qDict, clust)
        if cos > 0:
            clustScore[i] = cos
        i += 1
    toReturn = []
    for entry in sorted(clustScore.items(), reverse=True, key=lambda x: x[1]):
        toReturn.append(toEvaluate[entry[0]].keys())
    return toReturn
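# For reference, a minimal sketch of the get_cosine helper used above, assuming it
# computes cosine similarity between two term-frequency dicts; the actual helper is
# defined elsewhere in this codebase, so this is an illustrative reimplementation,
# not necessarily the original.
import math

def get_cosine_sketch(vec1, vec2):
    # dot product over shared terms, normalized by both vector lengths
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in common)
    denom1 = math.sqrt(sum(v * v for v in vec1.values()))
    denom2 = math.sqrt(sum(v * v for v in vec2.values()))
    if denom1 == 0 or denom2 == 0:
        return 0.0
    return numerator / (denom1 * denom2)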
def expandTextWithStepAndSubcluster(self, qText, clickText, topC, limit1, limit2, step):
    spotDict = self.dexter.tagText(qText)
    entStatus = False
    scoredTerms = {}
    if len(spotDict) == 0:
        print 'No Entity Found\t', qText
    else:
        print 'Tagged ', qText, '\t', spotDict
        entStatus = True
        qSplit = qText.split()
        qSet = set(qSplit)
        qDict = getDictFromSet(qSplit)
        # Rank categories
        print 'Click Text ', clickText
        catList = self.scoreCategories(qSet, qDict, clickText, spotDict, topC)
        print qText, 'CatList ', catList
        # Rank subclusters
        terms = self.aggregateTermsFromSubclusters(qSet, catList, limit2 + 400)
        print terms
        # print 'total term set', len(termSet)
        for i in xrange(limit1, limit2, step):
            if i == 0:
                scoredTerms[i] = self.ranker.getTopK(terms, i + 1)  # getTopKWithFilter(terms, i+1, i+50)
            else:
                scoredTerms[i] = self.ranker.getTopK(terms, i)  # getTopKWithFilter(terms, i, i+50)
    return entStatus, scoredTerms
def convertListToDict(iList):
    # merge the term counts of every string in the list into one dict
    tSet = {}
    for entry in iList:
        for word, count in getDictFromSet(entry.split()).items():
            try:
                tSet[word] += count
            except KeyError:
                tSet[word] = count
    return tSet
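# A minimal sketch of the getDictFromSet helper that convertListToDict (and most
# functions here) rely on, assuming it simply counts term occurrences in a token
# sequence; the real helper lives elsewhere in the codebase.
def getDictFromSet_sketch(tokens):
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return counts

# Hypothetical usage: convertListToDict(['cheap flights', 'cheap hotels']) would
# then yield {'cheap': 2, 'flights': 1, 'hotels': 1}.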
def getSessionTerms(session, porter):
    # terms of the last query in the session, stemmed
    lSet = getDictFromSet(session[-1].split())
    nlSet = normalizeDict(lSet, porter)
    # terms across the whole session, stemmed
    tSet = convertListToDict(session)
    ntSet = normalizeDict(tSet, porter)
    # keep only session terms not already present in the last query
    tSet = removeSameKeys(nlSet, ntSet)
    return tSet
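# Sketches of the two helpers getSessionTerms depends on, under the assumption that
# normalizeDict stems every key with the supplied Porter stemmer (merging counts of
# keys that stem to the same form) and removeSameKeys drops from the second dict any
# key already present in the first; the real implementations are defined elsewhere.
def normalizeDict_sketch(termDict, porter):
    normalized = {}
    for word, count in termDict.items():
        stem = porter.stem(word)
        normalized[stem] = normalized.get(stem, 0) + count
    return normalized

def removeSameKeys_sketch(refDict, candDict):
    return {k: v for k, v in candDict.items() if k not in refDict}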
def expandText(self, text, topC, limit):
    spotDict = self.dexter.tagText(text)
    if len(spotDict) == 0:
        print 'No Entity found\t', text, spotDict
    else:
        print 'Tagged\t', text, '\t', spotDict
    qsplit = text.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    terms = self.aggregateTerms(text, catList)
    scoredTerms = self.ranker.getTopKWithFilter(terms, limit, limit + 50)
    return scoredTerms
def getTopEntityCategoryTerms(self, query, topC, limit):
    entCatTerms = {}
    spotDict = self.dexter.tagText(query)
    qsplit = query.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    for entity, cats in catList.iteritems():
        entCatTerms[entity] = {}
        for cat, score in cats:
            terms = self.aggregateTermsForCategory(query, termSet, cat)
            entCatTerms[entity][cat] = self.ranker.getTopKWithFilter(terms, limit, limit + 50)
    return entCatTerms
def getNString(string, glen):
    string = string.strip()
    gString = ''
    ngrams = getNGramsAsList(string, glen)
    # print glen, string, bi, ind
    gString = ' '.join('{0}:{1}'.format(x.replace(' ', '_'), y) for x, y in ngrams.items())
    queryVect = getDictFromSet(string.split())
    qVectString = ' '.join('{0}:{1}'.format(x, y) for x, y in queryVect.items())
    return gString + '\t' + qVectString
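# Example of the output format getNString produces, assuming getNGramsAsList returns
# a dict mapping n-grams (as space-separated strings) to counts: for
# getNString('new york hotels', 2) the result would look roughly like
# 'new_york:1 york_hotels:1\tnew:1 york:1 hotels:1' (term order may vary because
# dicts are unordered).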
def getTopSubclusters(self, qText, clickText, topC, limit):
    spotDict = self.dexter.tagText(qText)
    entStatus = False
    if len(spotDict) == 0:
        print 'No Entity Found\t', qText
    else:
        print 'Tagged ', qText, '\t', spotDict
        entStatus = True
        qSplit = qText.split()
        qSet = set(qSplit)
        qDict = getDictFromSet(qSplit)
        # Rank categories
        # catList = self.scoreCategories(qSet, qDict, clickText, spotDict, topC)
        # print qText, 'CatList ', catList
        # Rank subclusters
    topClusters = None  # self.rankClusters(qSet, catList, limit)
    return entStatus, topClusters
def getClickedSummaryTerms(session, cSummary, cTitle, porter):
    tSet = {}
    sSet = {}
    qSet = getDictFromSet(session[-1].split())
    nqSet = normalizeDict(qSet, porter)
    # print cTitle, cSummary
    tSet = convertListToDict(cTitle)
    sSet = convertListToDict(cSummary)
    ntSet = normalizeDict(tSet, porter)
    nsSet = normalizeDict(sSet, porter)
    tSet = removeSameKeys(nqSet, ntSet)
    sSet = removeSameKeys(nqSet, nsSet)
    return tSet, sSet
def expandTextWithStep(self, text, topC, limit1, limit2, step, spotDict=None):
    if not spotDict:
        spotDict = self.dexter.tagText(text)
    if len(spotDict) == 0:
        print 'No Entity found\t', text, spotDict
    else:
        print 'Tagged\t', text, '\t', spotDict
    qsplit = text.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    terms = self.aggregateTerms(text, catList)
    scoredTerms = {}
    for i in xrange(limit1, limit2, step):
        if i == 0:
            scoredTerms[i] = self.ranker.getTopKWithFilter(terms, i + 1, i + 50)
        else:
            scoredTerms[i] = self.ranker.getTopKWithFilter(terms, i, i + 50)
    return scoredTerms
def expandTextWithSubClusters(self, qText, clickText, topC, limit):
    spotDict = self.dexter.tagText(qText)
    entStatus = False
    scoredTerms = {}  # default when no entity is found
    if (not spotDict) or len(spotDict) == 0:
        print 'No Entity Found\t', qText
    else:
        print 'Tagged ', qText, '\t', spotDict
        entStatus = True
        qSplit = qText.split()
        qSet = set(qSplit)
        qDict = getDictFromSet(qSplit)
        # Rank categories
        catList = self.scoreCategories(qSet, qDict, clickText, spotDict, topC)
        print qText, 'CatList ', catList
        # Rank subclusters
        termSet = self.aggregateTermsFromSubclusters(qSet, catList, limit + 100)
        # print len(termSet)
        # Rank terms
        scoredTerms = self.ranker.getTopKWithFilter(termSet, limit, limit + 50)
    return entStatus, scoredTerms
def scoreCategories(self, querySet, queryDict, clickText, spotDict, k):
    entityCatScore = {}
    cDict = getDictFromSet(clickText.split())
    combDict = combineDict(cDict, queryDict)
    cSet = set(cDict.keys())
    print 'cDict ', cDict, ' combDict ', combDict
    for entry, eDict in spotDict.iteritems():
        catList = eDict['cat'].lower().split()
        queryTerms = (querySet | cSet)
        queryTerms = queryTerms - set([entry])
        catScore = {}
        for cat in catList:
            pset = self.catManager.getPhraseSet(cat)  # unique phrases in cat
            if len(pset) == 0:
                print 'CAT NO PHRASE ', cat
            qInt = pset & queryTerms  # query terms that appear in the cat
            score = 0.0
            for iphrase in qInt:
                score += self.catManager.getPhraseProb(cat, iphrase)
            if len(queryTerms) > 0:
                score *= (1.0 * len(qInt)) / len(queryTerms)
            # cosine score
            cVector = self.catManager.getVector(cat)
            cscore = get_cosine(combDict, cVector)
            # total score
            catScore[cat] = (cscore + score) / 2.0
        sortedScore = sorted(catScore.items(), reverse=True, key=lambda x: x[1])
        # get terms from all categories
        if k == 1000 or k > len(sortedScore):
            k = len(sortedScore)
        entityCatScore[entry] = sortedScore[0:k]
        print 'Query\t', querySet, ' Entity\t', entry, entityCatScore[entry]
    return entityCatScore
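# Worked example of the scoring in scoreCategories, with made-up numbers: if a
# category's phrase set covers 2 of 4 query terms with phrase probabilities 0.10
# and 0.05, the phrase score is (0.10 + 0.05) * (2 / 4.0) = 0.075; with a cosine
# score of 0.30 against the combined query+click vector, the category score is
# (0.30 + 0.075) / 2.0 = 0.1875.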
def main():
    parser = ap.ArgumentParser(description='Generate features for entity tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file', required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file', required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file', required=True)
    parser.add_argument('-u', '--uid', help='User id present or not', required=True, type=bool)
    parser.add_argument('-w', '--wtype', help='Phrase (phrase) or query (query) features', required=True)
    args = parser.parse_args()
    boolUid = args.uid
    # load the category list
    dbCatList = loadCategories(args.catFile)
    # print 'Categories', len(dbCatList)
    # load the type list
    dbTypeList = loadInstancesInList(args.typeFile)
    # print 'Types', len(dbTypeList)
    # query list
    queryList = {}
    # user list
    userList = {}
    # url list
    urlList = {}
    # session list
    sessionList = {}
    # entity list
    entityList = {}
    # category list
    categoryList = {}
    # type list
    typeList = {}
    ipaddress = 'localhost'
    tagURL = 'http://' + ipaddress + ':8080/dexter-webapp/api/rest/annotate'
    cqid = 1
    sid = 1
    qid = None
    for session in getSessionTuples(args.iFile, '\t', 1560):
        print 'Session id and length', sid, len(session)
        for entry in session:
            query = entry[QUERY]
            # tag it with dexter and get all 3 parameters
            spotDict = tagQueryWithDexter(query, tagURL)
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict, dbCatList, dbTypeList)
                if args.wtype == 'query':
                    # given wtype find the following
                    if query not in queryList:
                        # print 'Mapping ', query, 'to ', cqid
                        queryList[query] = cqid
                        qid = cqid
                        cqid += 1
                    else:
                        qid = queryList[query]
                    updateDict(sessionList, sid, qid)
                    if boolUid:
                        updateDict(userList, entry[USER], qid)
                    if CLICKU in entry:
                        updateDict(urlList, entry[CLICKU], qid)
                    if updatedSpotDict:
                        for spot in updatedSpotDict['spots']:
                            updateDict(categoryList, spot['cat'], qid)
                            updateDict(typeList, spot['type'], qid)
                            updateDict(entityList, encodeUTF(spot['wikiname'].lower()), qid)
                if args.wtype == 'phrase':
                    for spot in updatedSpotDict['spots']:
                        splits = query.split(spot['mention'])
                        for split in splits:
                            split = split.strip()
                            # remove stop words
                            split = filterStopWordsFromQuery(split)
                            if len(split) > 1:
                                if split not in queryList:
                                    queryList[split] = cqid
                                    qid = cqid
                                    cqid += 1
                                else:
                                    qid = queryList[split]
                                updateDict(sessionList, sid, qid)
                                if boolUid:
                                    updateDict(userList, entry[USER], qid)
                                if CLICKU in entry:
                                    updateDict(urlList, entry[CLICKU], qid)
                                if updatedSpotDict:
                                    updateDict(categoryList, spot['cat'], qid)
                                    updateDict(typeList, spot['type'], qid)
                                    updateDict(entityList, encodeUTF(spot['wikiname'].lower()), qid)
        sid += 1
    # write the features to the outfile
    outF = open(args.oFile, 'w')
    for query, qid in queryList.items():
        outF.write(query)
        # generate ngrams
        queryVect = getDictFromSet(query.split())
        ngramString = getNGramsAsList(query, 3)
        # ngrams = 1
        outF.write('\t' + str(ngramString))
        # query vect = 2
        outF.write('\t' + str(queryVect))
        if qid in urlList:
            outF.write('\t' + str(urlList[qid]))
        else:
            outF.write('\t{}')
        if qid in userList:
            outF.write('\t' + str(userList[qid]))
        else:
            outF.write('\t{}')
        if qid in entityList:
            outF.write('\t' + str(entityList[qid]))
        else:
            outF.write('\t{}')
        if qid in categoryList:
            outF.write('\t' + str(categoryList[qid]))
        else:
            outF.write('\t{}')
        if qid in typeList:
            outF.write('\t' + str(typeList[qid]))
        else:
            outF.write('\t{}')
        if qid in sessionList:
            outF.write('\t' + str(sessionList[qid]))
        else:
            outF.write('\t{}')
        outF.write('\n')
    outF.close()
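# Example invocation of main(), assuming the module is run as a script (the file
# name below is illustrative):
#   python generate_features.py -i aol_query_log.txt -o query_features.txt \
#       -t dbpedia_types.txt -c dbpedia_categories.txt -u True -w query
# Note that because --uid is declared with type=bool, argparse treats any non-empty
# string (including 'False') as True; pass an empty string to get False.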
def getStatsPerQuery(argv):
    tagURL = 'http://localhost:8080/rest/annotate'
    catURL = 'http://localhost:8080/rest/graph/get-entity-categories'
    catVector = loadCategoryVector(argv[3])
    f1Dict = getCats(argv[2])
    sFound = 0.0
    sTotal = 0.0
    eTotal = set()
    eRemov = set()
    catFoundNoTerm = set()
    catNotFound = set()
    catTermFound = set()
    catEntity = set()
    outfile = open('match_session_dom.txt', 'w')
    # categoryVectors = {}
    for session in getSessionWithNL(argv[1]):
        catCount = {}
        entCount = {}
        querySpotList = {}
        for query in session:
            # find the entities in query
            try:
                # tagging call disabled; with spotDict left as None the .keys()
                # access below raises and is swallowed by the except
                spotDict = None  # tagQueryWithDexter(query, tagURL, catURL)
                querySpotList[query] = spotDict
                for text in spotDict.keys():
                    for entry in spotDict[text]['cat'].split():
                        catCount[entry] = catCount.setdefault(entry, 1) + 1
                    entCount[text] = entCount.setdefault(text, 1) + 1
            except Exception as err:
                print err
        # print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT', entCount
        found = False
        if len(catCount) > 0:
            # find the dominant entity
            maxEnt = max(entCount.values())
            # sessionQueryMapping = {}
            for query, spotList in querySpotList.iteritems():
                matchl = spotList.keys()
                for entry in matchl:
                    eTotal.add(entry)
                    if entCount[entry] < maxEnt:
                        spotList.pop(entry, None)
                        print 'Removing spot', query, entry
                        eRemov.add(entry)
                    else:
                        # get the categories
                        # catTermMatch = {}
                        rquery = query.replace(entry, '')
                        queryTerms = set(rquery.split())
                        for cat in spotList[entry]['cat'].lower().split():
                            catEntity.add(entry + '_' + cat)
                            if cat in f1Dict:
                                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                                pVector = catVector[cat]
                                queryDict = getDictFromSet(queryTerms)
                                pTotal = sum(phrase1.values())
                                pset = set(phrase1.keys())
                                sint = pset & queryTerms
                                score = 0.0
                                cscore = get_cosine(queryDict, pVector)
                                for iphrase in sint:
                                    score += phrase1[iphrase] / pTotal
                                if len(queryTerms) > 0:
                                    score *= (1.0 * len(sint)) / len(queryTerms)
                                if sint:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t' + str(cscore) + '\t' + ', '.join(sint) + '\n')
                                    found = True
                                    catTermFound.add(entry + '_' + cat)
                                else:
                                    outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                                    catFoundNoTerm.add(cat + '_' + entry)
                            else:
                                outfile.write(query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                                catNotFound.add(cat + '_' + entry)
                            # load the terms for category
                            # check if these terms match
        if found:
            sFound += 1
        sTotal += 1
        outfile.write('\n')
    print 'Total Sessions ', sTotal
    print 'Sessions with dominant entity in AOL', sFound
    print '# Unique Entities', len(eTotal)
    print '# Removed Entities (non dominant)', len(eRemov)
    print '# no of entity types', len(catEntity)
    print '# no of entity types with terms match ', len(catTermFound)
    print '# no of entity types with no term match', len(catFoundNoTerm)
    print '# no of entity types with no match in AOL', len(catNotFound)
def combineQueryFeatures(queryFile, spotFile, featFile, newFile):
    # load features
    featDict = {}
    i = 1
    urlDict = {}
    for line in open(featFile, 'r'):
        split = line.strip().split('\t')
        featDict[split[0].strip()] = split[1:]
    querySpots = {}
    for line in open(spotFile, 'r'):
        spotDict = ast.literal_eval(line)
        querySpots[spotDict['text']] = spotDict
    outF = open(newFile, 'w')
    # all queries
    for line in open(queryFile, 'r'):
        query = line.strip()
        queryFeat = []
        # getNString(query, 3).decode('utf-8')
        # triString = str(triString.encode('ascii', 'ignore')).strip()
        triString = getNGramsAsList(query, 3)
        if len(triString) > 0:
            queryFeat.append(triString)
        else:
            queryFeat.append({})
        queryVect = getDictFromSet(query.split())
        if len(queryVect) > 0:
            queryFeat.append(queryVect)
        else:
            queryFeat.append({})
        if query in featDict:
            # normalize the users
            userString = getUserString(featDict[query][0])
            if len(userString) > 0:
                queryFeat.append(userString)
            else:
                queryFeat.append({})
            # normalize the urls
            i, urlDict, linkString = getUrlString(featDict[query][1], urlDict, i)
            if len(linkString) > 0:
                queryFeat.append(linkString)
            else:
                queryFeat.append({})
        else:
            print 'Query not found ', query
        if query in querySpots:
            spotDict = querySpots[query]  # ast.literal_eval(line)
            # cat, ent and type info
            result = getCatEntString(spotDict)
            for entry in result:
                if len(entry) > 0:
                    queryFeat.append(entry)
                else:
                    queryFeat.append({})
        else:
            queryFeat += [{}, {}, {}]
        # print queryFeat
        try:
            outF.write(query)
            for entry in queryFeat:
                outF.write('\t' + str(entry))
            outF.write('\n')
        except Exception:
            print 'ERROR ', queryFeat
    outF.close()
def getPrecRecall(opt, catList, f1Dict, catVector, queryTerms, aTerms, index):
    catScore = {}
    maxQs = -1000
    maxCat = ''
    notFound = set()
    for cat in catList:
        if cat in f1Dict:
            catScore[cat] = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
            # phrase cat score
            phrase1 = loadPhrasesWithScore(f1Dict[cat])
            pTotal = sum(phrase1.values())  # total no of terms in cat
            pset = set(phrase1.keys())  # unique phrases in cat
            qInt = pset & queryTerms  # query terms that the cat contains
            score = 0.0
            for iphrase in qInt:
                score += phrase1[iphrase] / pTotal
            if len(queryTerms) > 0:
                score *= (1.0 * len(qInt)) / len(queryTerms)
            # cosine score
            queryDict = getDictFromSet(queryTerms)
            cVector = catVector[cat]
            cscore = get_cosine(queryDict, cVector)
            # total score
            catScore[cat]['qS'] = cscore + score
            if maxQs < catScore[cat]['qS']:
                maxQs = catScore[cat]['qS']
                maxCat = cat
            sortP = sorted(phrase1.items(), reverse=True, key=lambda x: x[1])
            # print 'sorted', sortP[0], sortP[1]
            apset = set(x[0] for x in sortP[0:index])
            # print 'pSet ', apset
            aInt = aTerms & apset
            catScore[cat]['aP'] = (1.0 * len(aInt)) / len(aTerms)
            catScore[cat]['aR'] = (1.0 * len(aInt)) / len(apset)
            catScore[cat]['aInt'] = aInt
            catScore[cat]['qInt'] = qInt
        else:
            notFound.add(cat)
    if opt == 'max':
        if maxCat in catScore:
            return notFound, maxCat, catScore[maxCat]
        else:
            return notFound, None, {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
    else:
        if len(catScore) == 0:
            return notFound, None, None
        avgScore = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
        for entry, cdict in catScore.iteritems():
            avgScore['aP'] += cdict['aP']
            avgScore['aR'] += cdict['aR']
            avgScore['qS'] += cdict['qS']
            avgScore['qInt'] |= cdict['qInt']
            avgScore['aInt'] |= cdict['aInt']
        avgScore['aP'] /= len(catScore)
        avgScore['aR'] /= len(catScore)
        avgScore['qS'] /= len(catScore)
        return notFound, None, avgScore