def findPairwiseDistance(featureFile, outFile):
    """Score every unordered pair of features and write the positive ones.

    Features are loaded from ``featureFile`` through a FeatureManager. The
    feature ids are visited in sorted order and each id is paired with every
    id that sorts after it, so each unordered pair is scored exactly once.

    Every pair with a score strictly greater than 0.0 produces one line in
    ``outFile``::

        <returnQuery(id1)> TAB <returnQuery(id2)> TAB <score rounded to 3 places>

    Args:
        featureFile: path handed to ``FeatureManager.readFeatures``.
        outFile: path of the file to (over)write with the scored pairs.
    """
    featMan = FeatureManager()
    featMan.readFeatures(featureFile)
    keys = sorted(featMan.featureDict.keys())

    # Same text as the Python 2 statement ``print len(keys), keys[-5:]``,
    # written so that it parses under Python 3 as well.
    print('%d %s' % (len(keys), keys[-5:]))

    # The context manager closes (and flushes) the file even if one of the
    # distance computations below raises.
    with open(outFile, 'w') as oFile:
        for i in range(len(keys) - 1):
            qid1, qf1 = featMan.returnFeature(keys[i])
            for j in range(i + 1, len(keys)):
                qid2, qf2 = featMan.returnFeature(keys[j])
                (qcos, ucos, userCos, sessionCos, ngramCos, entCos,
                 catCos, typeCos) = qf1.findCosineDistance(qf2)
                qjac = qf1.findJacardDistance(qf2)

                # Weighted combination of the individual similarities; the
                # weights add up to 100, hence the final division.
                edgeScore = (15 * ((qcos + qjac) / 2.0) +
                             12.5 * ngramCos + 12.5 * ucos +
                             20 * sessionCos + 20 * userCos +
                             10 * ((entCos + catCos) / 2.0) +
                             10 * typeCos) / 100.0

                if edgeScore > 0.0:
                    oFile.write(featMan.returnQuery(qid1) + '\t' +
                                featMan.returnQuery(qid2) + '\t' +
                                str(round(edgeScore, 3)) + '\n')
def _categoryWeightMatrix(featMan, categoryDictionary):
    """Return ``{member1: {member2: edgeScore}}`` for members sharing a category.

    Within each category the members are sorted and every member is paired
    with the members that sort after it. A pair is scored only once, even
    when the two members share several categories, and it is stored only
    when its score is greater than 0.05.
    """
    weightMatrix = {}
    processed = 0
    for entry, qlist in categoryDictionary.items():
        members = sorted(qlist)
        for i in range(len(members) - 1):
            first = members[i]
            qid1, qf1 = featMan.returnFeature(first)
            if not qf1:
                continue
            row = weightMatrix.setdefault(first, {})
            for j in range(i + 1, len(members)):
                second = members[j]
                qid2, qf2 = featMan.returnFeature(second)
                if not qf2 or second in row:
                    continue
                # findCosineDistance is unpacked as eight values in
                # findPairwiseDistance; the session and type similarities
                # are not part of this score and are ignored here.
                (qcos, ucos, userCos, _sessionCos, ngramCos, entCos,
                 catCos, _typeCos) = qf1.findCosineDistance(qf2)
                qjac = qf1.findJacardDistance(qf2)

                # Weighted combination; the weights add up to 1.0.
                edgeScore = (.25 * ((qcos + qjac) / 2.0) +
                             .15 * ngramCos + .15 * ucos +
                             .15 * userCos + .15 * entCos + .15 * catCos)
                if edgeScore > 0.05:
                    row[second] = edgeScore

        # Progress indicator: report the running count every 10000 categories.
        if processed % 10000 == 0:
            print(processed)
        processed += 1
    return weightMatrix


def printCategoryQueryDictionary(fileName, clusFile, weightFile):
    """Group queries by category and dump the groups and their pair weights.

    Features are read from ``fileName``. Every entry of a feature's 'cat'
    feature becomes a category whose members are the queries carrying it.

    Output:
        clusFile: one line per category, ``toString(members, featMan)``.
        weightFile: one line per category holding ``str(members)``, then an
            empty line, then one ``member1 member2 score`` line for every
            stored pair weight.

    Args:
        fileName: path handed to ``FeatureManager.readFeatures``.
        clusFile: path of the cluster file to (over)write.
        weightFile: path of the weight file to (over)write.
    """
    featMan = FeatureManager()
    featMan.readFeatures(fileName)

    categoryDictionary = {}
    for query, feat in featMan.iterFeatures():
        for entry in feat.returnFeature('cat'):
            categoryDictionary.setdefault(entry, set()).add(query)

    # Context managers guarantee both files are closed even when the
    # distance computation raises part-way through.
    with open(weightFile, 'w') as outW:
        with open(clusFile, 'w') as outC:
            for entry, qlist in categoryDictionary.items():
                outC.write(toString(qlist, featMan) + '\n')
                outW.write(str(qlist) + '\n')

        weightMatrix = _categoryWeightMatrix(featMan, categoryDictionary)

        # The empty line separates the category listing from the weights.
        outW.write('\n')
        for entry1, scoreList in weightMatrix.items():
            for entry2, score in scoreList.items():
                outW.write(str(entry1) + ' ' + str(entry2) + ' ' +
                           str(score) + '\n')
def generatePhraseFeatures(featureFile, spotFile, outFile):
    """Create a feature for every phrase of every known query and save them.

    Query features are loaded from ``featureFile``. ``generatePhrases``
    yields ``(query, phrases)`` pairs for ``spotFile``; each phrase of a
    query that has a feature gets a new QueryFeature built from the
    phrase's own tokens and bigrams plus the url, user, session, entity,
    category and type information of its parent query. Phrase ids are
    assigned sequentially starting at 0. The phrase features are written
    with ``FeatureManager.writeFeatures``.

    Args:
        featureFile: path of the query feature file to read.
        spotFile: path handed to ``generatePhrases``.
        outFile: path handed to ``writeFeatures`` for the phrase features.
    """
    # Features of the original queries.
    qfeatMan = FeatureManager()
    qfeatMan.readFeatures(featureFile)

    # Features generated for the phrases.
    pfeatMan = FeatureManager()
    pid = 0

    for query, pList in generatePhrases(spotFile):
        _qkey, qfeat = qfeatMan.returnFeature(query)
        # Guard on the feature object that is dereferenced below rather
        # than on the key: a valid but falsy key (for example id 0) must
        # not cause the query to be skipped.
        if not qfeat:
            continue

        for phrase in pList:
            qVect = getDictFromSet(phrase.split())
            ngrams = getNGramsAsList(phrase, 2)
            url = qfeat.returnUrl()
            user = qfeat.returnUsers()
            ent = qfeat.returnEntities()
            cat = qfeat.returnCategories()
            typ = qfeat.returnType()
            sess = qfeat.returnSessions()

            # NOTE(review): looks like leftover debugging output for one
            # hard-coded term -- confirm before removing.
            if 'tournament' in phrase:
                print('%s %s' % (query, phrase))
                print(sess)
                print(typ)
                print(ent)

            nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess,
                                    ent, cat, typ)
            pfeatMan.addFeature(phrase, pid, nFeature)
            pid += 1

    pfeatMan.writeFeatures(outFile)
print len(weightMatrix) samePairsSet = differentPairsSet = None if args.pairLabelFile: samePairsSet , differentPairsSet = loadPairsFromFile(args.pairLabelFile) total_metrics_dict = {} for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02): sessCount = 0 lastSes = None session = [] metrics = {} qcc = QCCTasks() for session in getSessionWithQuery(args.sessionFile): #calculate the score for i in range(len(session) - 1): qid1, qf1 = featMan.returnFeature(session[i]) if qf1: for j in range(i + 1, len(session)): qid2, qf2 = featMan.returnFeature(session[j]) if qf2: try: if qid1 < qid2: edgeScore = 1.0 - weightMatrix[qid1][qid2] else: edgeScore = 1.0 - weightMatrix[qid2][qid1] if edgeScore > threshold: qcc.addEdge(qid1, qid2, edgeScore) except: pass else: print 'Query feature error ', session[i]