def findSessionCountsOfNonEnt(netDict, queryFile, outFile):
    # CoOccurrence, getSessionWithQuery, getNGramsAsList and stopSet are
    # assumed to be in scope from the repo's shared utilities.
    coOccur = CoOccurrence()
    for session in getSessionWithQuery(queryFile):
        # For each query in the session, collect its non-entity terms.
        qTermList = []
        for query in session:
            query = (query.decode('utf-8')).encode('ascii', 'ignore')
            if query in netDict:
                for entry in netDict[query].getNonEntityTerms():
                    # List membership, not substring containment: the original
                    # string check wrongly dropped terms that were substrings
                    # of already-collected terms.
                    if entry not in qTermList:
                        qTermList.append(entry)
        qTerms = ' '.join(qTermList).strip()
        if len(qTerms) > 2:
            ngrams = getNGramsAsList(qTerms, 1)
            lngrams = len(ngrams)
            if lngrams > 1:
                # Update co-occurrence counts for every unordered pair of
                # non-stopword terms longer than two characters.
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                                coOccur.updateStats(ngrams[i], ngrams[j], 1.0)
    coOccur.setTermTotal()
    coOccur.writeTermCo(outFile)
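# getSessionWithQuery is defined elsewhere in the repo. A minimal sketch of
# what it appears to do, assuming a tab-separated "sessionId<TAB>query" log
# (the real file format, and the optional second argument used later in this
# file, are repo-specific; this version is illustrative only):
def getSessionWithQuerySketch(queryFile):
    """Yield each session as a list of query strings."""
    lastId = None
    session = []
    with open(queryFile) as inFile:
        for line in inFile:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            sessId, query = parts[0], parts[1]
            # A new session id closes the previous session.
            if sessId != lastId and session:
                yield session
                session = []
            session.append(query)
            lastId = sessId
    if session:
        yield session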
def findBigramCoOccurence(sessionFile, outFile):
    # Generate all bigrams from each session and, for every bigram in a
    # query, count its co-occurrence with the bigrams of the next query.
    bigramCo = {}
    for session in getSessionWithQuery(sessionFile):
        for i in range(len(session) - 1):
            b1 = getNGramsAsList(session[i], 2)
            b2 = getNGramsAsList(session[i + 1], 2)
            # The original body is truncated here; the counting below is an
            # assumed completion that matches the comment above.
            for x in b1:
                for y in b2:
                    key = x + '\t' + y
                    if key not in bigramCo:
                        bigramCo[key] = 0.0
                    bigramCo[key] += 1.0
    # Assumed: flush the counts with the same helper findSessionCounts uses.
    writeDictToFile(outFile, bigramCo, 0)
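# getNGramsAsList is also repo-internal. A minimal sketch, assuming it
# returns the space-joined word n-grams of a string (the repo's tokenizer
# may differ):
def getNGramsAsListSketch(text, n):
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]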
def loadSessions(sessFile):
    sessions = []
    count = 0.0
    clen = 0
    for session in getSessionWithQuery(sessFile):
        if len(session) > 0:
            sessions.append(session)
            count += 1
            clen += len(session)
    # Report the session count and mean session length (guarded so an empty
    # file no longer raises a division-by-zero error).
    if count > 0:
        print count, clen / count
    return sessions
def findSessionCounts(queryFile, outFile, wordSet):
    coOccur = {}  # term-pair -> count; a plain dict replaces CoOccurrence
    sess = 0
    qid = 0.0
    qSet = set()
    for session in getSessionWithQuery(queryFile):
        qSet.clear()
        for query in session:
            qid += 1
            terms = getQueryTerms(query)
            if len(terms) > 0:
                qSet |= terms
            if qid % 1000000 == 0:
                print qid
                print len(coOccur)
        # For each session, update co-occurrence stats over its unique terms.
        qTerms = ' '.join(qSet)
        if len(qTerms) > 3 and len(qSet) > 1:
            ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
            lngrams = len(ngrams)
            if lngrams > 1:
                for i in range(lngrams - 1):
                    if ngrams[i] not in stopSet and len(ngrams[i]) > 2 \
                            and ngrams[i] in wordSet:
                        for j in range(i + 1, lngrams):
                            if ngrams[j] not in stopSet and len(ngrams[j]) > 2 \
                                    and ngrams[j] in wordSet:
                                key = ngrams[i] + ' ' + ngrams[j]
                                if key not in coOccur:
                                    coOccur[key] = 0.0
                                coOccur[key] += 1.0
        # Flush to disk periodically to bound memory use.
        if len(coOccur) >= 9000000:
            writeDictToFile(outFile, coOccur, sess)
            coOccur = {}
            sess += 1
    # Write whatever remains after the last session; the original version
    # silently dropped any final partial batch.
    if len(coOccur) > 0:
        writeDictToFile(outFile, coOccur, sess)
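# writeDictToFile is repo-internal as well. A plausible sketch, assuming it
# dumps "key<TAB>count" lines into a numbered part file so the periodic
# flushes above can later be merged (the part-file naming is a guess):
def writeDictToFileSketch(outFile, counts, part):
    with open(outFile + '_' + str(part), 'w') as out:
        for key, count in counts.iteritems():
            out.write(key + '\t' + str(count) + '\n')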
from nltk import stem


def main(argv):
    simpleWalk = SimpleWalk()
    top50 = loadFileInList(argv[2])
    porter = stem.porter.PorterStemmer()
    for rsession in getSessionWithQuery(argv[1]):
        session = removeWrongEntries(rsession, top50)
        sesLen = len(session)
        # Add an edge between every pair of adjacent stemmed queries.
        i = 0
        j = 1
        while i < sesLen and j < sesLen:
            stemI = stemQuery(session[i], porter)
            stemJ = stemQuery(session[j], porter)
            simpleWalk.addEdge(stemI, stemJ, 1.0)
            i = j
            j += 1
    # Prune low-weight edges, then run the walk.
    simpleWalk.filter(2)
    simpleWalk.walk()
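# stemQuery is repo-internal. A minimal sketch, assuming it Porter-stems each
# whitespace-delimited term (the repo's version may also lowercase or strip
# punctuation):
def stemQuerySketch(query, porter):
    return ' '.join(porter.stem(term) for term in query.split())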
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
    sid = 0
    queryList = buildBigramSet(queryFile)
    stemmer = stem.porter.PorterStemmer()
    for session in getSessionWithQuery(logFile):
        sessionStr = ' '.join(session)
        sessionSet = set(getNGramsAsList(sessionStr, 2))
        inter = sessionSet & bigramSet
        # Only emit sessions that share at least one bigram with bigramSet.
        if len(inter) > 0:
            # Print each distinct normalized query with its session id,
            # skipping immediate repeats.
            lastq = None
            for q in session:
                if q in queryList:
                    q = normalize(q, stemmer)
                    if lastq != q and len(q) > 1:
                        print sid, '\t', q
                        lastq = q
        sid += 1
def populateDataset(logFile, queryList):
    sid = 1
    sessionList = {}  # query -> {session id: frequency}
    for session in getSessionWithQuery(logFile):
        for entry in session:
            if entry in queryList:
                if entry not in sessionList:
                    sessionList[entry] = {}
                if sid not in sessionList[entry]:
                    sessionList[entry][sid] = 0.0
                sessionList[entry][sid] += 1.0
        sid += 1
    for entry, sessionCount in sessionList.iteritems():
        print entry, '\t', sessionCount
import os

from whoosh.fields import Schema, TEXT
from whoosh.index import create_in


def main(argv):
    ischema = Schema(session=TEXT(stored=True, phrase=False))
    if not os.path.exists(argv[2]):
        os.mkdir(argv[2])
    qindex = create_in(argv[2], schema=ischema, indexname=argv[2])
    writer = qindex.writer()
    i = 0
    for sess in getSessionWithQuery(argv[1], 1500):
        string = ' '.join(sess)
        try:
            writer.add_document(session=unicode(
                string.decode('unicode_escape').encode('ascii', 'ignore')))
        except Exception as err:
            print sess, 'problem in indexing'
            print err, err.args
        i += 1
        if i % 100000 == 0:
            print i
    writer.commit()
    qindex.close()
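# Once built, the index can be searched with whoosh's standard query API.
# A usage sketch (the index directory and search term are examples):
from whoosh.index import open_dir
from whoosh.qparser import QueryParser


def searchSessions(indexDir, termString):
    ix = open_dir(indexDir, indexname=indexDir)
    with ix.searcher() as searcher:
        query = QueryParser('session', ix.schema).parse(unicode(termString))
        for hit in searcher.search(query, limit=10):
            print hit['session']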
featMan.readFeatures(args.featFile)
# Load the distances between query pairs (i.e. 1 - similarity).
weightMatrix = readWeightMatrix(args.distFile)
print len(weightMatrix)

samePairsSet = differentPairsSet = None
if args.pairLabelFile:
    samePairsSet, differentPairsSet = loadPairsFromFile(args.pairLabelFile)

total_metrics_dict = {}
# Sweep the clustering threshold in steps of 0.02.
for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02):
    sessCount = 0
    lastSes = None
    session = []
    metrics = {}
    qcc = QCCTasks()
    for session in getSessionWithQuery(args.sessionFile):
        # Score every query pair in the session and add an edge whenever
        # the similarity exceeds the current threshold.
        for i in range(len(session) - 1):
            qid1, qf1 = featMan.returnFeature(session[i])
            if qf1:
                for j in range(i + 1, len(session)):
                    qid2, qf2 = featMan.returnFeature(session[j])
                    if qf2:
                        try:
                            if qid1 < qid2:
                                edgeScore = 1.0 - weightMatrix[qid1][qid2]
                            else:
                                edgeScore = 1.0 - weightMatrix[qid2][qid1]
                            if edgeScore > threshold:
                                qcc.addEdge(qid1, qid2, edgeScore)
                        except:
                            # Assumed completion: the original handler body is
                            # truncated; skip pairs missing from the matrix.
                            pass
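# The qid1 < qid2 branching above reflects that weightMatrix stores each pair
# once, keyed by the smaller id first. The same convention as a small
# hypothetical helper (the script inlines the check instead):
def getEdgeScore(weightMatrix, qid1, qid2):
    """Return similarity (1 - distance) for a query pair."""
    if qid1 < qid2:
        return 1.0 - weightMatrix[qid1][qid2]
    return 1.0 - weightMatrix[qid2][qid1]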