def loadTasks(fileName, ttype):
    """Load tasks of the given type from a tab-separated file.

    Each input line's last tab-separated field is a literal-eval'able dict;
    every key under taskDict[ttype]['tasks'] is joined with SPACE into a
    task text and vectorized with text_to_vector.

    Returns (corpus, tokenDict, taskVectorDict):
      corpus[0] -- list of task text strings
      corpus[1] -- matching term-frequency dicts
      tokenDict / taskVectorDict -- currently always empty; the
      indexTaskVectors call that filled them is disabled.
    """
    corpus = [[], []]
    tokenDict = {}
    taskVectorDict = {}
    tid = 0
    # 'with' guarantees the handle is closed even if parsing raises;
    # the original closed it manually with no try/finally.
    with open(fileName, 'r') as iFile:
        for line in iFile:
            split = line.strip().split('\t')
            taskDict = literal_eval(split[-1])
            # Only the task keys are used; the per-entry dicts were dead code.
            for entry in taskDict[ttype]['tasks']:
                ttext = SPACE.join(entry)
                tid += 1
                corpus[0].append(ttext)
                corpus[1].append(text_to_vector(ttext))
    return corpus, tokenDict, taskVectorDict
def getUserVector(fileName, uIndex, qIndex):
    """Yield (userId, termVector) pairs aggregated per consecutive user.

    Reads a tab-separated log file; column uIndex is the user id and column
    qIndex the raw query.  Queries are lower-cased, stripped of SYMB matches,
    stop-word filtered, spam-filtered (length/word-count/inappropriate-word
    heuristics), then stemmed and accumulated into a per-user term vector.

    Fixes vs. the original:
      - compares lastUser against None instead of truthiness, so a falsy
        user id string (e.g. '0') no longer restarts grouping on every line;
      - an empty input file no longer yields a spurious (None, {}) pair;
      - the file handle is closed deterministically.
    """
    userVector = {}
    lastUser = None
    porter1 = porter.PorterStemmer()
    with open(fileName, 'r') as iFile:
        for line in iFile:
            split = line.strip().split('\t')
            uId = split[uIndex]
            query = split[qIndex]
            if lastUser is None:
                lastUser = uId
            raw_split = re.sub(SYMB, ' ', query.lower()).split(' ')
            query = filterStopWordsFromList(raw_split)
            if lastUser != uId:
                # User boundary: emit the finished vector and start a new one.
                yield lastUser, userVector
                userVector = {}
            # Keep only queries passing the spam heuristics and containing
            # at least one alphabetic character.
            if (not (hasManyChars(query, raw_split, 1, 4, 70)
                     or hasInapWords(raw_split)
                     or hasManyWords(raw_split, 15, 40))) and hasAlpha(query):
                qDict = text_to_vector(query)
                for entry, val in qDict.items():
                    entry1 = porter1.stem(entry)
                    userVector[entry1] = userVector.setdefault(entry1, 0.0) + val
            lastUser = uId
    # Flush the last user's vector -- but only if the file had any lines.
    if lastUser is not None:
        yield lastUser, userVector
def getPairFeatures(session):
    """Yield similarity features for every ordered pair of session entries.

    For each pair of indices (a, b) with a < b, emits
    (a, b, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch)
    where edgeScore is a fixed weighted blend of the individual scores and
    urlMatch is -1 when either entry lacks a clicked URL.
    """
    # Total elapsed seconds across the session; +1 guards division by zero.
    span = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
    size = len(session)
    for a in range(size - 1):
        first = session[a]
        firstTokens = first[QUERY].split()
        firstVec = text_to_vector(first[QUERY])
        for b in range(a + 1, size):
            second = session[b]
            secondTokens = second[QUERY].split()
            jaccard = 1.0 - distance.jaccard(firstTokens, secondTokens)
            edit = 1.0 - distance.nlevenshtein(firstTokens, secondTokens)
            # Normalized temporal and positional gaps.
            timeDiff = ((second[QTIME] - first[QTIME]).total_seconds()) / span * 1.0
            dist = (b - a) * 1.0 / size
            urlMatch = -1
            if CLICKU in first and CLICKU in second:
                urlMatch = 1.0 - distance.nlevenshtein(first[CLICKU], second[CLICKU])
            cosine = get_cosine(firstVec, text_to_vector(second[QUERY]))
            edgeScore = (.20 * cosine + .20 * jaccard + .20 * edit
                         + .15 * dist + .15 * timeDiff + .10 * urlMatch)
            yield a, b, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
def getTermList(queryList):
    """Aggregate term counts over every query in queryList.

    Returns a pair: the (term, count) items of the aggregate vector and the
    set of distinct terms seen.
    """
    counts = {}
    for query in queryList:
        for word, freq in text_to_vector(query).items():
            counts[word] = counts.get(word, 0.0) + freq
    return counts.items(), set(counts.keys())
def getTaskTermSet(rSort, rank):
    """Collect term weights from the leading `rank` entries of rSort.

    rSort: sequence of tuples whose first element is a text string,
           assumed sorted best-first.
    rank:  number of leading entries to use, or the string 'all' for all.

    Returns the accumulated (term, weight) pairs sorted by weight, descending.
    """
    termSet = {}
    index = rank
    if rank == 'all':
        index = len(rSort)
    for entry in rSort[:index]:
        tDict = text_to_vector(entry[0])
        for tentry, value in tDict.items():
            # BUG FIX: the original read tDict.setdefault(tentry, 0.0) here,
            # which returned the term's own value (doubling every weight) and
            # never accumulated across entries.  Accumulate into termSet, the
            # same form the sibling method uses.
            termSet[tentry] = termSet.setdefault(tentry, 0.0) + value
    return sorted(termSet.items(), reverse=True, key=lambda x: x[1])
def loadTasksFromTxt(fileName):
    """Load one task per line from a plain-text file.

    Returns (corpus, tokenDict, taskVectorDict):
      corpus[0] -- stripped task strings, one per input line
      corpus[1] -- matching term-frequency dicts
      tokenDict / taskVectorDict -- currently always empty; the
      indexTaskVectors call that filled them is disabled.
    """
    corpus = [[], []]
    tokenDict = {}
    taskVectorDict = {}
    tid = 0
    # 'with' closes the file deterministically; the original iterated an
    # inline open() and relied on garbage collection to release the handle.
    with open(fileName, 'r') as iFile:
        for line in iFile:
            line = line.strip()
            tid += 1
            corpus[0].append(line)
            corpus[1].append(text_to_vector(line))
    return corpus, tokenDict, taskVectorDict
def getTaskTermSet(self, rSort, text):
    """Build a stemmed term -> weight map from the texts in rSort.

    A term contributes only if it is not a stop word, is longer than two
    characters, contains an alphabetic character, is absent from `text` in
    both raw and stemmed form, and its raw form appears in self.vocab.
    Weights for terms sharing a stem are summed across all entries.
    """
    termSet = {}
    for item in rSort:
        for term, weight in text_to_vector(item[0]).items():
            stem = self.porter.stem(term)
            keep = (term not in stopSet
                    and len(term) > 2
                    and hasAlpha(term)
                    and term not in text
                    and stem not in text
                    and term in self.vocab)
            if keep:
                termSet[stem] = termSet.setdefault(stem, 0.0) + weight
    return termSet