def getEntCatFeatures(fileName, queryFreq):
    """Build per-term entity and category statistics from a spot file.

    Each line of ``fileName`` is split on tabs: column 0 is the query text
    and column 1 is a serialized spot dictionary handed to ``SpotDict``.
    For every non-entity term of the query (as reported by
    ``SpotDict.getNonEntityTerms``) the stemmed term is mapped to a ``Word``
    object, which is updated with every entity of the query and with every
    category of those entities whose name does not contain 'redirect'.

    Args:
        fileName: path of the tab-separated input file.
        queryFreq: mapping from query string to its frequency; queries that
            are missing from it are weighted 1.0.

    Returns:
        dict mapping stemmed term -> ``Word`` holding the accumulated stats.

    Raises:
        IndexError: if a line has no tab-separated second column.
    """
    wordFeat = {}
    # 'with' closes the file even when a line fails to parse; the original
    # left the handle open until garbage collection.
    with open(fileName, 'r') as inFile:
        for line in inFile:
            split = line.split('\t')
            # Normalise the query to plain ASCII, dropping other characters.
            query = (split[0].strip().decode('utf-8')).encode('ascii', 'ignore')
            freq = queryFreq[query] if query in queryFreq else 1.0
            spotDict = split[1].strip()
            sDict = SpotDict(spotDict, query)

            for oentry in sDict.getNonEntityTerms():
                entry = porter.stem(oentry)
                # Stems of one or two characters are ignored.
                if len(entry) <= 2:
                    continue
                if entry not in wordFeat:
                    wordFeat[entry] = Word(entry, '')
                feat = wordFeat[entry]

                for entity in sDict.getEntities():
                    feat.updateEntStats(entity, freq)
                    for cat in sDict.getEntCategories(entity):
                        # Categories mentioning 'redirect' are skipped.
                        if 'redirect' not in cat:
                            feat.updateCatStats(cat, freq)
    return wordFeat
def loadNonEntityTerms(fileName):
    """Load the spot dictionary of every query that has non-entity terms.

    Each line of ``fileName`` is split on tabs: column 0 is the query text
    and column 1 is a serialized spot dictionary handed to ``SpotDict``.
    Only the first occurrence of a query is kept, and only when
    ``getNonEntityTerms()`` is non-empty for it.

    A summary line "<number of queries> <total term count>" is printed to
    standard output before returning; the term count is the sum of
    ``getNonEntTermsLen()`` over the stored queries.

    Args:
        fileName: path of the tab-separated input file.

    Returns:
        dict mapping query string -> ``SpotDict`` for that query.

    Raises:
        IndexError: if a line has no tab-separated second column.
    """
    queryTermDict = {}
    terms = 0.0
    # 'with' closes the file even when a line fails to parse; the original
    # left the handle open until garbage collection.
    with open(fileName, 'r') as inFile:
        # for each query store the terms
        for line in inFile:
            split = line.split('\t')
            # Normalise the query to plain ASCII, dropping other characters.
            query = (split[0].strip().decode('utf-8')).encode('ascii', 'ignore')
            spotDict1 = split[1].strip()
            # create the dict object
            sDict = SpotDict(spotDict1, query)
            if len(sDict.getNonEntityTerms()) > 0 and query not in queryTermDict:
                queryTermDict[query] = sDict
                terms += sDict.getNonEntTermsLen()
    # A single pre-formatted string prints the same text as
    # `print len(queryTermDict), terms` and also parses under Python 3.
    print("%s %s" % (len(queryTermDict), terms))
    return queryTermDict