Пример #1
0
def getEntCatFeatures(fileName, queryFreq):
  """Collect entity and category statistics for stemmed non-entity terms.

  Every line of fileName is split on tabs: the first field is the query
  text and the second field is a string handed to SpotDict together with
  the query.

  Args:
    fileName: path of the tab-separated input file.
    queryFreq: container mapping a query string to its frequency; a query
      that is not present in it is weighted 1.0.

  Returns:
    A dict mapping each stemmed non-entity term (stems of two characters
    or fewer are skipped) to a Word object. For every query the term
    occurs in, the Word is updated with each entity of that query and with
    each of the entity's categories that does not contain 'redirect'.

  Raises:
    IndexError: if a line has no tab-separated second field.
  """
  wordFeat = {}

  # The with-block guarantees the file is closed, even when a line fails
  # to parse part-way through the file.
  with open(fileName, 'r') as inFile:
    for line in inFile:
      split = line.split('\t')
      # Decode as UTF-8, then drop every character that is not ASCII.
      query = (split[0].strip().decode('utf-8')).encode('ascii', 'ignore')
      freq = queryFreq[query] if query in queryFreq else 1.0

      spotDict = split[1].strip()
      sDict = SpotDict(spotDict, query)
      for oentry in sDict.getNonEntityTerms():
        entry = porter.stem(oentry)
        # Stems of one or two characters are ignored.
        if len(entry) <= 2:
          continue
        if entry not in wordFeat:
          wordFeat[entry] = Word(entry, '')
        word = wordFeat[entry]
        for entity in sDict.getEntities():
          word.updateEntStats(entity, freq)

          for cat in sDict.getEntCategories(entity):
            # Categories containing 'redirect' are not counted.
            if 'redirect' not in cat:
              word.updateCatStats(cat, freq)
  return wordFeat
def loadNonEntityTerms(fileName):
  queryTermDict = {}
  terms = 0.0
  sDict = None
  #for each query store the terms
  for line in open(fileName, 'r'):
    split = line.split('\t')
    query = (split[0].strip().decode('utf-8')).encode('ascii', 'ignore')
    spotDict1 = split[1].strip()
    #create the dict object
    sDict = SpotDict(spotDict1, query)
    if len(sDict.getNonEntityTerms()) > 0 and query not in queryTermDict:
      queryTermDict[query] = sDict
      terms += sDict.getNonEntTermsLen()

  print len(queryTermDict), terms
  return queryTermDict