Example #1
# Assumed imports; in the original project these live at module level.
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from whoosh.collectors import TimeLimit

biMeas = BigramAssocMeasures()
# hasAlpha, hasWebsite, and stopSet are project-level helpers.


def updateNetwork(query, network, qp, searcher, tlc, field, ntype):
  # find the top 50 documents for the query (tlc is a time-limited collector)
  q = qp.parse(str(query))
  totalText = ''
  total = 0.0
  tmin = float('inf')   # running minimum PMI score
  tmax = float('-inf')  # running maximum PMI score
  terms = set()
  try:
    searcher.search_with_collector(q, tlc)
  except TimeLimit:
    print('--LONG-- ', query)

  results = tlc.results()
  for entry in results:
    totalText += entry[field] + ' '

  # score every bigram in the collected text by PMI, then update the network
  finder = BigramCollocationFinder.from_words(word_tokenize(totalText))
  rList = finder.score_ngrams(biMeas.pmi)

  # track the score range for min-max normalization
  for rTuple in rList:
    total += rTuple[1]
    if tmin > rTuple[1]:
      tmin = rTuple[1]
    if tmax < rTuple[1]:
      tmax = rTuple[1]
  if tmax == tmin:
    tmax = tmin + 1.0  # avoid division by zero when all scores are equal

  for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
    # keep frequent bigrams up to 3000 terms, or query-related bigrams up to 4000
    if ((len(terms) < 3000 and finder.ngram_fd[rTuple[0]] > 2) or
        (finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or
        (rTuple[0][1] in query and len(terms) < 4000)):
      a = rTuple[0][0]
      if len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a):
        if a not in network:
          network[a] = {}
          terms.add(a)
        b = rTuple[0][1]
        if len(b) > 2 and hasAlpha(b) and b not in stopSet and not hasWebsite(b):
          if b not in network[a]:
            network[a][b] = {}
            terms.add(b)
          # accumulate the min-max normalized PMI score per edge type
          network[a][b][ntype] = network[a][b].setdefault(ntype, 0.0) + (
              (rTuple[1] - tmin) / (tmax - tmin))

  print(query, ntype, len(terms))

  return terms
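For context, updateNetwork expects a Whoosh searcher plus a time-limited collector. A minimal sketch of the wiring, assuming an index directory 'indexdir' with a stored 'content' field and an edge type 'doc' (all placeholders):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.collectors import TimeLimitCollector

ix = open_dir('indexdir')
network = {}
with ix.searcher() as searcher:
  qp = QueryParser('content', schema=ix.schema)
  # wrap the default collector so long searches raise TimeLimit after 10s
  tlc = TimeLimitCollector(searcher.collector(limit=50), timelimit=10.0)
  terms = updateNetwork('jaguar', network, qp, searcher, tlc, 'content', 'doc')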
Example #2
# Assumed imports; SYMB, filterStopWordsFromList, hasManyChars, hasInapWords,
# hasManyWords, hasAlpha, and text_to_vector are project-level helpers.
import re
from nltk.stem import porter


def getUserVector(fileName, uIndex, qIndex):
  """Yield (userId, termVector) pairs from a tab-separated query log."""
  userVector = {}
  lastUser = None
  porter1 = porter.PorterStemmer()

  for line in open(fileName, 'r'):
    split = line.strip().split('\t')
    uId = split[uIndex]
    query = split[qIndex]

    if not lastUser:
      lastUser = uId
    raw_split = re.sub(SYMB, ' ', query.lower()).split(' ')
    query = filterStopWordsFromList(raw_split)
    if lastUser != uId:
      # new user: emit the accumulated vector and start a fresh one
      yield lastUser, userVector
      userVector = {}

    # skip noisy queries: too long/short, inappropriate, or too many words
    if (not (hasManyChars(query, raw_split, 1, 4, 70)
             or hasInapWords(raw_split)
             or hasManyWords(raw_split, 15, 40)) and hasAlpha(query)):
      qDict = text_to_vector(query)
      for entry, val in qDict.items():
        entry1 = porter1.stem(entry)
        userVector[entry1] = userVector.setdefault(entry1, 0.0) + val

    lastUser = uId
  yield lastUser, userVector
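Because the generator groups consecutive rows by user id, it assumes the log is sorted by user. A hypothetical consumption loop, with the user id in column 0 and the query in column 1:

# Print each user's five heaviest stemmed terms.
for userId, vector in getUserVector('query_log.tsv', 0, 1):
  top = sorted(vector.items(), key=lambda kv: kv[1], reverse=True)[:5]
  print(userId, top)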
Example #3
  # Assumes module-level "import ast" and "from nltk import stem";
  # hasAlpha and stopSet are project helpers as in Example #1.
  def createVector(self, fileName):
    """Count word-to-category and word-to-entity co-occurrences per query file."""
    porter = stem.porter.PorterStemmer()
    word_catVector = {}
    word_entVector = {}
    for line in open(fileName, 'r'):
      split = line.strip().split('\t')
      query = split[0]
      qsplit = query.split()
      spotDict = ast.literal_eval(split[1])
      for entity, elist in spotDict.items():
        for oword in qsplit:
          oword = oword.replace('\'', '')
          word = porter.stem(oword)
          if len(word) > 2 and hasAlpha(word) and word not in stopSet:
            if word not in word_entVector:
              word_catVector[word] = {}
              word_entVector[word] = {}
            for cat in elist['cat'].split():
              word_catVector[word][cat] = word_catVector[word].setdefault(
                  cat, 0.0) + 1.0
            word_entVector[word][entity] = word_entVector[word].setdefault(
                entity, 0.0) + 1.0

    self.writeVector('ont/Word_catCount.txt', word_catVector)
    self.writeVector('ont/Word_entCount.txt', word_entVector)
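The parsing in createVector implies each input line holds a query and a Python-literal spot dictionary, tab-separated. A hypothetical line and how the method reads it (entity and category tags are illustrative):

import ast

line = "barack obama wife\t{'barack obama': {'cat': 'Politician Person'}}"
query, spots = line.strip().split('\t')
spotDict = ast.literal_eval(spots)
# spotDict == {'barack obama': {'cat': 'Politician Person'}}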
Example #4
  def updateNetworkFromText(self, query, text, ntype):
    """Add PMI-weighted bigram edges from free text to the term network."""

    total = 0.0
    tmin = float('inf')   # running minimum PMI score
    tmax = float('-inf')  # running maximum PMI score

    qsplit = query.split()
    for entry in qsplit:
      term = self.porter.stem(entry)
      self.network[term] = {}
      self.terms.add(term)

    # score every bigram in the text by PMI, then update the network
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    rList = finder.score_ngrams(self.biMeas.pmi)
    # track the score range for min-max normalization
    for rTuple in rList:
      total += rTuple[1]
      if tmin > rTuple[1]:
        tmin = rTuple[1]
      if tmax < rTuple[1]:
        tmax = rTuple[1]
    if tmax == tmin:
      tmax = tmin + 1.0  # avoid division by zero when all scores are equal

    for rTuple in sorted(rList, reverse=True, key=lambda x: x[1]):
      # keep frequent bigrams up to 1000 terms, or query-related bigrams up to 1500
      if ((len(self.terms) < 1000 and finder.ngram_fd[rTuple[0]] > 2) or
          (finder.ngram_fd[rTuple[0]] > 1.0 and rTuple[0][0] in query) or
          (rTuple[0][1] in query and len(self.terms) < 1500)):
        noSymbA = SYMBreg.sub('', rTuple[0][0])
        noSymbB = SYMBreg.sub('', rTuple[0][1])

        if noSymbA not in stopSet and noSymbB not in stopSet:
          a = self.porter.stem(noSymbA)
          b = self.porter.stem(noSymbB)
          if (len(a) > 2 and hasAlpha(a) and a not in stopSet and not hasWebsite(a)
              and len(b) > 2 and hasAlpha(b) and b not in stopSet
              and not hasWebsite(b)):
            if a not in self.network:
              self.network[a] = {}
              self.terms.add(a)
            if b not in self.network[a]:
              self.network[a][b] = {}
              self.terms.add(b)
            # accumulate the min-max normalized PMI score per edge type
            self.network[a][b][ntype] = self.network[a][b].setdefault(
                ntype, 0.0) + ((rTuple[1] - tmin) / (tmax - tmin))

    print(query, ntype, len(self.terms))
Example #5
  def getVectorFromTuple(self, tTuple):
    tDict = {}

    for entry in tTuple:
      split = entry.split()
      for unstem in split:
        word = self.porter.stem(unstem)
        if (word not in stopSet and hasAlpha(word) and len(word) > 2
            and word not in ashleelString):
          tDict[word] = tDict.setdefault(word, 0.0) + 1.0
    return tDict
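A quick illustration of getVectorFromTuple's contract, assuming the helpers above behave as their names suggest; keys come out Porter-stemmed:

# Hypothetical call on an instance; each tuple element is a phrase.
vec = obj.getVectorFromTuple(('running shoes', 'running gear'))
# vec == {'run': 2.0, 'shoe': 1.0, 'gear': 1.0}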
Example #6
  def expandText(self, query, limit):
    #get the entities
    spotDict = self.dexter.tagText(query)
    # P(c): uniform prior over categories
    pC = 1.0 / self.entityCatManager.getUniqueTermCount()

    termList = self.termTermManager.getUniqueTerms()

    pEC = {}
    for entity, edict in spotDict.items():
      catList = edict['cat'].split()
      # P(e) = SUM over c of P(e|c) * P(c)
      pEC[entity] = 0.0
      for cat in catList:
        pEC[entity] += pC * self.entityCatManager.getProb(cat, entity)

    termScore = {}

    for term in termList:
      pTE = 0.0
      termScore[term] = 0.0
      for entity, score in pEC.items():
        # P(t|t'): sum term-term probabilities over the query minus the entity
        repQuery = query.replace(entity, '')
        pTT = 0.0000001  # small floor so the product never collapses to zero
        qsplit = repQuery.split()
        for entry in qsplit:
          if len(entry) > 2 and entry not in stopSet and hasAlpha(entry):
            pTT += self.termTermManager.getProb(term, entry)
        # P(t|e), floored for the same reason
        pTE = self.termEntityManager.getProb(entity, term)
        if pTE == 0.0:
          pTE = 0.0000001
        termScore[term] += pTE * score * pTT
      if termScore[term] != 0.0:
        termScore[term] = math.log(termScore[term])

    resultSet = {}
    for ttuple in sorted(termScore.items(), reverse=True, key=lambda x: x[1]):
      resultSet[ttuple[0]] = ttuple[1]
      if len(resultSet) == limit + 10:
        break
    return resultSet
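Reading the scoring loop off the code above (the *Manager classes are project-specific probability stores, so this is a reconstruction, not the author's stated formula), the expansion score appears to be, with \varepsilon = 10^{-7} as the floor used for both factors:

\mathrm{score}(t) = \log \sum_{e} P(t \mid e)\,\Big(\sum_{c} P(e \mid c)\,P(c)\Big)\Big(\varepsilon + \sum_{t' \in q \setminus e} P(t \mid t')\Big), \qquad P(c) = \frac{1}{|C|}

where q \setminus e is the query with the entity mention removed and |C| is the number of unique categories.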
Example #7
  def getTaskTermSet(self, rSort, text):

    termSet = {}
    for entry in rSort:
      tDict = text_to_vector(entry[0])
      for tentry, value in tDict.items():
        stem = self.porter.stem(tentry)
        # keep in-vocabulary terms that do not already appear in the text
        if (tentry not in stopSet and len(tentry) > 2 and hasAlpha(tentry)
            and tentry not in text and stem not in text
            and tentry in self.vocab):
          termSet[stem] = termSet.setdefault(stem, 0.0) + value
    return termSet
Example #8
  def getCoOcScore(self, stemSet, phrase):
    """Average the raw co-occurrence count of phrase with each query stem."""
    total = 0.0
    tCount = 0.0
    for qRep in stemSet:
      if len(qRep) > 2 and qRep not in stopSet and hasAlpha(qRep):
        c1, c2 = self.coMan.getCoOcCount(phrase, qRep)
        if c1 != c2:
          print(':O CoOcc count diff ', phrase, qRep, c1, c2)
        total += c1
        tCount += 1.0
    if tCount > 0:
      return total / tCount
    return 0
Example #9
  def getCoOcScore(self, phrase, stemSet):
    """Average the co-occurrence probability of phrase with each query stem."""
    total = 0.0
    tCount = 0.0
    for qRep in stemSet:
      if len(qRep) > 2 and qRep not in stopSet and hasAlpha(qRep):
        c1 = self.catCoMan.getProb(phrase, qRep, 50)
        if c1 > 0:
          total += c1
        tCount += 1.0
    if tCount > 0:
      return total / tCount
    return 0
Example #10
# parseLine, hasAlpha, ashleelString, stopSet, and the QUERY/CLICKU column
# constants are defined at module level in the original project.
def findClickQuery(fileName):
  """Load clicked queries into a {stemmed term: {clicked URL: count}} map."""
  porter1 = porter.PorterStemmer()
  clickQuery = {}
  for line in open(fileName, 'r'):
    entry = parseLine(line.strip())
    if len(entry) > 3:
      terms = entry[QUERY].split()
      for term in terms:
        nterm = porter1.stem(term)
        if (len(term) > 2 and hasAlpha(term) and term not in ashleelString
            and nterm not in stopSet and nterm not in ashleelString):
          if nterm not in clickQuery:
            clickQuery[nterm] = {}
          clickQuery[nterm][entry[CLICKU]] = clickQuery[nterm].setdefault(
              entry[CLICKU], 0.0) + 1.0

  for entry, cdict in clickQuery.items():
    print(entry, '\t', cdict)

  return clickQuery
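findClickQuery depends on the module-level column constants and parseLine helper; a hypothetical setup, with illustrative column positions:

QUERY = 1   # column holding the query string
CLICKU = 3  # column holding the clicked URL

clickQuery = findClickQuery('clicks.tsv')
# clickQuery maps each stemmed query term to {clicked URL: click count}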