Example #1
def generatePhraseFeatures(featureFile, spotFile, outFile):
  #load features for queries
  qfeatMan = FeatureManager()
  qfeatMan.readFeatures(featureFile)

  pid = 0
  pfeatMan = FeatureManager()

  #generate features for phrases
  for query, pList in generatePhrases(spotFile):
    qkey, qfeat = qfeatMan.returnFeature(query)
    #print query, qkey
    if qkey:
      #print query, pList
      for phrase in pList:
        qVect = getDictFromSet(phrase.split())
        ngrams = getNGramsAsList(phrase, 2)
        url = qfeat.returnUrl()
        user = qfeat.returnUsers()
        ent = qfeat.returnEntities()
        cat = qfeat.returnCategories()
        typ = qfeat.returnType()
        sess = qfeat.returnSessions()
        if 'tournament' in phrase:
          print query, phrase
          print sess
          print typ
          print ent
        nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess, ent,
                                cat, typ)
        pfeatMan.addFeature(phrase, pid, nFeature)
        pid += 1

  pfeatMan.writeFeatures(outFile)
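
Every example on this page consumes getDictFromSet, whose definition is not shown here. A minimal sketch consistent with how its return value is used (a token list in, a term-to-frequency dict out); the body is inferred, not taken from the original project:

def getDictFromSet(tokens):
  #inferred helper: map each token to its occurrence count
  tDict = {}
  for token in tokens:
    tDict[token] = tDict.get(token, 0) + 1
  return tDict
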
Example #2
  def scoreWithCosine(self, qSet, clustList, cIndex, limit):
    #NOTE: limit is accepted but never used below
    toEvaluate = []
    done = set()
    for entry in qSet:
      try:
        clusters = cIndex[entry]
        #print 'CLUSTERS',entry, clusters
        for cind in clusters:
          if cind not in done:
            toEvaluate.append(clustList[cind])
            done.add(cind)
      except KeyError:
        #query term absent from the cluster index
        pass

    #for each cluster find cosine similarity
    clustScore = {}
    qDict = getDictFromSet(qSet)
    for i, clust in enumerate(toEvaluate):
      cos = get_cosine(qDict, clust)
      if cos > 0:
        clustScore[i] = cos

    toReturn = []
    for entry in sorted(clustScore.items(), reverse=True, key=lambda x: x[1]):
      toReturn.append(toEvaluate[entry[0]].keys())

    return toReturn
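
scoreWithCosine relies on get_cosine, also not shown. A plausible sketch, assuming both arguments are sparse term-frequency dicts like those produced by getDictFromSet:

import math

def get_cosine(vec1, vec2):
  #inferred helper: cosine similarity of two sparse count vectors
  common = set(vec1) & set(vec2)
  numerator = sum(vec1[t] * vec2[t] for t in common)
  norm1 = math.sqrt(sum(v * v for v in vec1.values()))
  norm2 = math.sqrt(sum(v * v for v in vec2.values()))
  if norm1 == 0 or norm2 == 0:
    return 0.0
  return numerator / (norm1 * norm2)
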
Example #3
  def expandTextWithStepAndSubcluster(self, qText, clickText, topC, limit1,
                                      limit2, step):
    spotDict = self.dexter.tagText(qText)
    entStatus = False
    scoredTerms = {}
    if len(spotDict) == 0:
      print 'No Entity Found\t', qText
    else:
      print 'Tagged ', qText, '\t', spotDict
      entStatus = True
    qSplit = qText.split()
    qSet = set(qSplit)
    qDict = getDictFromSet(qSplit)
    #Rank categories
    print 'Click Text ', clickText
    catList = self.scoreCategories(qSet, qDict, clickText, spotDict, topC)
    print qText, 'CatList ', catList
    #Rank subclusters
    terms = self.aggregateTermsFromSubclusters(qSet, catList, limit2 + 400)
    print terms
    #print 'total term set',len(termSet);

    for i in xrange(limit1, limit2, step):
      if i == 0:
        scoredTerms[i] = self.ranker.getTopK(terms, i + 1)
        #getTopKWithFilter(terms,i+1,i+50)
      else:
        scoredTerms[i] = self.ranker.getTopK(terms, i)
        #getTopKWithFilter(terms,i,i+50)

    return entStatus, scoredTerms
Example #4
def convertListToDict(iList):
  tSet = {}
  for entry in iList:
    for word, count in getDictFromSet(entry.split()).items():
      tSet[word] = tSet.get(word, 0) + count
  return tSet
Example #5
def getSessionTerms(session, porter):
  lSet = getDictFromSet(session[-1].split())
  nlSet = normalizeDict(lSet, porter)

  tSet = convertListToDict(session)
  ntSet = normalizeDict(tSet, porter)

  tSet = removeSameKeys(nlSet, ntSet)

  return tSet
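
getSessionTerms additionally depends on normalizeDict and removeSameKeys. Sketches consistent with their usage here, assuming porter is a stemmer object with a stem() method (e.g. NLTK's PorterStemmer):

def normalizeDict(tDict, porter):
  #inferred helper: stem every key, merging counts that collapse together
  nDict = {}
  for word, count in tDict.items():
    stem = porter.stem(word)
    nDict[stem] = nDict.get(stem, 0) + count
  return nDict

def removeSameKeys(qDict, tDict):
  #inferred helper: keep only terms that do not appear in the query dict
  return dict((w, c) for w, c in tDict.items() if w not in qDict)
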
Example #6
  def expandText(self, text, topC, limit):
    spotDict = self.dexter.tagText(text)
    if len(spotDict) == 0:
      print 'No Entity found\t', text, spotDict
    else:
      print 'Tagged\t', text, '\t', spotDict
    qsplit = text.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    terms = self.aggregateTerms(text, catList)
    scoredTerms = self.ranker.getTopKWithFilter(terms, limit, limit + 50)
    return scoredTerms
Example #7
  def getTopEntityCategoryTerms(self, query, topC, limit):
    entCatTerms = {}
    spotDict = self.dexter.tagText(query)
    qsplit = query.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    for entity, cats in catList.iteritems():
      entCatTerms[entity] = {}
      for cat, score in cats:
        terms = self.aggregateTermsForCategory(query, termSet, cat)
        entCatTerms[entity][cat] = self.ranker.getTopKWithFilter(terms, limit,
                                                                 limit + 50)
    return entCatTerms
Example #8
def getNString(string, glen):
  string = string.strip()

  ngrams = getNGramsAsList(string, glen)
  gString = ' '.join('{0}:{1}'.format(x.replace(' ', '_'), y)
                     for x, y in ngrams.items())

  queryVect = getDictFromSet(string.split())
  qVectString = ' '.join('{0}:{1}'.format(x, y) for x, y in queryVect.items())

  return gString + '\t' + qVectString
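
getNString also calls getNGramsAsList; since the result is iterated with .items() and its keys contain spaces that get replaced with underscores, it plausibly returns word n-grams mapped to counts. A sketch under that assumption:

def getNGramsAsList(string, n):
  #inferred helper: space-joined word n-grams -> occurrence counts
  words = string.split()
  ngrams = {}
  for i in range(len(words) - n + 1):
    gram = ' '.join(words[i:i + n])
    ngrams[gram] = ngrams.get(gram, 0) + 1
  return ngrams

Under this sketch, getNString('new york hotels', 2) would return something like 'new_york:1 york_hotels:1' followed by a tab and 'new:1 york:1 hotels:1'.
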
Example #9
  def getTopSubclusters(self, qText, clickText, topC, limit):
    spotDict = self.dexter.tagText(qText)
    entStatus = False
    if len(spotDict) == 0:
      print 'No Entity Found\t', qText
    else:
      print 'Tagged ', qText, '\t', spotDict
      entStatus = True
    qSplit = qText.split()
    qSet = set(qSplit)
    qDict = getDictFromSet(qSplit)
    #Rank categories
    #catList = self.scoreCategories(qSet,qDict,clickText,spotDict,topC);
    #print qText,'CatList ',catList;
    #Rank subclusters
    topClusters = None
    #self.rankClusters(qSet,catList, limit);
    return entStatus, topClusters
Example #10
def getClickedSummaryTerms(session, cSummary, cTitle, porter):
  qSet = getDictFromSet(session[-1].split())
  nqSet = normalizeDict(qSet, porter)

  #print cTitle, cSummary

  tSet = convertListToDict(cTitle)
  sSet = convertListToDict(cSummary)

  ntSet = normalizeDict(tSet, porter)
  nsSet = normalizeDict(sSet, porter)

  tSet = removeSameKeys(nqSet, ntSet)
  sSet = removeSameKeys(nqSet, nsSet)

  return tSet, sSet
Example #11
  def expandTextWithStep(self, text, topC, limit1, limit2, step, spotDict=None):
    if not spotDict:
      spotDict = self.dexter.tagText(text)
    if len(spotDict) == 0:
      print 'No Entity found\t', text, spotDict
    else:
      print 'Tagged\t', text, '\t', spotDict
    qsplit = text.split()
    termSet = set(qsplit)
    termDict = getDictFromSet(qsplit)
    catList = self.scoreCategories(termSet, termDict, spotDict, topC)
    terms = self.aggregateTerms(text, catList)
    scoredTerms = {}
    for i in xrange(limit1, limit2, step):
      if i == 0:
        scoredTerms[i] = self.ranker.getTopKWithFilter(terms, i + 1, i + 50)
      else:
        scoredTerms[i] = self.ranker.getTopKWithFilter(terms, i, i + 50)

    return scoredTerms
Example #12
  def expandTextWithSubClusters(self, qText, clickText, topC, limit):

    spotDict = self.dexter.tagText(qText)
    entStatus = False
    if not spotDict:
      print 'No Entity Found\t', qText
    else:
      print 'Tagged ', qText, '\t', spotDict
      entStatus = True

    qSplit = qText.split()
    qSet = set(qSplit)
    qDict = getDictFromSet(qSplit)
    #Rank categories
    catList = self.scoreCategories(qSet, qDict, clickText, spotDict, topC)
    print qText, 'CatList ', catList
    #Rank subclusters
    termSet = self.aggregateTermsFromSubclusters(qSet, catList, limit + 100)
    #print len(termSet);
    #Rank terms
    scoredTerms = self.ranker.getTopKWithFilter(termSet, limit, limit + 50)
    return entStatus, scoredTerms
Example #13
  def scoreCategories(self, querySet, queryDict, clickText, spotDict, k):
    entityCatScore = {}

    cDict = getDictFromSet(clickText.split())
    combDict = combineDict(cDict, queryDict)
    cSet = set(cDict.keys())
    print 'cDict ', cDict, ' combDict ', combDict
    for entry, eDict in spotDict.iteritems():
      catList = eDict['cat'].lower().split()
      queryTerms = (querySet | cSet)
      queryTerms = queryTerms - set([entry])
      catScore = {}
      for cat in catList:
        pset = self.catManager.getPhraseSet(cat)  #unique phrases in cat
        if len(pset) == 0:
          print 'CAT NO PHRASE ', cat
        qInt = pset & queryTerms  #no of query terms cat contains
        score = 0.0
        for iphrase in qInt:
          score += self.catManager.getPhraseProb(cat, iphrase)
        if len(queryTerms) > 0:
          score *= (1.0 * len(qInt)) / len(queryTerms)

        #cosine score
        cVector = self.catManager.getVector(cat)
        cscore = get_cosine(combDict, cVector)

        #total score
        catScore[cat] = (cscore + score) / 2.0
      sortedScore = sorted(catScore.items(), reverse=True, key=lambda x: x[1])

      #get terms from all categories (k == 1000 means no cutoff); use a
      #local cutoff so one entity's short list does not shrink k for the rest
      if k == 1000 or k > len(sortedScore):
        kTop = len(sortedScore)
      else:
        kTop = k

      entityCatScore[entry] = sortedScore[0:kTop]

      print 'Query\t', querySet, ' Entity\t', entry, entityCatScore[entry]
    return entityCatScore
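
scoreCategories averages two signals per category: the probability mass of the query terms the category covers (scaled by coverage) and the cosine between the combined click/query vector and the category vector. It also needs combineDict; a sketch that simply sums the two frequency dicts, which is one reasonable reading of its use above:

def combineDict(dict1, dict2):
  #inferred helper: merge two term-frequency dicts by summing counts
  combined = dict(dict1)
  for word, count in dict2.items():
    combined[word] = combined.get(word, 0) + count
  return combined
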
Example #14
def main():
    parser = ap.ArgumentParser(description='Generate features for entity tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file', required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file', required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file', required=True)
    #NOTE: argparse's type=bool turns any non-empty string into True;
    #pass an empty string for False
    parser.add_argument('-u', '--uid', help='User id present or not', required=True, type=bool)
    parser.add_argument('-w', '--wtype', help='Phrase (phrase) or query (query) features', required=True)

    args = parser.parse_args()

    boolUid = args.uid

    #load the category list
    dbCatList = loadCategories(args.catFile)
    #print 'Categories',len(dbCatList)
    #load the type list
    dbTypeList = loadInstancesInList(args.typeFile)
    #print 'Types',len(dbTypeList)

    #query list
    queryList = {}
    #user list
    userList = {}
    #url list
    urlList = {}
    #session list
    sessionList = {}
    #entity List
    entityList = {}
    #category List
    categoryList = {}
    #type list
    typeList = {}

    ipaddress = 'localhost'
    tagURL = 'http://'+ipaddress+':8080/dexter-webapp/api/rest/annotate'

    cqid = 1
    sid = 1
    qid = None
    for session in getSessionTuples(args.iFile,'\t', 1560):
        print 'Session id and length', sid, len(session)
        for entry in session:
            query = entry[QUERY]
            #tag it with dexter and get all 3 parameters
            spotDict = tagQueryWithDexter(query, tagURL)
            #initialize first so later checks cannot hit an unbound name
            #when Dexter returns no spots
            updatedSpotDict = None
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict, dbCatList, dbTypeList)
            if args.wtype == 'query':
                #given wtype find the following
                if query not in queryList:
                    #print 'Mapping ', query , 'to ', cqid
                    queryList[query] = cqid
                    qid = cqid
                    cqid+=1
                else:
                    qid = queryList[query]
                updateDict(sessionList,sid, qid)

                if boolUid:
                    updateDict(userList, entry[USER], qid)
                if CLICKU in entry:
                    updateDict(urlList, entry[CLICKU],qid)
                if updatedSpotDict:
                    for spot in updatedSpotDict['spots']:
                        updateDict(categoryList,spot['cat'], qid)
                        updateDict(typeList,spot['type'], qid)
                        updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)

            if args.wtype == 'phrase' and updatedSpotDict:
                for spot in updatedSpotDict['spots']:
                    splits = query.split(spot['mention'])
                    for split in splits:
                        split = split.strip()
                        #remove stop words
                        split = filterStopWordsFromQuery(split)
                        if len(split) > 1:
                            if split not in queryList:
                                queryList[split] = cqid
                                qid = cqid
                                cqid+=1
                            else:
                                qid = queryList[split]
                            updateDict(sessionList,sid, qid)

                            if boolUid:
                                updateDict(userList, entry[USER], qid)
                            if CLICKU in entry:
                                updateDict(urlList, entry[CLICKU],qid)
                            if updatedSpotDict:
                                updateDict(categoryList,spot['cat'], qid)
                                updateDict(typeList,spot['type'], qid)
                                updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)
        sid+=1

    #write the features to the outfile
    outF = open(args.oFile,'w')

    for query, qid in queryList.items():
        outF.write(query)
        #generate ngrams
        queryVect = getDictFromSet(query.split())
        ngramString = getNGramsAsList(query,3)
        #column 1: ngrams
        outF.write('\t'+str(ngramString))
        #column 2: query vector
        outF.write('\t'+str(queryVect))


        if qid in urlList:
            outF.write('\t'+str(urlList[qid]))
        else:
            outF.write('\t{}')

        if qid in userList:
            outF.write('\t'+str(userList[qid]))
        else:
            outF.write('\t{}')

        if qid in entityList:
            outF.write('\t'+str(entityList[qid]))
        else:
            outF.write('\t{}')

        if qid in categoryList:
            outF.write('\t'+str(categoryList[qid]))
        else:
            outF.write('\t{}')

        if qid in typeList:
            outF.write('\t'+str(typeList[qid]))
        else:
            outF.write('\t{}')

        if qid in sessionList:
            outF.write('\t'+str(sessionList[qid]))
        else:
            outF.write('\t{}')

        outF.write('\n')

    outF.close()
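
main() funnels every feature through updateDict and later looks each table up by qid, so updateDict presumably keys its dict by qid and accumulates feature values underneath. A sketch consistent with that access pattern (the exact accumulation, counts versus a set, is a guess):

def updateDict(iDict, value, qid):
  #inferred helper: per query id, count how often each feature value
  #(session id, user, url, category, ...) co-occurred with it
  if qid not in iDict:
    iDict[qid] = {}
  iDict[qid][value] = iDict[qid].get(value, 0) + 1
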
Example #15
def getStatsPerQuery(argv):
  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  catVector = loadCategoryVector(argv[3])
  f1Dict = getCats(argv[2])
  sFound = 0.0
  sTotal = 0.0
  eTotal = set()
  eRemov = set()
  catFoundNoTerm = set()
  catNotFound = set()
  catTermFound = set()
  catEntity = set()
  outfile = open('match_session_dom.txt', 'w')
  #categoryVectors = {}
  for session in getSessionWithNL(argv[1]):
    catCount = {}
    entCount = {}
    querySpotList = {}
    for query in session:
      #find the entities in query
      try:
        #tagging is disabled here, so spotDict stays None and the lookup
        #below raises AttributeError, which the except clause prints
        spotDict = None  #tagQueryWithDexter(query, tagURL,catURL)
        querySpotList[query] = spotDict
        for text in spotDict.keys():
          for entry in spotDict[text]['cat'].split():
            catCount[entry] = catCount.setdefault(entry, 0) + 1
          entCount[text] = entCount.setdefault(text, 0) + 1
      except Exception as err:
        print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount

    found = False
    if len(catCount) > 0:
      #find the dominant entity
      maxEnt = max(entCount.values())
      #sessionQueryMapping = {}
      for query, spotList in querySpotList.iteritems():
        matchl = spotList.keys()
        for entry in matchl:
          eTotal.add(entry)
          if entCount[entry] < maxEnt:
            spotList.pop(entry, None)
            print 'Removing spot', query, entry
            eRemov.add(entry)
          else:
            #get the categories
            #catTermMatch = {}
            rquery = query.replace(entry, '')
            queryTerms = set(rquery.split())
            for cat in spotList[entry]['cat'].lower().split():
              catEntity.add(entry + '_' + cat)
              if cat in f1Dict:
                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                pVector = catVector[cat]
                queryDict = getDictFromSet(queryTerms)
                pTotal = sum(phrase1.values())
                pset = set(phrase1.keys())
                sint = pset & queryTerms
                score = 0.0
                cscore = get_cosine(queryDict, pVector)

                for iphrase in sint:
                  score += phrase1[iphrase] / pTotal
                if len(queryTerms) > 0:
                  score *= (1.0 * len(sint)) / len(queryTerms)

                if sint:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                str(cscore) + '\t' + ', '.join(sint) + '\n')
                  found = True
                  catTermFound.add(entry + '_' + cat)
                else:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                  catFoundNoTerm.add(cat + '_' + entry)
              else:
                outfile.write(
                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                catNotFound.add(cat + '_' + entry)

                #load the terms for category
                #check if these terms match
    if found:
      sFound += 1
    sTotal += 1
    outfile.write('\n')

  print 'Total Sessions ', sTotal
  print 'Sessions with dominant entity in AOL', sFound
  print '# Unique Entities', len(eTotal)
  print '# Removed Entities (non dominant)', len(eRemov)
  print '# no of entity types', len(catEntity)
  print '# no of entity types with terms match ', len(catTermFound)
  print '# no of entity types with no term match', len(catFoundNoTerm)
  print '# no of entity types with no match in AOL', len(catNotFound)
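
getStatsPerQuery reads per-category phrase files through loadPhrasesWithScore. Given that the result's values are summed and used as probabilities, a sketch assuming one tab-separated phrase/score pair per line:

def loadPhrasesWithScore(fileName):
  #inferred helper: phrase -> numeric score, one pair per line
  phraseDict = {}
  for line in open(fileName, 'r'):
    split = line.strip().split('\t')
    if len(split) == 2:
      phraseDict[split[0]] = float(split[1])
  return phraseDict
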
Example #16
def combineQueryFeatures(queryFile, spotFile, featFile, newFile):
  #load features
  featDict = {}
  i = 1
  urlDict = {}

  for line in open(featFile, 'r'):
    split = line.strip().split('\t')
    featDict[split[0].strip()] = split[1:]

  querySpots = {}
  for line in open(spotFile, 'r'):
    spotDict = ast.literal_eval(line)
    querySpots[spotDict['text']] = spotDict
  outF = open(newFile, 'w')

  #all queries
  for line in open(queryFile, 'r'):
    query = line.strip()
    queryFeat = []

    #getNString(query,3).decode('utf-8')
    #triString = str(triString.encode('ascii','ignore')).strip()
    triString = getNGramsAsList(query, 3)
    if len(triString) > 0:
      queryFeat.append(triString)
    else:
      queryFeat.append({})

    queryVect = getDictFromSet(query.split())
    if len(queryVect) > 0:
      queryFeat.append(queryVect)
    else:
      queryFeat.append({})

    if query in featDict:
      #normalize the users
      userString = getUserString(featDict[query][0])
      if len(userString) > 0:
        queryFeat.append(userString)
      else:
        queryFeat.append({})

      #normalize the urls
      i, urlDict, linkString = getUrlString(featDict[query][1], urlDict, i)
      if len(linkString) > 0:
        queryFeat.append(linkString)
      else:
        queryFeat.append({})
    else:
      print 'Query not found ', query
      #pad the user and url columns so the output stays aligned
      queryFeat += [{}, {}]

    if query in querySpots:
      spotDict = querySpots[query]  #ast.literal_eval(line)
      #cat, ent and type info
      result = getCatEntString(spotDict)
      for entry in result:
        if len(entry) > 0:
          queryFeat.append(entry)
        else:
          queryFeat.append({})
    else:
      queryFeat += [{}, {}, {}]
      #print queryFeat
    try:
      outF.write(query)
      for entry in queryFeat:
        outF.write('\t' + str(entry))
      outF.write('\n')
    except Exception:
      print 'ERROR ', queryFeat

  outF.close()
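
combineQueryFeatures threads a counter and a url-to-id map through getUrlString, so that helper most likely interns urls as integer ids. A sketch under the assumption that the stored field is the str() of a url-to-count dict, as written by the feature generators above:

import ast

def getUrlString(urlField, urlDict, i):
  #inferred helper: replace raw urls with stable integer ids and
  #return (next id, updated url map, id -> count dict for this query)
  linkString = {}
  for url, count in ast.literal_eval(urlField).items():
    if url not in urlDict:
      urlDict[url] = i
      i += 1
    linkString[urlDict[url]] = count
  return i, urlDict, linkString
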
Example #17
def getPrecRecall(opt, catList, f1Dict, catVector, queryTerms, aTerms, index):

  catScore = {}
  maxQs = -1000
  maxCat = ''

  notFound = set()
  for cat in catList:
    if cat in f1Dict:
      catScore[cat] = {'aP': 0.0,
                       'aR': 0.0,
                       'qS': 0.0,
                       'qInt': set(),
                       'aInt': set()}
      #phrase cat score
      phrase1 = loadPhrasesWithScore(f1Dict[cat])
      pTotal = sum(phrase1.values())  #total no of terms in cat
      pset = set(phrase1.keys())  #unique phrases in cat
      qInt = pset & queryTerms  #no of query terms cat contains
      score = 0.0
      for iphrase in qInt:
        score += phrase1[iphrase] / pTotal
      if len(queryTerms) > 0:
        score *= (1.0 * len(qInt)) / len(queryTerms)

      #cosine score
      queryDict = getDictFromSet(queryTerms)
      cVector = catVector[cat]
      cscore = get_cosine(queryDict, cVector)

      #total score
      catScore[cat]['qS'] = cscore + score
      if maxQs < catScore[cat]['qS']:
        maxQs = catScore[cat]['qS']
        maxCat = cat

      sortP = sorted(phrase1.items(), reverse=True, key=lambda x: x[1])
      #print 'sorted' , sortP[0],sortP[1]
      apset = set(x[0] for x in sortP[0:index])
      #print 'pSet ',apset

      aInt = aTerms & apset
      catScore[cat]['aP'] = (1.0 * len(aInt)) / len(aTerms)
      catScore[cat]['aR'] = (1.0 * len(aInt)) / len(apset)
      catScore[cat]['aInt'] = aInt
      catScore[cat]['qInt'] = qInt
    else:
      notFound.add(cat)

  if opt == 'max':
    if maxCat in catScore:
      return notFound, maxCat, catScore[maxCat]
    else:
      return notFound, None, {
          'aP': 0.0,
          'aR': 0.0,
          'qS': 0.0,
          'qInt': set(),
          'aInt': set()
      }

  else:
    avgScore = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
    for entry, cdict in catScore.iteritems():
      avgScore['aP'] += cdict['aP']
      avgScore['aR'] += cdict['aR']
      avgScore['qS'] += cdict['qS']
      avgScore['qInt'] |= cdict['qInt']
      avgScore['aInt'] |= cdict['aInt']

    if catScore:
      avgScore['aP'] /= len(catScore)
      avgScore['aR'] /= len(catScore)
      avgScore['qS'] /= len(catScore)

    return notFound, None, avgScore
