Example #1
import ast
import os

from nltk import stem  # assumption: the PorterStemmer used below comes from NLTK

# Project-specific helpers used below (loadClusters, CoOccurrence, CoOcManager,
# CoOccurSimScore, ScoreClusterTerms, loadFileInList, getTermList,
# getSessionWithXML, addedAndRemovedTerms, getPrecRecall, getQueryTermsStemmed,
# getTerms, getClustPrecMrr, plotMultipleSys, toTerms, stopSet, ashleelString)
# are assumed to be imported from the surrounding package.


def sampleEntityQueries(featFile, clusterFile):
  # For each query cluster, pick its dominant entity and sample the
  # co-occurring non-entity terms; write the samples to 'EntityQueries.sample'.
  queries = {}
  for line in open(featFile, 'r'):
    split = line.split('\t')
    entityDict = ast.literal_eval(split[5])
    qsplit = split[0].split()

    if len(qsplit) > 2:
      queries[split[0].strip()] = entityDict

  outFile = open('EntityQueries.sample', 'w')
  globalEnt = {}
  entTerms = {}
  # For each sufficiently large cluster, sum entity scores across its queries
  # and keep the highest-scoring entity as the cluster's entity.
  for cluster in loadClusters(clusterFile):
    if len(cluster) > 4:
      cent = {}
      for query in cluster:
        if query in queries:
          for entry, value in queries[query].items():
            if entry not in cent:
              cent[entry] = 0.0
            cent[entry] += value
      entSort = sorted(cent.items(), reverse=True, key=lambda x: x[1])
      toPrint = {}
      if len(entSort) > 0:
        bestEnt = entSort[0][0]
        if bestEnt not in globalEnt:
          globalEnt[bestEnt] = []
          entTerms[bestEnt] = set([])
        # Count content terms from queries that mention the best entity,
        # skipping stopwords, the entity's own tokens, and terms appearing in
        # ashleelString.
        for query in cluster:
          if query in queries and bestEnt in queries[query]:
            qsplit = query.split()  #getNGramsAsList(query,2) #
            for entry in qsplit:
              if (len(entry) > 2 and entry not in stopSet
                  and entry not in bestEnt and entry not in ashleelString):
                if entry not in toPrint:
                  toPrint[entry] = 0.0
                toPrint[entry] += 1.0
        sortP = sorted(toPrint.items(), reverse=True, key=lambda x: x[1])
        # Keep up to eight terms that occur more than once, and record the set
        # only if it has more than three terms and less than 20% overlap with
        # terms previously collected for this entity.
        fset = set([])
        if len(sortP) > 3:
          for entry in sortP:
            if entry[1] > 1:
              fset.add(entry[0])
            if len(fset) > 7:
              break
          covered = 0
          for entry in fset:
            if entry not in entTerms[bestEnt]:
              entTerms[bestEnt].add(entry)
            else:
              covered += 1.0
          if len(fset) > 3 and covered / len(fset) < .20:
            globalEnt[bestEnt].append(fset)
          #else:
          #	print covered, len(fset)
  # Write out only entities that produced more than one sampled term set.
  for bestEnt, sList in globalEnt.items():
    if len(sList) > 1:
      for sortP in sList:
        outFile.write(bestEnt.encode('utf-8') + '\t' + '\t'.join(sortP) + '\n')
  outFile.close()
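

# A minimal usage sketch, not part of the original source: featFile is assumed
# to be a tab-separated file whose sixth field is a Python-literal entity dict
# (as parsed above), and clusterFile whatever format loadClusters expects.
# Both paths below are hypothetical.
#
#   sampleEntityQueries('query.features.tsv', 'query.clusters')
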
def main(argv):

  # Scorer: build a co-occurrence-based term-similarity scorer from the
  # session co-occurrence data in argv[2].
  coSessOccur = CoOccurrence()
  coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ')
  tScorer = CoOccurSimScore(coSessOcMan)
  cScorer = ScoreClusterTerms()

  #vocab = set()
  i = 0
  prec = {}
  mrr = {}
  lim = 55

  queryList = loadFileInList(argv[5])
  termList, termDict = getTermList(queryList)
  print len(termList)
  added = 0
  oracle_prec = 0.0
  oracle_mrr = 0.0
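  # Oracle-style reference: score the full candidate term list against the
  # terms the user actually added later in each session.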
  for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    query = session[0].strip()
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1

  print 'Oracle prec and MRR ', oracle_prec / added, oracle_mrr / added

  porter = stem.porter.PorterStemmer()
  # ttype selects where the gold "added terms" come from: 'query' (terms added
  # in later queries of the session), 'title' (terms from result titles), and
  # anything else uses result titles plus summaries.
  ttype = argv[6]

  print ttype

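  # Evaluate each clustering file: rank its cluster terms for the first query
  # of every session and measure precision/MRR at several cutoffs against that
  # session's added terms.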
  for iFile in os.listdir(argv[3]):
    qclusters = loadClusters(argv[3] + '/' + iFile)
    clusters, clusIndex = toTerms(qclusters)

    print iFile, len(clusters)
    prec[iFile] = {}
    mrr[iFile] = {}
    added = 0.0
    i = 1
    for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
        argv[1]):
      i += 1
      query = session[0].strip()
      qSet = getQueryTermsStemmed(query, porter)

      print 'Query ', query, qSet
      if ttype == 'query':
        aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
      elif ttype == 'title':
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
      else:
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
        bTerms = getTerms(cSummary, qSet, termDict, porter, range(
            1, len(session) - 1))
        aTerms = aTerms | bTerms
        #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None )

      if len(aTerms) > 0:
        # Rank candidate terms for this query from the clusters; the ranked
        # list is compared against the gold added terms below.
        terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim)
        #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim)
        print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len(
            terms), terms
        #for topk in range(1,lim,5):
        prec1, mrr1 = getClustPrecMrr(terms, aTerms)  # returns a list
        print 'METRIC', iFile, i, prec1, mrr1
        #print topk , prec1, mrr1
        for topk in prec1.keys():
          if topk not in prec[iFile]:
            prec[iFile][topk] = []
            mrr[iFile][topk] = []

          prec[iFile][topk].append(prec1[topk])
          mrr[iFile][topk].append(mrr1[topk])

          #prec[iFile][topk] += prec1
          #mrr[iFile][topk] += mrr1
        added += 1.0
      #if i == 3:
      #	break

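  # Average precision and MRR per file and cutoff over the sessions that
  # contributed scores (the `added` sessions with a non-empty gold term set).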
  for fName, scoreDict in prec.items():
    for pos in scoreDict.keys():
      print 'Prec all', fName, pos, len(scoreDict[pos])
      total = sum(scoreDict[pos])
      prec[fName][pos] = total / added  #len(scoreDict[pos])
      print 'Prec', fName, pos, prec[fName][pos], total

  for fName, scoreDict in mrr.items():
    for pos in scoreDict.keys():
      print 'Mrr all', fName, pos, len(scoreDict[pos])
      total = sum(mrr[fName][pos])
      mrr[fName][pos] = total / added  #len(scoreDict[pos])
      print 'MRR', fName, pos, mrr[fName][pos], total
  #for entry in prec.keys():
  #for t in prec[entry].keys():
  #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added
  #prec[entry][t]/=added

  #for entry in mrr.keys():
  #for t in mrr[entry].keys():
  #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added
  #mrr[entry][t]/=added

  print 'Plotting Precision and MRR'

  plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png',
                  'Term Prediction Prec Plot')
  plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png',
                  'Term Prediction MRR Plot')
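

# Entry point (not shown in the original listing): a minimal sketch assuming
# the script is run directly. The argument layout is inferred from the usage
# above: argv[1] = session XML file, argv[2] = co-occurrence data,
# argv[3] = directory of cluster files, argv[4] = output prefix for plots,
# argv[5] = query list file, argv[6] = term source ('query', 'title', or
# anything else for titles plus summaries).
if __name__ == '__main__':
  import sys

  main(sys.argv)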