示例#1
0
def getTerms(termDict, qSet, totalVocab, stemmer, indices):
    termSet = set()
    for entry in indices:
        tList = termDict[entry]
        for text in tList:
            terms = getQueryTermsStemmed(text, stemmer)
            termSet |= terms
            # print 'TermSet ',termSet, qSet;

    termSet = termSet - qSet
    termSet = termSet & totalVocab
    return termSet
def main(argv):

  #Scorer
  coSessOccur = CoOccurrence()
  coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ')
  tScorer = CoOccurSimScore(coSessOcMan)
  cScorer = ScoreClusterTerms()

  #vocab = set()
  i = 0
  prec = {}
  mrr = {}
  lim = 55

  queryList = loadFileInList(argv[5])
  termList, termDict = getTermList(queryList)
  print len(termList)
  added = 0
  oracle_prec = 0.0
  oracle_mrr = 0.0
  for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    query = session[0].strip()
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1

  print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added

  porter = stem.porter.PorterStemmer()
  ttype = argv[6]

  print ttype

  for iFile in os.listdir(argv[3]):
    qclusters = loadClusters(argv[3] + '/' + iFile)
    clusters, clusIndex = toTerms(qclusters)

    print iFile, len(clusters)
    prec[iFile] = {}
    mrr[iFile] = {}
    added = 0.0
    i = 1
    for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
        argv[1]):
      i += 1
      query = session[0].strip()
      qSet = getQueryTermsStemmed(query, porter)

      print 'Query ', query, qSet
      if ttype == 'query':
        aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
      elif ttype == 'title':
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
      else:
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
        bTerms = getTerms(cSummary, qSet, termDict, porter, range(
            1, len(session) - 1))
        aTerms = aTerms | bTerms
        #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None )

      if len(aTerms) > 0:
        terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim)
        #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim)
        print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len(
            terms), terms
        #for topk in range(1,lim,5):
        prec1, mrr1 = getClustPrecMrr(terms, aTerms)  # returns a list
        print 'METRIC', iFile, i, prec1, mrr1
        #print topk , prec1, mrr1
        for topk in prec1.keys():
          if topk not in prec[iFile]:
            prec[iFile][topk] = []
            mrr[iFile][topk] = []

          prec[iFile][topk].append(prec1[topk])
          mrr[iFile][topk].append(mrr1[topk])

          #prec[iFile][topk] += prec1
          #mrr[iFile][topk] += mrr1
        added += 1.0
      #if i == 3:
      #	break

  for fName, scoreDict in prec.items():
    for pos in scoreDict.keys():
      print 'Prec all', fName, pos, len(scoreDict[pos])
      total = sum(scoreDict[pos])
      prec[fName][pos] = total / added  #len(scoreDict[pos])
      print 'Prec', fName, pos, prec[fName][pos], total

  for fName, scoreDict in mrr.items():
    for pos in scoreDict.keys():
      print 'Mrr all', fName, pos, len(scoreDict[pos])
      total = sum(mrr[fName][pos])
      mrr[fName][pos] = total / added  #len(scoreDict[pos])
      print 'MRR', fName, pos, mrr[fName][pos], total
  #for entry in prec.keys():
  #for t in prec[entry].keys():
  #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added
  #prec[entry][t]/=added

  #for entry in mrr.keys():
  #for t in mrr[entry].keys():
  #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added
  #mrr[entry][t]/=added

  print 'Plotting Precision and MRR'

  plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png',
                  'Term Prediction Prec Plot')
  plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png',
                  'Term Prediction MRR Plot')