def findQueryCounts(queryFile):
  """Accumulate frequency-weighted co-occurrence counts of stemmed query terms.

  Each non-blank line of `queryFile` is stripped, lower-cased and split on
  tabs: the first field is the query text, the second its frequency (parsed
  as a float).  The query is reduced to terms with getQueryTerms, the joined
  terms are turned into sorted unigrams with getNGramsAsList, and every
  unordered pair of unigrams that are longer than two characters and not in
  stopSet adds the query frequency to that pair's total.

  Args:
    queryFile: path of the tab-separated query/frequency file.

  Returns:
    dict mapping 'stem1 stem2' (the Porter stems of the two terms joined by
    a single space, in the sorted order of the unstemmed terms) to the
    summed frequency of the queries containing both terms.

  Raises:
    IndexError: a non-blank line has no second tab-separated field.
    ValueError: the frequency field cannot be parsed as a float.
  """
  pairs = {}
  porter = stem.porter.PorterStemmer()
  # The with-block closes the file even when a malformed line raises.
  with open(queryFile, 'r') as queryLines:
    for line in queryLines:
      line = line.strip()
      if not line:
        # A blank line (e.g. at the end of the file) carries no query.
        continue
      split = line.lower().split('\t')
      query = split[0].strip()
      freq = float(split[1])
      qTerms = ' '.join(getQueryTerms(query))
      if len(qTerms) <= 3:
        continue
      ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
      # Filter and stem every term once per query instead of once per pair.
      stems = [porter.stem(term) for term in ngrams
               if term not in stopSet and len(term) > 2]
      for i, stemd1 in enumerate(stems):
        for stemd2 in stems[i + 1:]:
          key = stemd1 + ' ' + stemd2
          pairs[key] = pairs.get(key, 0.0) + freq
  return pairs
def findSessionCounts(queryFile, outFile, wordSet):
  """Count session-level co-occurrences of vocabulary terms, written in chunks.

  For every session yielded by getSessionWithQuery(queryFile), the terms of
  all its queries (from getQueryTerms) are pooled into one set.  The pooled
  terms are joined, re-tokenised into sorted unigrams with getNGramsAsList,
  and every unordered pair of unigrams that are longer than two characters,
  absent from stopSet and present in `wordSet` adds 1.0 to the count stored
  under the key 'term1 term2'.

  Whenever the table reaches 9,000,000 distinct pairs it is handed to
  writeDictToFile(outFile, table, sess), emptied, and `sess` is incremented.
  Whatever is left after the last session is written the same way.

  Args:
    queryFile: passed unchanged to getSessionWithQuery.
    outFile: passed unchanged to writeDictToFile.
    wordSet: container of allowed terms (only membership is tested).

  Returns:
    None; the results are only emitted through writeDictToFile.
  """
  flushAt = 9000000  # distinct pairs held in memory before a flush
  coOccur = {}
  sess = 0  # NOTE(review): presumably a chunk index for writeDictToFile - confirm
  qid = 0.0
  qSet = set()
  for session in getSessionWithQuery(queryFile):
    qSet.clear()
    for query in session:
      qid += 1
      # Tokenise each query once and reuse the result.
      terms = getQueryTerms(query)
      if len(terms) > 0:
        qSet |= terms
      if qid % 1000000 == 0:
        # Progress report: queries seen so far and current table size.
        print(qid)
        print(len(coOccur))

    qTerms = ' '.join(qSet)
    if len(qTerms) <= 3 or len(qSet) <= 1:
      continue
    ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
    # Keep only the terms that may take part in a pair, in sorted order.
    kept = [term for term in ngrams
            if term not in stopSet and len(term) > 2 and term in wordSet]
    for i, first in enumerate(kept):
      for second in kept[i + 1:]:
        key = first + ' ' + second
        coOccur[key] = coOccur.get(key, 0.0) + 1.0
        if len(coOccur) >= flushAt:
          writeDictToFile(outFile, coOccur, sess)
          coOccur = {}
          sess += 1

  # The table is local and never returned, so without this final flush every
  # count gathered since the last threshold flush would be discarded.
  if coOccur:
    writeDictToFile(outFile, coOccur, sess)
# Example no. 3
# 0
def calWordCount(fileName):
  words = {}
  for line in open(fileName, 'r'):
    split = line.lower().split('\t')
    query = split[1].strip()
    qsplit = getQueryTerms(query)
    for entry in qsplit:
      if entry not in words:
        words[entry] = 0.0
      words[entry] += 1

  wsort = sorted(words.items(), reverse=True, key=lambda x: x[1])
  for entry in wsort:
    if entry[1] > 3 and len(entry[0]) > 2:
      print entry[0], '\t', entry[1]
# Example no. 4
# 0
def getURLFeatures(fileName, wordFeat):
  """Feed the URL statistics of every query into the per-word feature objects.

  Each line of `fileName` is split on tabs.  The first field is tokenised
  with getQueryTerms; the second is scanned with the module-level `linkP`
  pattern, and each match is parsed into a (url, count) pair.  For every
  query term whose Porter stem is longer than two characters and is a key of
  `wordFeat`, updateURLStats(url, count) is called on wordFeat[stem] once
  per parsed pair.

  Args:
    fileName: path of the tab-separated input file.
    wordFeat: mapping from stemmed word to an object exposing
      updateURLStats(url, count); it is updated in place.

  Raises:
    IndexError: a line has no second tab-separated field.
  """
  # The with-block closes the file even when a malformed line raises.
  with open(fileName, 'r') as lines:
    for line in lines:
      split = line.split('\t')
      queryTerms = getQueryTerms(split[0])
      urlFeatures = []
      # NOTE(review): assumes linkP.findall yields strings shaped like
      # '(url, count)' - confirm against the definition of linkP.
      for tup in linkP.findall(split[1]):
        try:
          # Split on the LAST comma only, so a URL that itself contains
          # commas is kept instead of failing the int() conversion.
          url, count = tup.rsplit(',', 1)
          # Drop the opening '(' of the url and the closing ')' of the count.
          urlFeatures.append((url[1:].strip(), int(count[:-1])))
        except ValueError:
          # No comma, or a count that is not an integer: skip this link.
          continue
      for oentry in queryTerms:
        entry = porter.stem(oentry)
        if len(entry) > 2 and entry in wordFeat:
          for url, count in urlFeatures:
            wordFeat[entry].updateURLStats(url, count)
def predictTerms(queryList, y, qclusters):
  termList, termDict = getTermList(queryList)
  oracle_prec = 0.0
  oracle_mrr = 0.0
  added = 0
  cScorer = ScoreClusterTerms()
  for session in y:
    query = session[0]
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1
  print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added, added
  #porter = stem.porter.PorterStemmer();
  clusters, clusIndex = toTerms(qclusters)
  lim = 5
  i = 0
  prec = {}
  mrr = {}
  pf = 0.0
  pr = 0.0
  for session in y:
    query = session[0].strip()
    qSet = getQueryTerms(query)  #getQueryTermsStemmed(query, porter);
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim)

      if len(terms) > 0:
        #print len(aTerms), len(terms)
        prec1, mrr1 = getClustPrecRecall(terms, aTerms)  # returns a list
        #print 'METRIC',i, prec1, mrr1
        #print topk , prec1, mrr1
        if sum(prec1) > 0:
          pf += 1.0

        if sum(mrr1) > 0:
          pr += 1.0

        for topk in range(len(prec1)):
          if topk not in prec:
            prec[topk] = []
            mrr[topk] = []

          prec[topk].append(prec1[topk])
          mrr[topk].append(mrr1[topk])
      i += 1

  retPrec = {}
  retRecall = {}

  for entry, ls in prec.items():
    print 'Prec @', entry, np.mean(ls)
    retPrec[entry] = np.mean(ls)

  for entry, ls in mrr.items():
    print 'Recall @', entry, np.mean(ls)
    retRecall[entry] = np.mean(ls)

  print 'Percentage ', pf / i, pr / i

  return retPrec, retRecall
# Example no. 6
# 0
 def findNonEntTerms(self, query, sdict):
   """Return the terms of `query` that remain once every entity is deleted.

   Each item obtained by iterating `sdict` is removed from the query text
   with str.replace (all occurrences, wherever the substring appears), and
   what is left is tokenised with getQueryTerms.
   """
   stripped = query
   for ent in sdict:
     stripped = stripped.replace(ent, '')
   return getQueryTerms(stripped)
# Example no. 7
# 0
def main(argv):
    ipaddress = "localhost"
    # dexter object
    tagURL = "http://" + ipaddress + ":8080/rest/annotate"
    catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories"
    dexter = Dexter(tagURL, catURL, argv[5])

    # load the Category co-occurrence bit
    catCoMan = CoOcManager(argv[4], CoOccurrence(), " ")

    # category vector
    catVect = loadCategoryVector(argv[2])
    catManage1 = CategoryManager(catVect, argv[3], Category)
    catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster)

    # ranker
    ranker = Ranker()
    totalVocab = loadFileInList(argv[6])
    # task extraction
    # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000);
    qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab)
    # taskK = argv[5][argv[5].rfind('/')+1:]

    wordFeatMan = None
    # WordManager(argv[8],False);

    # expansion
    # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan);
    entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan)
    # term expansion
    coOccExp = CoOccurExpansion(catCoMan, None, ranker)
    # randomWalk
    # randWalk = RandomWalk(argv[2],argv[3],ranker)
    prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}
    mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}

    ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}
    ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}

    """
	sess_prec = {};
	sess_mrr = {};
	"""
    covered = {}

    i = 0

    porter = stem.porter.PorterStemmer()

    ttype = argv[10]

    for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]):
        query = session[0]
        qSet = getQueryTerms(query)
        # print 'Title, Summary clicked ',cTitle[0], cSummary[0];
        aTerms = None
        # cText = normalize(' '.join(cTitle[0]),porter);
        if ttype == "query":
            aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab)
        elif ttype == "title":
            aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1))
        else:
            aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1))
            bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1))
            aTerms = aTerms | bTerms

        print i, "Query", query, aTerms, len(aTerms)

        if len(aTerms) > 0:  # and query not in covered:
            covered[query] = 1

            coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5)

            # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5);
            entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5)

            qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5)
            # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5)
            # randExpTerms = randWalk.expandTextWithStep(query,55,105,5)
            if not entStatus1:
                print i, "Ent False", query

                # addLen = getBand(len(aTerms));
                # if addLen not in sess_prec:
                # 	sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} };
                # 	sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} };

                # for noTerms in entExpTerms1.keys():
                # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms;
                # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms);
                # prec = updateStats(noTerms, 'ent',prec1, prec);
                # mrr = updateStats(noTerms, 'ent',mrr1, mrr);
                # if entStatus1:
                # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec)
                # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr);
                ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen])
                ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]);
                # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1;
                #
            for noTerms in entExpTerms2.keys():
                print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms)
                prec = updateStats(noTerms, "entSub", prec1, prec)
                mrr = updateStats(noTerms, "entSub", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr)
                    # sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen])
                    # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]);
                print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

            for noTerms in qccTaskTerms.keys():
                print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms)
                prec = updateStats(noTerms, "qccTask", prec1, prec)
                mrr = updateStats(noTerms, "qccTask", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr)
                """
				sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen])
				sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]);
				"""
                print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

                # for noTerms in htcTaskTerms.keys():
                # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms
                # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms)
                # prec = updateStats(noTerms, 'htcTask',prec1, prec)
                # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr);
                # if entStatus1:
                # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec)
                # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr);
                ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen])
                ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]);
                #
                # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1

            for noTerms in coExpTerms.keys():
                print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms)
                prec = updateStats(noTerms, "co", prec1, prec)
                mrr = updateStats(noTerms, "co", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "co", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr)
                """
				sess_prec[addLen] = updateStats(noTerms, 'co',prec1, sess_prec[addLen])
				sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]);
				"""
                print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

        else:
            pass
            # print 'NO ADDED TERMS in', i;
        i += 1

    printMetric(prec, "entSub", "Prec")
    printMetric(mrr, "entSub", "Mrr")

    printMetric(prec, "ent", "Prec")
    printMetric(mrr, "ent", "Mrr")

    printMetric(prec, "htcTask", "Prec")
    printMetric(mrr, "htcTask", "Mrr")

    printMetric(prec, "qccTask", "Prec")
    printMetric(mrr, "qccTask", "Mrr")

    printMetric(prec, "co", "Prec")
    printMetric(mrr, "co", "Mrr")

    printMetric(ent_prec, "entSub", "EntPrec")
    printMetric(ent_mrr, "entSub", "EntMrr")

    printMetric(ent_prec, "ent", "EntPrec")
    printMetric(ent_mrr, "ent", "EntMrr")

    printMetric(ent_prec, "htcTask", "EntPrec")
    printMetric(ent_mrr, "htcTask", "EntMrr")

    printMetric(ent_prec, "qccTask", "EntPrec")
    printMetric(ent_mrr, "qccTask", "EntMrr")

    printMetric(ent_prec, "co", "EntPrec")
    printMetric(ent_mrr, "co", "EntMrr")

    plotMultipleSys(
        prec,
        "No of Terms",
        "Prec",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png",
        "Term Prediction Prec Plot",
    )
    plotMultipleSys(
        mrr,
        "No of Terms",
        "MRR",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png",
        "Term Prediction MRR Plot",
    )
    plotMultipleSys(
        ent_prec,
        "No of Terms",
        "Prec",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png",
        "Term Prediction Prec Plot (Ent queries)",
    )
    plotMultipleSys(
        ent_mrr,
        "No of Terms",
        "MRR",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png",
        "Term Prediction MRR Plot (Ent queries)",
    )

    # htcTask.closeIndex();
    qccTask.closeIndex()
    """