Example No. 1
 def _get_frequencies(self, wids):
     d = {}
     dget = d.get
     for wid in wids:
         d[wid] = dget(wid, 0) + 1
     Wsquares = 0.0
     for wid, count in d.items():
         w = doc_term_weight(count)
         Wsquares += w * w
         d[wid] = w
     W = math.sqrt(Wsquares)
     for wid, weight in d.items():
         d[wid] = scaled_int(weight / W)
     return d, scaled_int(W)
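
The method above builds a document's cosine weight vector: raw counts are mapped through doc_term_weight, the Euclidean norm W of the weight vector is computed, and each weight is divided by W before being converted to a scaled integer. A minimal self-contained sketch of the same math, assuming doc_term_weight(count) = 1 + ln(count) and a SCALE_FACTOR of 1024 (both assumptions, for illustration only):

    import math

    SCALE_FACTOR = 1024.0  # assumed scale used by scaled_int

    def scaled_int(f):
        # store a float as an integer, scaled to preserve some precision
        return int(f * SCALE_FACTOR + 0.5)

    def doc_term_weight(count):
        # assumed definition: 1 plus the natural log of the term count
        return 1.0 + math.log(count)

    def get_frequencies(wids):
        # count occurrences of each word id
        d = {}
        for wid in wids:
            d[wid] = d.get(wid, 0) + 1
        # weight each count and accumulate the squared norm
        Wsquares = 0.0
        for wid, count in d.items():
            w = doc_term_weight(count)
            Wsquares += w * w
            d[wid] = w
        W = math.sqrt(Wsquares)
        # normalize every weight by W and scale
        return {wid: scaled_int(w / W) for wid, w in d.items()}, scaled_int(W)

    print(get_frequencies([1, 2, 2, 3]))  # ({1: 464, 2: 786, 3: 464}, 2259)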
Example No. 2
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L
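
The TF formula in the comment above can be checked with plain arithmetic. A small sketch, assuming the common Okapi parameter values K1 = 1.2 and B = 0.75 (the real values come from the index instance):

    K1, B = 1.2, 0.75          # assumed parameter values, for illustration
    f = 3                      # term frequency f(D, t) in the document
    doclen, meandoclen = 4, 2  # document length and mean document length

    lenweight = (1.0 - B) + B * doclen / meandoclen  # 0.25 + 0.75 * 2 = 1.75
    tf = f * (K1 + 1.0) / (f + K1 * lenweight)       # 6.6 / 5.1 ≈ 1.29
    print(tf)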
Example No. 3
 def _ranking_idf(self):
     word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
     idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
     for i in range(len(self.words)):
         word = self.words[i]
         eq(word_freqs[i], self.index._get_ft(word))
         eq(scaled_int(idfs[i]), self.index._get_wt(word))
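
The expected values above are consistent with inverse_doc_frequency(ft, N) = ln(1 + N/ft) over a corpus of N = 6 documents: ln(1 + 6/2) ≈ 1.39, ln(1 + 6/1) ≈ 1.95 and ln(1 + 6/3) ≈ 1.10. A quick check under that assumption:

    import math

    N = 6.0  # corpus size inferred from the expected idfs (an assumption)
    for ft in (2, 1, 3):
        # idf = ln(1 + N / ft)
        print(ft, round(math.log(1.0 + N / ft), 2))  # 1.39, 1.95, 1.10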
Example No. 4
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L
Example No. 5
 def _get_frequencies(self, wids):
     d = {}
     dget = d.get
     for wid in wids:
         d[wid] = dget(wid, 0) + 1
     Wsquares = 0.0
     for wid, count in d.items():
         w = doc_term_weight(count)
         Wsquares += w * w
         d[wid] = w
     W = math.sqrt(Wsquares)
     for wid, weight in d.items():
         d[wid] = scaled_int(weight / W)
     return d, scaled_int(W)
Example No. 6
 def _get_frequencies(self, wids):
     d = {}
     dget = d.get
     for wid in wids:
         d[wid] = dget(wid, 0) + 1
     Wsquares = 0.0
     for wid, count in d.items():
         w = doc_term_weight(count)
         Wsquares += w * w
         d[wid] = w
     W = math.sqrt(Wsquares)
     for wid, weight in d.items():
         d[wid] = scaled_int(weight / W)
     return d, scaled_int(W)
Example No. 7
 def query_weight(self, terms):
     wids = []
     for term in terms:
         wids += self._lexicon.termToWordIds(term)
     N = float(self.document_count())
     sum = 0.0
     for wid in self._remove_oov_wids(wids):
         wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
         sum += wt ** 2.0
     return scaled_int(math.sqrt(sum))
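
query_weight here is the Euclidean norm of the query's idf vector, returned as a scaled integer. The same computation with plain floats, assuming inverse_doc_frequency(ft, N) = ln(1 + N/ft):

    import math

    def query_norm(doc_freqs, N):
        # doc_freqs: for each query term, the number of documents containing it
        total = 0.0
        for ft in doc_freqs:
            wt = math.log(1.0 + N / ft)  # assumed inverse_doc_frequency
            total += wt ** 2.0
        return math.sqrt(total)

    print(round(query_norm([1, 3], 6.0), 2))  # ≈ 2.23 (norm of idfs ≈ 1.95 and ≈ 1.10)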
Example No. 8
 def query_weight(self, terms):
     wids = []
     for term in terms:
         wids += self._lexicon.termToWordIds(term)
     N = float(self.document_count())
     sum = 0.0
     for wid in self._remove_oov_wids(wids):
         wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)
         sum += wt ** 2.0
     return scaled_int(math.sqrt(sum))
Example No. 9
    def _ranking_tf(self):
        # matrix of term weights: the rows are docids and the
        # columns are indexes into self.words:
        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
                 (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
                 (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]

        for i in range(len(l_Wd)):
            docid = i + 1
            scaled_Wd = scaled_int(l_Wd[i])
            eq(scaled_Wd, self.index._get_Wd(docid))
            wdts = [scaled_int(t) for t in l_wdt[i]]
            for j in range(len(wdts)):
                wdt = self.index._get_wdt(docid, self.words[j])
                eq(wdts[j], wdt)
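
The expected figures above follow if doc_term_weight(count) = 1 + ln(count) (so a count of 2 gives ≈ 1.7) and Wd is the Euclidean norm of the document's weight row. Reconstructing the first row from hypothetical counts under those assumptions:

    import math

    def doc_term_weight(count):
        # assumed definition: 1 plus the natural log of the term count
        return 1.0 + math.log(count)

    # counts behind the first l_wdt row: 1 -> weight 1.0, 2 -> weight ~1.7
    counts = [1, 0, 0, 1, 0, 0, 0, 2, 2, 0]
    weights = [doc_term_weight(c) if c else 0.0 for c in counts]
    Wd = math.sqrt(sum(w * w for w in weights))
    print(round(Wd, 2))  # 2.78, matching l_Wd[0]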
Example No. 10
 def _ranking_queries(self):
     queries = ['eat', 'porridge', 'hot OR porridge',
                'eat OR nine OR day OR old OR porridge']
     wqs = [1.95, 1.10, 1.77, 3.55]
     results = [[(6, 0.71)],
                [(1, 0.61), (2, 0.58), (5, 0.71)],
                [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
                [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
     for i in range(len(queries)):
         raw = queries[i]
         q = QueryParser(self.lexicon).parseQuery(raw)
         wq = self.index.query_weight(q.terms())
         eq(wq, scaled_int(wqs[i]))
         r, n = self.zc_index.query(raw)
         self.assertEqual(len(r), len(results[i]))
         # convert the expected results to a dict for easier checking
         d = {}
         for doc, score in results[i]:
             d[doc] = scaled_int(score)
         for doc, score in r:
             score = scaled_int(float(score / SCALE_FACTOR) / wq)
             self.assertTrue(0 <= score <= SCALE_FACTOR)
             eq(d[doc], score)
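
The last loop converts a raw engine score into a value comparable with the expected figures: undo the scaled_int scaling, divide by the query weight wq, and re-scale; the result must stay within [0, SCALE_FACTOR]. A tiny sketch of that normalization, assuming SCALE_FACTOR = 1024:

    SCALE_FACTOR = 1024.0  # assumed scale used by scaled_int

    def scaled_int(f):
        return int(f * SCALE_FACTOR + 0.5)

    def normalize(raw_score, wq):
        # mirror the test: undo the score scaling, then divide by the query weight
        return scaled_int(float(raw_score) / SCALE_FACTOR / wq)

    # a raw score of wq * SCALE_FACTOR normalizes to scaled_int(1.0) == 1024
    print(normalize(2000 * 1024, 2000))  # 1024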
Example No. 11
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(self.document_count())
     L = []
     DictType = type({})
     for wid in wids:
         assert wid in self._wordinfo  # caller responsible for OOV
         d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         if isinstance(d2w, DictType):
             d2w = IIBucket(d2w)
         L.append((d2w, scaled_int(idf)))
     return L
Example No. 12
    def _checkAbsoluteScores(self):
        self.assertEqual(self.index._totaldoclen(), 6)
        # So the mean doc length is 2.  We use that later.

        r, num = self.zc_index.query('one')
        self.assertEqual(num, 3)
        self.assertEqual(len(r), 3)

        # Because our Okapi's B parameter is > 0, and 'one' only appears
        # once in each doc, the verbosity hypothesis favors shorter docs.
        self.assertEqual([doc for doc, score in r], [1, 2, 3])

        # The way the Okapi math works, a word that appears exactly once in
        # an average (length) doc gets tf score 1.  Our second doc has
        # an average length, so its score should be 1 (tf) times the
        # inverse doc frequency of 'one'.  But 'one' appears in every
        # doc, so its IDF is log(1 + 3/3) = log(2).
        self.assertEqual(r[1][1], scaled_int(inverse_doc_frequency(3, 3)))

        # Similarly for 'two'.
        r, num = self.zc_index.query('two')
        self.assertEqual(num, 2)
        self.assertEqual(len(r), 2)
        self.assertEqual([doc for doc, score in r], [2, 3])
        self.assertEqual(r[0][1], scaled_int(inverse_doc_frequency(2, 3)))

        # And 'three', except that doesn't appear in an average-size doc, so
        # the math is much more involved.
        r, num = self.zc_index.query('three')
        self.assertEqual(num, 1)
        self.assertEqual(len(r), 1)
        self.assertEqual([doc for doc, score in r], [3])
        idf = inverse_doc_frequency(1, 3)
        meandoclen = 2.0
        lengthweight = 1.0 - OkapiIndex.B + OkapiIndex.B * 3 / meandoclen
        tf = (1.0 + OkapiIndex.K1) / (1.0 + OkapiIndex.K1 * lengthweight)
        self.assertEqual(r[0][1], scaled_int(tf * idf))
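
Plugging numbers into the 'three' case at the end: with the assumed defaults K1 = 1.2 and B = 0.75, a document of length 3 against a mean length of 2 gives lengthweight = 0.25 + 0.75 * 1.5 = 1.375, tf = 2.2 / (1 + 1.2 * 1.375) ≈ 0.83, idf = ln(1 + 3/1) ≈ 1.386, so the expected score is scaled_int(≈ 1.151):

    import math

    K1, B = 1.2, 0.75  # assumed OkapiIndex defaults
    meandoclen = 2.0   # 6 words over 3 docs, per the assertion above
    lengthweight = 1.0 - B + B * 3 / meandoclen  # doc 3 has length 3
    tf = (1.0 + K1) / (1.0 + K1 * lengthweight)  # f(D, t) == 1
    idf = math.log(1.0 + 3.0 / 1.0)              # 'three' occurs in 1 of 3 docs
    print(round(tf * idf, 3))                    # ≈ 1.151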
Example No. 13
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(len(self._docweight))
     L = []
     DictType = type({})
     for wid in wids:
         assert wid in self._wordinfo  # caller responsible for OOV
         d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         if isinstance(d2w, DictType):
             d2w = IIBucket(d2w)
         L.append((d2w, scaled_int(idf)))
     return L
Example No. 14
 def query_weight(self, terms):
     # Get the wids.
     wids = []
     for term in terms:
         termwids = self._lexicon.termToWordIds(term)
         wids.extend(termwids)
     # The max score for term t is the maximum value of
     #     TF(D, t) * IDF(Q, t)
     # We can compute IDF directly, and as noted in the comments below
     # TF(D, t) is bounded above by 1+K1.
     N = float(len(self._docweight))
     tfmax = 1.0 + self.K1
     sum = 0
     for t in self._remove_oov_wids(wids):
         idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
         sum += scaled_int(idf * tfmax)
     return sum
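
Here the query weight is an upper bound rather than a norm: TF(D, t) never exceeds 1 + K1, so summing scaled_int(idf * tfmax) over the query terms bounds any document's score. A float sketch, assuming idf = ln(1 + N/ft) and K1 = 1.2:

    import math

    K1 = 1.2   # assumed default
    N = 6.0    # illustrative corpus size
    tfmax = 1.0 + K1
    # document frequencies of two hypothetical query terms
    bound = sum(math.log(1.0 + N / ft) * tfmax for ft in (1, 3))
    print(round(bound, 2))  # ≈ 6.7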
Example No. 15
 def query_weight(self, terms):
     # Get the wids.
     wids = []
     for term in terms:
         termwids = self._lexicon.termToWordIds(term)
         wids.extend(termwids)
     # The max score for term t is the maximum value of
     #     TF(D, t) * IDF(Q, t)
     # We can compute IDF directly, and as noted in the comments below
     # TF(D, t) is bounded above by 1+K1.
     N = float(len(self._docweight))
     tfmax = 1.0 + self.K1
     sum = 0
     for t in self._remove_oov_wids(wids):
         idf = inverse_doc_frequency(len(self._wordinfo[t]), N)
         sum += scaled_int(idf * tfmax)
     return sum
Example No. 16
 def _get_wt(self, t):
     wid, = self._lexicon.termToWordIds(t)
     map = self._wordinfo[wid]
     return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
Example No. 17
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError('{0} != {1}'.format(scaled1, scaled2))
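
eq treats two scaled integers as equal when they differ by at most scaled_int(0.01), i.e. roughly one hundredth in unscaled terms. A usage sketch, again assuming SCALE_FACTOR = 1024:

    SCALE_FACTOR = 1024.0  # assumed scale

    def scaled_int(f):
        return int(f * SCALE_FACTOR + 0.5)

    def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
        if abs(scaled1 - scaled2) > epsilon:
            raise AssertionError('{0} != {1}'.format(scaled1, scaled2))

    eq(scaled_int(1.39), scaled_int(1.386))  # passes: |1423 - 1419| <= 10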
Example No. 18
 def _get_wt(self, t):
     wid, = self._lexicon.termToWordIds(t)
     map = self._wordinfo[wid]
     return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
Example No. 19
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError("%s != %s" % (scaled1, scaled2))
Example No. 20
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    if abs(scaled1 - scaled2) > epsilon:
        raise AssertionError("%s != %s" % (scaled1, scaled2))