Exemplo n.º 1
0
    def testConstructor(self):
        """Check NBest construction: bad capacities raise, good ones start empty.

        Capacities 0 and -1 must raise ValueError.  For every capacity
        from 1 through 10 the new object must report a length of 0 and
        echo the requested capacity.
        """
        for bad_capacity in (0, -1):
            self.assertRaises(ValueError, NBest, bad_capacity)

        for capacity in range(1, 11):
            fresh = NBest(capacity)
            self.assertEqual(len(fresh), 0)
            self.assertEqual(fresh.capacity(), capacity)
Exemplo n.º 2
0
    def testConstructor(self):
        """Check NBest construction: bad capacities raise, good ones start empty.

        Capacities 0 and -1 must raise ValueError.  For every capacity
        from 1 through 10 the new object must report a length of 0 and
        echo the requested capacity.
        """
        for bad_capacity in (0, -1):
            self.assertRaises(ValueError, NBest, bad_capacity)

        for capacity in range(1, 11):
            fresh = NBest(capacity)
            self.assertEqual(len(fresh), 0)
            self.assertEqual(fresh.capacity(), capacity)
Exemplo n.º 3
0
 def query(self, query, nbest=10):
     """Parse and execute *query*; return the pair ``(best, total)``.

     ``best`` is what an ``NBest(nbest)`` chooser reports via
     ``getbest()`` after being fed the items of the query results;
     ``total`` is the number of results before trimming.  When the
     executed query yields ``None`` the pair ``([], 0)`` is returned.
     """
     tree = QueryParser(self.lexicon).parseQuery(query)
     hits = tree.executeQuery(self.index)
     if hits is None:
         return [], 0
     top = NBest(nbest)
     top.addmany(hits.items())
     return top.getbest(), len(hits)
Exemplo n.º 4
0
 def query(self, query, nbest=10):
     """Parse and execute *query*; return the pair ``(best, total)``.

     ``best`` is what an ``NBest(nbest)`` chooser reports via
     ``getbest()`` after being fed the items of the query results;
     ``total`` is the number of results before trimming.  When the
     executed query yields ``None`` the pair ``([], 0)`` is returned.
     """
     tree = QueryParser(self.lexicon).parseQuery(query)
     hits = tree.executeQuery(self.index)
     if hits is None:
         return [], 0
     top = NBest(nbest)
     top.addmany(hits.items())
     return top.getbest(), len(hits)
Exemplo n.º 5
0
def main(rt):
    index = rt["index"]
    files = rt["files"]
    times = {}
    ITERS = range(50)
    for i in range(11):
        for q in QUERIES:
            terms = q.split()
            for c in " OR ", " AND ":
                query = c.join(terms)
                t0 = clock()
                if TEXTINDEX:
                    if c == " OR ":
                        op = Or
                    else:
                        op = And
                    _q = " ".join(terms)
                    for _ in ITERS:
                        b = index.query(_q, op).bucket()
                        num = len(b)
                        chooser = NBest(10)
                        chooser.addmany(b.items())
                        results = chooser.getbest()

                else:
                    try:
                        for _ in ITERS:
                            results, num = index.query(query)
                    except:
                        continue
                t1 = clock()
                print "<p>Query: \"%s\"" % query
                print "<br>Num results: %d" % num
                print "<br>time.clock(): %s" % (t1 - t0)
                key = query
                if i == 0:
                    print "<ol>"
                    for docid, score in results:
                        url = path2url(files[docid])
                        fmt = '<li><a href="%s">%s</A> score = %s'
                        print fmt % (url, url, score)
                    print "</ol>"
                    continue
                l = times.setdefault(key, [])
                l.append(t1 - t0)

    l = times.keys()
    l.sort()
    print "<hr>"
    for k in l:
        v = times[k]
        print "<p>Query: \"%s\"" % k
        print "<br>Min time: %s" % min(v)
        print "<br>All times: %s" % " ".join(map(str, v))
Exemplo n.º 6
0
def main(rt):
    index = rt["index"]
    files = rt["files"]
    times = {}
    ITERS = range(50)
    for i in range(11):
        for q in QUERIES:
            terms = q.split()
            for c in " OR ", " AND ":
                query = c.join(terms)
                t0 = clock()
                if TEXTINDEX:
                    if c == " OR ":
                        op = Or
                    else:
                        op = And
                    _q = " ".join(terms)
                    for _ in ITERS:
                        b = index.query(_q, op).bucket()
                        num = len(b)
                        chooser = NBest(10)
                        chooser.addmany(b.items())
                        results = chooser.getbest()

                else:
                    try:
                        for _ in ITERS:
                            results, num = index.query(query)
                    except:
                        continue
                t1 = clock()
                print "<p>Query: \"%s\"" % query
                print "<br>Num results: %d" % num
                print "<br>time.clock(): %s" % (t1 - t0)
                key = query
                if i == 0:
                    print "<ol>"
                    for docid, score in results:
                        url = path2url(files[docid])
                        fmt = '<li><a href="%s">%s</A> score = %s'
                        print fmt % (url, url, score)
                    print "</ol>"
                    continue
                l = times.setdefault(key, [])
                l.append(t1 - t0)

    l = times.keys()
    l.sort()
    print "<hr>"
    for k in l:
        v = times[k]
        print "<p>Query: \"%s\"" % k
        print "<br>Min time: %s" % min(v)
        print "<br>All times: %s" % " ".join(map(str, v))
Exemplo n.º 7
0
    def query(self, query, nbest=10):
        """Run *query* and return the pair ``(best results, total count)``.

        The first element is what an ``NBest(nbest)`` chooser returns
        from ``getbest()`` after receiving all result items; the second
        is the number of results before that trimming.  A query whose
        execution yields ``None`` produces ``([], 0)``.
        """
        parser = QueryParser(self.getLexicon())
        parse_tree = parser.parseQuery(query)
        matches = parse_tree.executeQuery(self.index)
        if matches is None:
            return [], 0
        selector = NBest(nbest)
        selector.addmany(matches.items())
        return selector.getbest(), len(matches)
Exemplo n.º 8
0
    def query(self, query, nbest=10):
        """Run *query* and return the pair ``(best results, total count)``.

        The first element is what an ``NBest(nbest)`` chooser returns
        from ``getbest()`` after receiving all result items; the second
        is the number of results before that trimming.  A query whose
        execution yields ``None`` produces ``([], 0)``.
        """
        parser = QueryParser(self.getLexicon())
        parse_tree = parser.parseQuery(query)
        matches = parse_tree.executeQuery(self.index)
        if matches is None:
            return [], 0
        selector = NBest(nbest)
        selector.addmany(matches.items())
        return selector.getbest(), len(matches)
Exemplo n.º 9
0
def mass_weightedUnion(l_):
    """A list of (mapping, weight) pairs -> their weightedUnion IIBucket.

    Fewer than two pairs are handed straight to ``_trivial``.  Otherwise
    the pairs are queued by mapping size and the two smallest are merged
    repeatedly with ``weightedUnion`` until a single mapping remains, so
    the unions stay as balanced as possible.
    """
    if len(l_) < 2:
        return _trivial(l_)

    # Queue keyed on size; it is large enough to hold every input pair.
    pending = NBest(len(l_))
    for mapping, weight in l_:
        pending.add((mapping, weight), len(mapping))

    while len(pending) > 1:
        # Take the two smallest entries, merge them, and requeue the result.
        (first, first_weight), _size = pending.pop_smallest()
        (second, second_weight), _size = pending.pop_smallest()
        _weight, merged = weightedUnion(first, second,
                                        first_weight, second_weight)
        # The weights were applied by the union, so the result carries 1.
        pending.add((merged, 1), len(merged))

    (result, _weight), _size = pending.pop_smallest()
    return result
Exemplo n.º 10
0
def mass_weightedUnion(L):
    """A list of (mapping, weight) pairs -> their weightedUnion IIBucket.

    Fewer than two pairs are handed straight to ``_trivial``.  Otherwise
    the pairs are queued by mapping size and the two smallest are merged
    repeatedly with ``weightedUnion`` until a single mapping remains, so
    the unions stay as balanced as possible.
    """
    if len(L) < 2:
        return _trivial(L)

    # Queue keyed on size; it is large enough to hold every input pair.
    pending = NBest(len(L))
    for mapping, weight in L:
        pending.add((mapping, weight), len(mapping))

    while len(pending) > 1:
        # Take the two smallest entries, merge them, and requeue the result.
        (first, first_weight), _size = pending.pop_smallest()
        (second, second_weight), _size = pending.pop_smallest()
        _weight, merged = weightedUnion(first, second,
                                        first_weight, second_weight)
        # The weights were applied by the union, so the result carries 1.
        pending.add((merged, 1), len(merged))

    (result, _weight), _size = pending.pop_smallest()
    return result
Exemplo n.º 11
0
    def cosine_ranking(self, index, hits=250):
        """Score the found documents against the query terms; keep the best.

        For each query term an inverse document frequency
        ``log(1 + N / df)`` is computed, where ``N`` is ``len(index)`` and
        ``df`` is the number of docids stored for the term.  Every docid
        from ``self.docIds()`` then accumulates ``qtw * dtw`` over the
        terms it contains.  The sum is averaged over the number of query
        terms and scaled to a rounded integer (factor 1000).  The
        best-scoring documents, at most ``hits`` of them, are stored in
        ``self._result`` as an ``IIBTree`` mapping docid -> score.

        index -- object providing ``getLexicon()``, ``getStorage()`` and
                 ``len()``
        hits  -- capacity of the NBest selection that trims the result
        """

        IDF = {}                # mapping term -> inverse document frequency
        cache = {}              # mapping term -> found docids
        wid_cache = {}          # mapping term -> wid
        N = len(index)          # length of collection
        nbest = NBest(hits)

        # Look these up once instead of once per term/document.
        words = self.words()    # mapping term -> factor applied to the
                                # query term weight below
        terms = list(words.keys())
        num_terms = len(terms)
        lexicon = index.getLexicon()
        storage = index.getStorage()
        get_frequency = storage.getWordFrequency

        for term in terms:
            wid_cache[term] = wid = lexicon.getWordId(term)
            docids = storage.getDocumentIdsForWordId(wid)
            cache[term] = docids

            # document frequency = number of documents the term appears in
            df = len(docids)

            # Inverse document frequency: log(1 + N/df).  float() forces
            # true division; on Python 2 two ints would otherwise be
            # floor-divided and the ratio truncated.
            if df == 0:
                IDF[term] = 0
            else:
                IDF[term] = log(1.0 + float(N) / df)

        for docid in self.docIds():   # iterate over all found documents

            rank = 0.0                # ranking
            total_dwt = 0.0           # document weight

            for term in terms:
                if docid not in cache[term]:
                    continue

                # document term frequency = the number of times a term
                # appears within a particular document
                try:
                    dtf = get_frequency(docid, wid_cache[term])
                except KeyError:
                    continue

                idf = IDF[term]

                # document term weight = the weight of the term within
                # the document
                dtw = (1.0 + log(dtf)) * idf

                # query term frequency and query max frequency are set
                # to 1 by default
                qtf = qmf = 1

                # query term weight = the weight given to the term in
                # the query
                qtw = (0.5 + (0.5 * qtf / qmf)) * idf * words[term]

                rank += qtw * dtw
                total_dwt += dtw * dtw

            total_dwt = sqrt(total_dwt)
            if total_dwt == 0:
                rank = 0
            else:
                # Normalizing by the document weight (rank / total_dwt)
                # is disabled here; the rank is averaged over the number
                # of query terms instead.
                rank = rank / num_terms
                rank = int(rank * 1000 + 0.5)   # scale rank to be an integer

            nbest.add(docid, rank)

        self._result = IIBTree()
        for docid, score in nbest.getbest():
            self._result[docid] = score
Exemplo n.º 12
0
    def testMany(self):
        """Feed 50 scored pairs in several orders; the n best must survive.

        The pairs are ``(-i, i)`` for i in 0..49, supplied in ascending,
        descending and shuffled order, both one ``add()`` at a time and
        through a single ``addmany()`` call.  ``getbest()`` must list the
        n highest scores, highest first, and ``pop_smallest()`` must hand
        them back lowest first before raising IndexError when empty.
        """
        import random
        inputs = [(-score, score) for score in range(50)]
        reversed_inputs = inputs[::-1]

        # n takes the values 1, 6, 11, ..., 46.
        for n in range(1, len(inputs) + 1, 5):
            expected = inputs[-n:][::-1]

            random_inputs = list(inputs)
            random.shuffle(random_inputs)

            for source in (inputs, reversed_inputs, random_inputs):
                # One pair per add() call.
                one_by_one = NBest(n)
                for item, score in source:
                    one_by_one.add(item, score)
                self.assertEqual(len(one_by_one), n)
                self.assertEqual(one_by_one.capacity(), n)
                self.assertEqual(one_by_one.getbest(), expected)

                # The whole sequence through addmany().
                bulk = NBest(n)
                bulk.addmany(source)
                self.assertEqual(len(bulk), n)
                self.assertEqual(bulk.capacity(), n)
                self.assertEqual(bulk.getbest(), expected)

                # Draining must walk the expected list from its tail.
                for wanted in expected[::-1]:
                    self.assertEqual(bulk.pop_smallest(), wanted)
                self.assertRaises(IndexError, bulk.pop_smallest)
Exemplo n.º 13
0
    def testOne(self):
        """An NBest of capacity 1 must always hold only the top-scoring item."""
        keeper = NBest(1)

        def check_sole_entry(item, score):
            # Size and capacity stay at 1; the single entry is (item, score).
            self.assertEqual(len(keeper), 1)
            self.assertEqual(keeper.capacity(), 1)
            self.assertEqual(keeper.getbest(), [(item, score)])

        keeper.add('a', 0)
        self.assertEqual(keeper.getbest(), [('a', 0)])

        # A higher score replaces the current entry.
        keeper.add('b', 1)
        check_sole_entry('b', 1)

        # A lower score is ignored.
        keeper.add('c', -1)
        check_sole_entry('b', 1)

        # From a batch, only the highest score survives.
        keeper.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        check_sole_entry('f', 5)
Exemplo n.º 14
0
    def testMany(self):
        """Feed 50 scored pairs in several orders; the n best must survive.

        The pairs are ``(-i, i)`` for i in 0..49, supplied in ascending,
        descending and shuffled order, both one ``add()`` at a time and
        through a single ``addmany()`` call.  ``getbest()`` must list the
        n highest scores, highest first, and ``pop_smallest()`` must hand
        them back lowest first before raising IndexError when empty.
        """
        import random
        inputs = [(-score, score) for score in range(50)]
        reversed_inputs = inputs[::-1]

        # n takes the values 1, 6, 11, ..., 46.
        for n in range(1, len(inputs)+1, 5):
            expected = inputs[-n:][::-1]

            random_inputs = list(inputs)
            random.shuffle(random_inputs)

            for source in (inputs, reversed_inputs, random_inputs):
                # One pair per add() call.
                one_by_one = NBest(n)
                for item, score in source:
                    one_by_one.add(item, score)
                self.assertEqual(len(one_by_one), n)
                self.assertEqual(one_by_one.capacity(), n)
                self.assertEqual(one_by_one.getbest(), expected)

                # The whole sequence through addmany().
                bulk = NBest(n)
                bulk.addmany(source)
                self.assertEqual(len(bulk), n)
                self.assertEqual(bulk.capacity(), n)
                self.assertEqual(bulk.getbest(), expected)

                # Draining must walk the expected list from its tail.
                for wanted in expected[::-1]:
                    self.assertEqual(bulk.pop_smallest(), wanted)
                self.assertRaises(IndexError, bulk.pop_smallest)
Exemplo n.º 15
0
    def testOne(self):
        """An NBest of capacity 1 must always hold only the top-scoring item."""
        keeper = NBest(1)

        def check_sole_entry(item, score):
            # Size and capacity stay at 1; the single entry is (item, score).
            self.assertEqual(len(keeper), 1)
            self.assertEqual(keeper.capacity(), 1)
            self.assertEqual(keeper.getbest(), [(item, score)])

        keeper.add('a', 0)
        self.assertEqual(keeper.getbest(), [('a', 0)])

        # A higher score replaces the current entry.
        keeper.add('b', 1)
        check_sole_entry('b', 1)

        # A lower score is ignored.
        keeper.add('c', -1)
        check_sole_entry('b', 1)

        # From a batch, only the highest score survives.
        keeper.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        check_sole_entry('f', 5)