Exemplo n.º 1
0
def test_cosine_docv_len():
    """CosineScoring._docv_len: the document-vector length is the Euclidean
    (L2) norm of the per-term tf-idf weights."""
    vocabulary = {
        term: {'df': df, 'termid': termid}
        for term, df, termid in [
            ('xx', 10, 0),
            ('yy', 15, 4),
            ('zz', 30, 100),
            ('tt', 45, 101),
            ('kk', 10, 1000),
        ]
    }
    D = 10  # total number of documents in the collection
    docv = [('xx', 1), ('yy', 3), ('zz', 5), ('tt', 2)]
    scorer = CosineScoring()
    dl = scorer._docv_len(docv, vocabulary, D)

    # Expected: sqrt of the sum of squared tf-idf weights, one per term in
    # docv, looking each term's df up in the vocabulary.
    squared_weights = [
        scorer._tfidf(D, vocabulary[term]['df'], tf)**2 for term, tf in docv
    ]
    expected = math.sqrt(sum(squared_weights))

    assert dl == expected
Exemplo n.º 2
0
def test_cosine_score():
    """CosineScoring.score: each document's score is the tf-idf dot product
    of the query and document term weights, divided by the document length."""

    def idf(df, N):
        # Inverse document frequency, base-10 logarithm.
        return math.log(N / df, 10)

    # Each row: [docid, (N, doc_len, (df, tf) pairs ...)]; the first row
    # (docid None) appears to carry the query's term frequencies — the
    # expected values below multiply its tf against each document's tf.
    pl = [
        [None, (100, None, (20, 1), (11, 1), (20, 1), (4, 2))],
        [0, (100, 20, (20, 0), (11, 15), (20, 0), (4, 1))],
        [2, (100, 10, (20, 1), (11, 0), (20, 0), (4, 20))],
        [4, (100, 100, (20, 0), (11, 0), (20, 5), (4, 7))],
    ]

    expected_scores = [
        [
            2,
            ((1 * 1 * idf(20, 100) * idf(20, 100)) + 0 + 0 +
             (2 * 20 * idf(4, 100) * idf(4, 100))) / 10
        ],
        [
            0,
            (0 + (15 * 1 * idf(11, 100) * idf(11, 100)) + 0 +
             (1 * 2 * idf(4, 100) * idf(4, 100))) / 20
        ],
        [
            4,
            (0 + 0 + (1 * 5 * idf(20, 100) * idf(20, 100)) +
             (2 * 7 * idf(4, 100) * idf(4, 100))) / 100
        ],
    ]

    t = CosineScoring()
    scores = t.score(pl)

    assert len(scores) == len(expected_scores)
    for a, b in zip(expected_scores, scores):
        # Fix: this was a bare `len(a) == len(b)` expression — the result was
        # discarded, so mismatched lengths were never detected.
        assert len(a) == len(b)
        for c, d in zip(a, b):
            assert c == d
Exemplo n.º 3
0
def test__merge_pl1():
    """Retrieval._merge_indexes with several matched terms: per-document
    (termid, tf) pairs are collected and position lists are merged."""
    indexes = [
        {'termid': 1, 'pl': [[0, 1, 11], [1, 1, 1]]},
        {'termid': 3, 'pl': [[0, 1, 14], [1, 1, 2]]},
        {'termid': 0, 'pl': [[0, 2, 10, 15], [1, 1, 3]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]

    retrieval = Retrieval(db=db,
                          indexing=Indexing(db=db),
                          preprocessing=Preprocessing(),
                          ranking=CosineScoring())
    merged_indexes, docids = retrieval._merge_indexes(indexes)
    # Sort by docid so the element-wise comparison below is deterministic.
    merged_indexes = sorted(merged_indexes, key=lambda entry: entry[0])

    expected_merged_indexes = [
        [0, [(0, 2), (1, 1), (2, 1), (3, 1)], [10, 11, 12, 14, 15]],
        [1, [(0, 1), (1, 1), (2, 1), (3, 1)], [1, 2, 3, 12]],
    ]

    assert len(merged_indexes) == len(expected_merged_indexes)
    for got, want in zip(merged_indexes, expected_merged_indexes):
        assert len(got) == len(want)
        for got_part, want_part in zip(got, want):
            assert got_part == want_part
Exemplo n.º 4
0
	def __init__ (self, db=None, dbname='market', max_queue=100, max_delete_cache=100, max_update_wait_time=300, max_delete_wait_time=300):
		'''
		Set up parameters, wire the indexing/retrieval pipeline, and index
		documents left over from the last queue and cache.

		::param db:: database handle shared by all pipeline components
		::param dbname:: NOTE(review): accepted but never used in this body — confirm whether it should be stored or passed to the db layer
		::param max_queue:: max number of docids in the queue
		::param max_delete_cache:: max number of docids in the cache
		::param max_update_wait_time:: max waiting time in seconds before update documents with docids in queue
		::param max_delete_wait_time:: max waiting time in seconds before delete documents with docids in the delete cache.
		'''
		self.max_queue = max_queue
		self.max_delete_cache = max_delete_cache
		self.max_update_wait_time = max_update_wait_time
		self.max_delete_wait_time = max_delete_wait_time
		self.db = db
		self._create_cache ()
		# Build the pipeline: preprocessing feeds indexing, which (with the
		# ranking strategy) feeds retrieval.
		self._ranking = CosineScoring ()
		self._preprocessing = Preprocessing () 		
		self._indexing = Indexing (db=db, preprocessing=self._preprocessing) 
		self._retrieval = Retrieval (db=db, preprocessing=self._preprocessing, ranking=self._ranking, indexing=self._indexing)
		# Index whatever was pending when the previous run stopped.
		self.index ()
Exemplo n.º 5
0
def test_rank():
    """Retrieval._rank: documents are reordered by their scores, highest
    score first (docid 3 scores 5, docid 1 scores 4, docid 7 scores 1)."""
    docs = [[1, 'xx xx'], [3, 'yy xy'], [7, 'xx zz']]

    # [docid, score] pairs, not necessarily in docs order.
    scores = [[3, 5], [1, 4], [7, 1]]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    retrieval = Retrieval(db=db,
                          indexing=indexing,
                          preprocessing=preprocessing,
                          ranking=ranking)

    ranked_docs = retrieval._rank(docs, scores)

    exp_ranked_docs = [[3, 'yy xy'], [1, 'xx xx'], [7, 'xx zz']]

    # Fix: the original loop reused the name `b` for both the Retrieval
    # instance and the zip loop variable, shadowing the former; distinct
    # names make the test unambiguous.
    assert len(ranked_docs) == len(exp_ranked_docs)
    for got, want in zip(ranked_docs, exp_ranked_docs):
        assert len(got) == len(want)
        for got_item, want_item in zip(got, want):
            assert got_item == want_item
Exemplo n.º 6
0
def test_retrieve():
    """End-to-end retrieve: index a tiny collection, query it, and check the
    documents come back ranked as expected. Collections are always dropped
    afterwards so reruns start clean."""
    q = 'tt yy zz'

    collection = ['xx yy zz. xx tt.', 'yy yy zz. zz tt kk.', 'kk gh mk']
    db.content_coll.insert_many(collection)
    collection = list(db.content_coll.find_many())
    collection = [[d['_id'], d['content'], 0] for d in collection]

    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing = Indexing(db=db, preprocessing=preprocessing)
    indexing.index(collection)
    retrieval = Retrieval(db=db,
                          indexing=indexing,
                          preprocessing=preprocessing,
                          ranking=ranking)

    try:
        ranked_docs = retrieval.retrieve(q)
        ranked_docids = [d[0] for d in ranked_docs]
        # Document 1 matches the query best, then document 0; document 2
        # shares no query term and must not appear.
        expected_docids = [collection[1][0], collection[0][0]]

        assert len(ranked_docids) == len(expected_docids)
        for got, want in zip(ranked_docids, expected_docids):
            assert got == want
    except AssertionError:
        raise
    except Exception as ex:
        print(ex)
        assert False
    finally:
        # Fix: cleanup was duplicated in the success and failure paths;
        # `finally` guarantees it runs exactly once either way.
        db.index_coll.drop()
        db.vocabulary_coll.drop()
        db.contentvectors_coll.drop()
        db.content_coll.drop()
Exemplo n.º 7
0
def test__merge_pl0():
    """Retrieval._merge_indexes with a single term: the merged entry carries
    that term's postings and positions through unchanged."""
    indexes = [
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]

    retrieval = Retrieval(db=db,
                          indexing=Indexing(db=db),
                          preprocessing=Preprocessing(),
                          ranking=CosineScoring())
    # Merge only the first term's index (termid 1).
    merged_indexes, docids = retrieval._merge_indexes([indexes[0]])

    expected_merged_indexes = [[0, (1, 1), [11]]]
    assert len(merged_indexes) == len(expected_merged_indexes)
    for got, want in zip(merged_indexes, expected_merged_indexes):
        assert len(got) == len(want)
        for got_part, want_part in zip(got, want):
            assert got_part == want_part
Exemplo n.º 8
0
def test_cosine_create_scoring_data():
    '''
    CosineScoring.create_scoring_data: build per-document scoring rows
    (docid, N, doc_len, (df, tf) pairs) from the posting lists and the
    vocabulary. Assume docv_len is calculated correctly, and thus not
    being tested.
    '''

    vocabulary = {
        'xx': {
            'df': 10,
            'termid': 0
        },
        'yy': {
            'df': 15,
            'termid': 4
        },
        'zz': {
            'df': 30,
            'termid': 100
        },
        'tt': {
            'df': 45,
            'termid': 101
        },
        'kk': {
            'df': 10,
            'termid': 1000
        }
    }

    # Unknown terms fall back to df 0 / no termid.
    vocabulary = defaultdict(lambda: {'df': 0, 'termid': None}, vocabulary)

    # Local stubs that shadow the project's Indexing/Retrieval classes:
    # only the attributes create_scoring_data reads are provided.
    class Indexing:
        def __init__(self, vocabulary):
            self._vocabulary = vocabulary

        def get_vocabulary(self):
            return self._vocabulary

        def get_doc_vectors(self, docid):
            # Canned document vectors: (termid, tf) pairs per docid.
            return [
                {
                    'docid': 0,
                    'tf': [(0, 1), (4, 3), (100, 5), (101, 2)]
                },
                {
                    'docid': 1,
                    'tf': [(0, 10), (4, 1), (100, 2), (101, 4)]
                },
                {
                    'docid': 2,
                    'tf': [(0, 1), (4, 1), (100, 1), (101, 3)]
                },
            ]

    class Retrieval:
        def __init__(self, indexing=None):
            self.D = 10  # collection size used for idf
            self.indexing = indexing

    # Merged posting lists: [docid, [(termid, tf), ...], positions].
    pl = [
        [0, [(0, 1), (4, 5), (100, 2), (101, 4)], []],
        [1, [(0, 2), (4, 2), (101, 1)], []],
        [2, [(4, 1), (100, 1), (101, 2)], []],
    ]

    # Test: all terms query in vocabulary.

    tokens = ['yy', 'xx', 'zz', 'tt']
    i = Indexing(vocabulary)
    r = Retrieval(indexing=i)
    t = CosineScoring()
    scoring_data = t.create_scoring_data(r, pl, tokens)

    expected_score_data = [  # the scoring is incorrect order. But doing so makes it easier to tests.
        [None, 10, None, (10, 1), (15, 1), (30, 1), (45, 1)],
        [0, 10, 10, (10, 1), (15, 5), (30, 2), (45, 4)],
        [1, 10, 20, (1, 0), (10, 2), (15, 2), (45, 1)],
        [2, 10, 30, (1, 0), (15, 1), (30, 1), (45, 2)],
    ]

    # sort for testing: (df, tf) pairs from index 3 onward are order-insensitive.
    for d in scoring_data:
        d[3:] = sorted(d[3:], key=lambda x: x[0])

    assert len(scoring_data) == len(expected_score_data)
    for a, b in zip(scoring_data, expected_score_data):
        assert len(a) == len(b)
        # Compare docid and N; index 2 (doc_len) is deliberately skipped —
        # presumably because it is computed by _docv_len, which this test
        # assumes correct (see docstring).
        for c, d in zip(a[:2], b[:2]):
            assert c == d

        # Compare the (df, tf) pairs element-wise.
        for c, d in zip(a[3:], b[3:]):
            assert len(c) == len(d)
            for e, f in zip(c, d):
                assert e == f
Exemplo n.º 9
0
def test__fetch_indexes():
    '''
    Retrieval._fetch_indexes: fetch the posting lists for tokens that all
    exist in the vocabulary. Seeds the vocabulary and index collections,
    then always drops them so reruns start clean.
    '''

    vocabulary = [
        {'termid': 0, 'term': 'xx', 'df': 1},
        {'termid': 1, 'term': 'yy', 'df': 1},
        {'termid': 2, 'term': 'zz', 'df': 1},
        {'termid': 3, 'term': 'tt', 'df': 1},
    ]

    index = [
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
    ]

    tokens = ['xx', 'yy', 'zz', 'tt']

    # Note: expected in fetch order (1, 3, 0, 2), not termid order.
    expected_pl = [
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing._vocabulary_coll.insert_many(vocabulary)
    indexing._index_coll.insert_many(index)
    indexing.create_cache()
    r = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)

    try:
        pl = r._fetch_indexes(tokens)

        assert len(pl) == len(expected_pl)
        for a, b in zip(pl, expected_pl):
            assert a['termid'] == b['termid']
            assert len(a['pl']) == len(b['pl'])
            for c, d in zip(a['pl'], b['pl']):
                assert len(c) == len(d)
                for e, f in zip(c, d):
                    assert e == f
    except AssertionError:
        raise
    except Exception as ex:
        print(ex)
        assert False
    finally:
        # Fix: cleanup was duplicated in the except and else branches;
        # `finally` guarantees the drops run exactly once on every path.
        indexing._vocabulary_coll.drop()
        indexing._index_coll.drop()