예제 #1
0
def test_class_invertedindex_build_inverted_index_and_dump_load_16(tmp_path):
    """
        Check writing and then rewriting the inverted index file.

        Two indexes are built with the same stop words and written, one after
        the other, to the same file through ``rewriting``. The mapping loaded
        back from that file is expected to hold document ids of both data sets.
        :param tmp_path: temporary directory in which the index file is created
    """
    index_file = tmp_path / 'temp'
    shared_stop_words = {'hi', 'cant', 'lol'}

    # First data set is built and written.
    first_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words=shared_stop_words,
        words=[
            {'test', 'null', 'hi'},
            {'test', 'one', 'lol'},
            {'test', 'two', 'cant'},
        ],
    )
    first_index.rewriting(filepath=index_file)

    # Second data set is built and written to the very same file.
    second_index = inverted_index.build_inverted_index(
        indexs=[15, 25],
        stop_words=shared_stop_words,
        words=[{'one'}, {'two'}],
    )
    second_index.rewriting(filepath=index_file)

    loaded_index = inverted_index.InvertedIndex.load(filepath=index_file)

    expected_mapping = {
        'test': {0, 1, 3},
        'null': {0},
        'one': {1, 15},
        'two': {3, 25},
    }
    assert loaded_index.word_to_docs_mapping == expected_mapping
예제 #2
0
def test_not_number_doc_where_lot_documents_26(creat_not_corect_data):
    """Compare the mapping built from ``creat_not_corect_data`` with a reference.

    The reference mapping uses ``None`` as the document id for several words.
    """
    doc_ids, doc_words, ignored_words = creat_not_corect_data

    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)
    actual_mapping = built_index.word_to_docs_mapping

    # Words grouped by the single document id they are expected to map to.
    words_in_doc_4 = ('sit', 'window', 'make', 'program')
    words_in_doc_5 = ('third', 'about', 'something')
    words_with_none_id = ('sasha', 'porridge', 'little', 'was', 'delicious',
                          'watching', 'south', 'park', 'now')

    expected_mapping = {word: {4} for word in words_in_doc_4}
    expected_mapping.update((word, {5}) for word in words_in_doc_5)
    expected_mapping.update((word, {None}) for word in words_with_none_id)
    # 'test' is the only word expected in more than one document.
    expected_mapping['test'] = {None, 5, 4}

    assert expected_mapping == actual_mapping
def test_build_inverted_index_working_right(tiny_dataset_documents):
    """Check ``term2doc`` of the index built from ``tiny_dataset_documents``."""
    failure_message = "build_inverted_index build incorrect"
    built_index = build_inverted_index(tiny_dataset_documents)

    # Expected posting lists; the values are lists, so element order matters.
    reference_term2doc = {
        'same': [123, 2],
        'and': [123, 37],
        'nothing': [123],
        'words': [123, 2, 37],
        'in': [2],
        'this': [2],
        'to': [5],
        'be': [5],
        'or': [5],
        'not': [5],
        'all': [37],
        'such': [37],
        'as': [37],
        'are': [37],
        'here': [37],
        'A_word': [123, 37],
        'B_word': [2, 37],
        'dataset': [2],
        'famous_phrases': [5],
    }

    assert built_index.term2doc == reference_term2doc, failure_message
예제 #4
0
def creat_inverted_index(creat_index_and_words_with_temp_file,
                         creat_stop_words_wher_load_stop_words_with_fiel):
    """Build and return an inverted index from the two argument values.

    :param creat_index_and_words_with_temp_file: pair unpacked as
        (document ids, document words)
    :param creat_stop_words_wher_load_stop_words_with_fiel: stop words
        forwarded to ``build_inverted_index``
    :return: result of ``inverted_index.build_inverted_index``
    """
    doc_ids, doc_words = creat_index_and_words_with_temp_file
    build_arguments = {
        'indexs': doc_ids,
        'words': doc_words,
        'stop_words': creat_stop_words_wher_load_stop_words_with_fiel,
    }
    return inverted_index.build_inverted_index(**build_arguments)
예제 #5
0
def test_all_22(tmp_path, tmpdir):
    """
        End-to-end check: load documents and stop words, build the inverted
        index, dump it, load it back and query a word.
        :param tmp_path: temporary directory (not used in the body)
        :param tmpdir: temporary directory in which the test files are created
    """
    # Temporary documents file: one "<id>\t<text>" record per line.
    documents_file = tmpdir.join('datatest.txt')
    documents_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan, two\n12\tTrest!!! best wreit!')

    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('Test\ntest\nnumber\nTe\ntext\nnumber2')

    index_file = tmpdir.join('inverted.index')

    doc_ids, doc_words = inverted_index.load_documents(documents_file)
    ignored_words = inverted_index.load_stop_words(stop_words_file)

    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)
    # The index is written to disk (the original comment says as JSON).
    built_index.dump(index_file)

    reloaded_index = inverted_index.InvertedIndex.load(index_file)
    found_ids = reloaded_index.query(["two"])
    expected_ids = {1, 3}
    assert expected_ids == found_ids
예제 #6
0
def test_not_number_doc_where_lot_documents_with_two_doc_have_not_number_27(
        creat_not_corect_data, tmp_path):
    """Dump and reload an index built from ``creat_not_corect_data``.

    The reloaded mapping is compared with a reference in which several words
    map to the document id ``None``.
    """
    doc_ids, doc_words, ignored_words = creat_not_corect_data
    storage_path = tmp_path / 'tmp'

    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)
    built_index.dump(filepath=storage_path)

    reloaded_index = inverted_index.InvertedIndex.load(storage_path)

    expected_mapping = {
        # present in several documents
        'test': {None, 5, 4},
        # document 4
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        # document 5
        'third': {5},
        'about': {5},
        'something': {5},
        # document id None
        'sasha': {None},
        'porridge': {None},
        'little': {None},
        'was': {None},
        'delicious': {None},
        'watching': {None},
        'south': {None},
        'park': {None},
        'now': {None},
    }
    assert expected_mapping == reloaded_index.word_to_docs_mapping
예제 #7
0
def test_build_inverted_one_doc_have_doc_with_dont_new_index_30(
        create_not_corect_data_with_two_doc_have_one_ndex):
    """Load documents from the fixture file, build the index, check the mapping."""
    documents_file = create_not_corect_data_with_two_doc_have_one_ndex
    doc_ids, doc_words = inverted_index.load_documents(documents_file)
    ignored_words = {'i', 'a', 'am', 'is', 'by', 'and', 'the'}

    built_index = inverted_index.build_inverted_index(
        stop_words=ignored_words, indexs=doc_ids, words=doc_words)

    expected_mapping = {
        # present in every expected document
        'test': {4, 5, 8, 6},
        # document 4
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        # document 5
        'third': {5},
        'about': {5},
        'something': {5},
        # document 8
        'number': {8},
        'tree': {8},
        # document 6
        'now': {6},
        'south': {6},
        'watching': {6},
        'park': {6},
    }
    assert expected_mapping == built_index.word_to_docs_mapping
예제 #8
0
def creat_test_file_for_load_with_him():
    """Return an inverted index built from two small hard-coded documents.

    An empty ``{}`` is passed as ``stop_words``.
    """
    return inverted_index.build_inverted_index(
        indexs=[0, 1],
        words=[{'words', 'name', 'test'}, {'words', 'green', 'man'}],
        stop_words={},
    )
예제 #9
0
def fixture_inverted_index(tmp_path):
    """Return an inverted index built from three hard-coded documents.

    :param tmp_path: accepted but not used in the body
    :return: result of ``inverted_index.build_inverted_index``
    """
    build_arguments = {
        'indexs': [0, 1, 3],
        'stop_words': {'hi', 'cant', 'lol'},
        'words': [
            {'test', 'null', 'hi'},
            {'test', 'one', 'lol'},
            {'test', 'two', 'cant'},
        ],
    }
    return inverted_index.build_inverted_index(**build_arguments)
def test_dump_and_load_index(tmp_path, tiny_sample_document):
    """Dump the index built from ``tiny_sample_document`` and load it back.

    The built index must equal ``TINY_SAMPLE_INV_TABLE`` and the index loaded
    from the dump file.
    """
    work_dir = tmp_path / "tiny_example_dir"
    work_dir.mkdir()
    dump_path = work_dir / "tiny_example.index"

    built_table = build_inverted_index(tiny_sample_document)
    built_table.dump(dump_path)
    assert built_table == TINY_SAMPLE_INV_TABLE

    restored_table = InvertedIndex.load(dump_path)
    assert built_table == restored_table
예제 #11
0
def test_inv_index_query(load_inverted_index, wiki_docs):
    """Query a loaded index and a freshly built one.

    Known words must return at least documents 25 and 290; unknown words must
    return an empty result.
    """
    required_docs = {25, 290}

    # Known words against the index supplied by the fixture.
    known_hits = InvertedIndex.query(load_inverted_index, ['after', 'were'])
    assert required_docs.issubset(known_hits)

    # Words that are not expected to occur anywhere.
    unknown_hits = InvertedIndex.query(load_inverted_index,
                                       ['neizvesnie', 'slova'])
    assert len(unknown_hits) == 0

    # Same known words against an index wrapped around a fresh build.
    fresh_index = InvertedIndex(build_inverted_index(wiki_docs))
    fresh_hits = fresh_index.query(['after', 'were'])
    assert required_docs.issubset(fresh_hits)
예제 #12
0
def test_class_invertedindex_query_with_two_words_19():
    """
        Query ``InvertedIndex.query`` with the same word given twice.
    """
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=[
            {'test', 'null', 'hi'},
            {'test', 'one', 'lol'},
            {'test', 'two', 'cant'},
        ],
    )
    found_ids = built_index.query(['two', 'two'])
    expected_ids = {3}
    assert found_ids == expected_ids
예제 #13
0
def test_class_invertedindex_query_18():
    """
        Check how ``InvertedIndex.query`` looks words up and what it returns.
    """
    documents_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=documents_words,
    )
    # Document 3 is the only one expected for both queried words.
    expected_ids = {3}
    found_ids = built_index.query(['two', 'test'])
    assert found_ids == expected_ids
예제 #14
0
def test_query_not_number_doc_where_lot_documents_with_two_doc_have_not_number_28(
        creat_not_corect_data, tmp_path):
    """Query a dumped-and-reloaded index for a word expected under id ``None``."""
    doc_ids, doc_words, ignored_words = creat_not_corect_data
    storage_path = tmp_path / 'tmp'

    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)
    built_index.dump(filepath=storage_path)

    reloaded_index = inverted_index.InvertedIndex.load(storage_path)
    found_ids = reloaded_index.query(['south'])
    assert found_ids == {None}
def test_query_inverted_index_with_query_file_utf_8():
    """Run every line of the UTF-8 query file against the tiny inverted index.

    The index is built from ``TINY_DATASET_FPATH`` and dumped with
    ``dump_binary`` before querying. The first query line is expected to
    return documents 12 and 25; every later line is expected to return only
    document 25.
    """
    documents = load_documents(TINY_DATASET_FPATH)
    tiny_inverted_index = build_inverted_index(documents)
    tiny_inverted_index.dump_binary(TINY_INVERTED_INDEX_STORE_PATH)
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8, which would garble or reject non-ASCII queries.
    with open(QUERY_FILE_UTF8_FPATH, encoding="utf-8") as q_file:
        for line_number, line in enumerate(q_file, start=1):
            query_words = line.split()
            answer = tiny_inverted_index.query(query_words)
            etalon_answer = [12, 25] if line_number == 1 else [25]
            assert sorted(answer) == sorted(etalon_answer), (
                f"Expected answer is {etalon_answer},but you got {answer}")
예제 #16
0
def test_build_inverted_index11():
    """
        Check ``build_inverted_index`` on correct data.
    """
    ignored_words = {'hi', 'cant', 'lol'}
    documents_words = [
        {'test', 'null', 'hi'},
        {'test', 'one', 'lol'},
        {'test', 'two', 'cant'},
    ]
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3], stop_words=ignored_words, words=documents_words)

    # None of the stop words appears in the expected mapping.
    expected_mapping = {
        'test': {0, 1, 3},
        'null': {0},
        'one': {1},
        'two': {3},
    }
    assert built_index.word_to_docs_mapping == expected_mapping
예제 #17
0
def build_inverted_index_for_creat_data_not_corect(tmpdir):
    """Build an inverted index from files written into ``tmpdir``.

    The documents file has one record whose id field before the tab is empty.
    :param tmpdir: temporary directory in which both files are created
    :return: result of ``inverted_index.build_inverted_index``
    """
    documents_file = tmpdir.join('wiki_doc')
    documents_file.write(
        '\tName Shasha train this program and work with data like!\n'
        '4\tTest name number two and test, i like programming!')

    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('and\ni\n')

    loaded_documents = inverted_index.load_documents(filepath=documents_file)
    loaded_stop_words = inverted_index.load_stop_words(
        filepath=stop_words_file)

    # Position 0 of the load result is passed as the ids, position 1 as the words.
    return inverted_index.build_inverted_index(
        stop_words=loaded_stop_words,
        words=loaded_documents[1],
        indexs=loaded_documents[0])
예제 #18
0
 def test_can_build_inverted_index(
         self, creat_index_and_words_with_temp_file,
         creat_stop_words_wher_load_stop_words_with_fiel):
     """Build an index from the two fixture values and check its mapping."""
     doc_ids, doc_words = creat_index_and_words_with_temp_file
     ignored_words = creat_stop_words_wher_load_stop_words_with_fiel

     built_index = inverted_index.build_inverted_index(
         indexs=doc_ids, words=doc_words, stop_words=ignored_words)

     expected_mapping = {
         # document 0
         'believe': {0},
         'tears': {0},
         # document 1
         'wind': {1},
         'making': {1},
         'noise': {1},
         'head': {1},
         # document 2
         'walking': {2},
         'with': {2},
         'spring': {2},
     }
     assert built_index.word_to_docs_mapping == expected_mapping
예제 #19
0
def test_class_invertedindex_and_build_inverted_index_15(tmp_path):
    """
        Check ``build_inverted_index`` together with ``InvertedIndex.dump``
        and ``InvertedIndex.load``.
        :param tmp_path: temporary directory in which the index file is created
    """
    index_file = tmp_path / 'temp'  # file inside the temporary directory

    # Build the inverted index from document ids, document words and stop words.
    built_index = inverted_index.build_inverted_index(
        indexs=[0, 1, 3],
        stop_words={'hi', 'cant', 'lol'},
        words=[
            {'test', 'null', 'hi'},
            {'test', 'one', 'lol'},
            {'test', 'two', 'cant'},
        ],
    )

    # Write the index out, then read it back from the same file.
    built_index.dump(filepath=index_file)
    reloaded_index = inverted_index.InvertedIndex.load(filepath=index_file)

    loaded_mapping = reloaded_index.word_to_docs_mapping
    assert loaded_mapping == {
        'test': {0, 1, 3},
        'null': {0},
        'one': {1},
        'two': {3},
    }
예제 #20
0
def test_build_inverted_index_not_core_data_12():
    """
        Check how ``build_inverted_index`` handles a document whose index is
        missing (``None``).
    """
    ignored_words = {'he', 'i'}
    document_words = {'name', 'begin', 'where', 'test', 'work', 'he', 'i'}

    built_index = inverted_index.build_inverted_index(
        indexs=[None], words=[document_words], stop_words=ignored_words)

    # Every word that is not a stop word is expected under the id None.
    kept_words = ('name', 'begin', 'where', 'test', 'work')
    expected_mapping = {word: {None} for word in kept_words}
    assert built_index.word_to_docs_mapping == expected_mapping
예제 #21
0
def test_number_and_no_words_in_document_28(creat_doc_have_not_words):
    """Build an index from ``creat_doc_have_not_words`` and check its mapping."""
    doc_ids, doc_words = creat_doc_have_not_words
    ignored_words = {'i', 'a', 'am', 'is', 'by', 'and', 'the'}

    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)

    expected_mapping = {
        # present in all three expected documents
        'test': {4, 5, 6},
        # document 4
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        # document 5
        'third': {5},
        'about': {5},
        'something': {5},
        # document 6
        'watching': {6},
        'south': {6},
        'park': {6},
        'now': {6},
    }
    assert expected_mapping == built_index.word_to_docs_mapping
예제 #22
0
def test_can_query(tmpdir):
    """Query an index built from a four-document dataset written to ``tmpdir``.

    Each query is compared with the exact list of document ids expected for it.
    """
    dataset_text = dedent("""\
        1\thappy cat wow
        2\thappy cat good
        3\tgood cat audi
        4\t audi and bmw
        """)
    dataset_file = tmpdir.join("dataset.txt")
    dataset_file.write(dataset_text)

    index = build_inverted_index(load_documents(dataset_file))

    # (query words, expected document ids), run in this order.
    query_cases = [
        (["happy", "good"], [2]),
        (["happy", "good", "cat"], [2]),
        (["cat"], [1, 2, 3]),
        (["cat", "audi"], [3]),
        (["cat", "audi", 'cat'], [3]),
        (["cat", "audi", 'audi'], [3]),
        (["audi", 'bmw'], [4]),
        (["audi", 'bmw', 'cat'], []),
    ]
    for query_words, expected_ids in query_cases:
        assert index.query(query_words) == expected_ids
def test_binary_dump_and_load_index(tmp_path,
                                    tiny_sample_document,
                                    words=None):
    """Dump and load the index with the binary storage policy, then query it.

    :param tmp_path: temporary directory in which the index file is created
    :param tiny_sample_document: documents used to build the index
    :param words: query words handed to ``query_callback``; defaults to
        ``['of', 'words']`` when omitted
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if words is None:
        words = ['of', 'words']

    dir_ = tmp_path / "tiny_example_dir"
    dir_.mkdir()
    index_file = dir_ / "tiny_example.bin.index"
    docs = tiny_sample_document
    inv_table = build_inverted_index(docs)
    inv_table.dump(index_file, storage_policy='binary')
    assert inv_table == TINY_SAMPLE_INV_TABLE
    loaded_inv_table = InvertedIndex.load(index_file, storage_policy='binary')
    assert inv_table == loaded_inv_table

    # Exercise the query callback with an argparse-like namespace object.
    Args = namedtuple('Args', ['index_path', 'words'])
    args = Args(index_path=index_file, words=words)
    response = query_callback(args)
    ethalon_response = [
        {14, 1000},
    ]
    assert response == ethalon_response
def test_index_creation():
    """Build the index from the tiny sample file and compare it with references.

    The index itself is compared with ``TINY_SAMPLE_INV_TABLE`` and its
    ``repr`` with the ``repr`` of ``TINY_SAMPLE_WORD_DICT``.
    """
    built_index = build_inverted_index(load_documents(TINY_SAMPLE_FILEPATH))
    assert TINY_SAMPLE_INV_TABLE == built_index
    expected_repr = repr(TINY_SAMPLE_WORD_DICT)
    assert expected_repr == repr(built_index)
def test_invert_table_eq(tiny_sample_document):
    """Two indexes built from the same documents must compare equal."""
    first_table, second_table = (
        build_inverted_index(tiny_sample_document),
        build_inverted_index(tiny_sample_document),
    )
    assert first_table == second_table
예제 #26
0
파일: main.py 프로젝트: jgromeros/ir
# -*- coding: latin-1 -*-
'''
Created on 7/02/2013
@author: 74187593
'''
from inverted_index import build_inverted_index
from boolean_query import intersect_several, union

if __name__ == '__main__':
    # Script entry point: build the index for a hard-coded directory, print
    # every term with its frequency and postings, then print the documents
    # matching a boolean query.
    # The print calls use the parenthesised single-argument form so the file
    # parses on Python 3 and prints the same output on Python 2.
    my_path = 'C:\\temp\\benedetti'
    # Passed to build_inverted_index and later looked up by posting id;
    # presumably filled in by build_inverted_index -- verify there.
    document_list = {}
    dictionary = build_inverted_index(my_path, document_list)
    for d in sorted(dictionary.keys()):
        print(d + " : " + str(dictionary[d].frequency) + " : " + str(dictionary[d].postings))
    print(len(dictionary))
    # NOTE(review): the intersection result is overwritten by the union on the
    # next line, so only the union is printed -- confirm this is intended.
    answer = intersect_several([dictionary[u"tu"], dictionary[u"por"], dictionary[u"te"]])
    answer = union(dictionary[u"tu"], dictionary[u"por"])
    for document_found in answer.postings:
        print(document_list[document_found])
def test_build_inverted_index_do_not_raise_exception():
    """Building an index from an empty document list must not raise."""
    no_documents = list()
    build_inverted_index(no_documents)
예제 #28
0
def test_inverted_index_dump(wiki_docs):
    """Smoke test: dumping an index built from ``wiki_docs`` must not raise.

    The dump goes to the relative path ``inv_index.dat``; nothing about the
    written file is checked.
    """
    wrapped_index = InvertedIndex(build_inverted_index(wiki_docs))
    wrapped_index.dump('inv_index.dat')
    # Reaching this line means no exception was raised above.
    assert True
예제 #29
0
def test_build_inverted_index(wiki_docs):
    """``build_inverted_index`` must return a ``dict`` (or subclass) for ``wiki_docs``."""
    built = build_inverted_index(wiki_docs)
    assert isinstance(built, dict)
예제 #30
0
        this is done over just putting query results in a shelf simply because
        the stored results are deleted after the browser is closed and thus
        are less permanent
    """
    parser = argparse.ArgumentParser(description="Boolean IR system")
    parser.add_argument("--build")
    parser.add_argument("--run", action="store_true")
    parser.add_argument("--test", action="store_true")
    args = parser.parse_args()
    print(args)
    if args.test:
        wapo_path = data_dir.joinpath("test_corpus.jl")
        # print("TEST")

    if args.build:
        build_inverted_index(wapo_path, str(data_dir.joinpath(
            args.build)))  # shelve.open cannot recognize Path
        shelve_wapo(wapo_path, shelf_path)
    if args.run:
        # Use context managers for safe open and close
        with shelve.open(shelf_path, flag='r') as wapo:
            # Name full inverted index FULL
            with shelve.open(str(data_dir.joinpath("FULL")),
                             flag='r') as index:
                with shelve.open(str(data_dir.joinpath("QUERY")),
                                 flag='n') as query:
                    WAPO_SHELF = wapo
                    INDEX_SHELF = index
                    QUERY_SHELF = query
                    app.run(debug=True, port=5000)
def get_inverted_index():
    """Load the documents at ``DATASET_SMALL_FPATH`` and return their inverted index."""
    return build_inverted_index(load_documents(DATASET_SMALL_FPATH))