def test_build_inverted_one_doc_have_doc_with_dont_new_index_30(
        create_not_corect_data_with_two_doc_have_one_ndex):
    """
    Check the word -> document ids mapping built from the fixture file
    once the stop words are excluded.

    :param create_not_corect_data_with_two_doc_have_one_ndex: fixture whose
        value is passed straight to ``load_documents``
    """
    source_file = create_not_corect_data_with_two_doc_have_one_ndex
    doc_ids, doc_words = inverted_index.load_documents(source_file)
    ignored_words = {'i', 'a', 'am', 'is', 'by', 'and', 'the'}
    expected_mapping = {
        'test': {4, 5, 8, 6},
        'sit': {4},
        'window': {4},
        'make': {4},
        'program': {4},
        'third': {5},
        'about': {5},
        'something': {5},
        'number': {8},
        'tree': {8},
        'now': {6},
        'south': {6},
        'watching': {6},
        'park': {6},
    }
    built_index = inverted_index.build_inverted_index(
        stop_words=ignored_words, indexs=doc_ids, words=doc_words)
    assert expected_mapping == built_index.word_to_docs_mapping
def test_all_22(tmp_path, tmpdir):
    """
    End-to-end check: load documents and stop words, build the inverted
    index, dump it, load it back and query a single word.

    :param tmp_path: temporary directory (requested but not used in the body)
    :param tmpdir: temporary directory in which the test files are created
    """
    # Temporary dataset file.
    dataset_file = tmpdir.join('datatest.txt')
    dataset_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan, two\n12\tTrest!!! best wreit!')
    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('Test\ntest\nnumber\nTe\ntext\nnumber2')
    index_file = tmpdir.join('inverted.index')

    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    ignored_words = inverted_index.load_stop_words(stop_words_file)
    built_index = inverted_index.build_inverted_index(
        indexs=doc_ids, words=doc_words, stop_words=ignored_words)

    # Persist the index to disk (the original note says it is stored as JSON).
    built_index.dump(index_file)
    restored_index = inverted_index.InvertedIndex.load(index_file)

    found_ids = restored_index.query(["two"])
    expected_ids = {1, 3}
    assert expected_ids == found_ids
def test_can_load_documents():
    """Compare ``load_documents(TINY_DATASET_FPATH)`` with the expected dict."""
    etalon_documents = {
        "12": "another sentense four two one one three.\n",
        "25": "one two three four words.\n",
    }
    documents = load_documents(TINY_DATASET_FPATH)
    assert etalon_documents == documents, (
        "load_documents incorrectly load dataset")
def test_read_docs_sample_v1():
    """Check the documents loaded from ``SMALL_SAMPLE_FILEPATH``.

    The expected mapping includes one entry with non-ASCII text.
    """
    expected_docs = {
        1: 'Article 1 Some text to test inverted index',
        2: 'Article 2 Another paragraph with no common words with first one',
        17: 'Article 3 Sample text similar to first article for test',
        5: "АФЫФЫё фывфапфва фывтлавы фывтлфы ΔG ‡",
    }
    actual_docs = load_documents(SMALL_SAMPLE_FILEPATH)
    assert expected_docs == actual_docs
def creat_doc_have_not_words(tmpdir):
    """
    Write a dataset in which document 8 has an empty text and return the
    result of ``inverted_index.load_documents`` for that file.

    Presumably a pytest fixture; the decorator is not visible here.

    :param tmpdir: temporary directory in which the dataset file is created
    """
    dataset_text = (
        '4\tI sit by the window and make a program test\n'
        '8\t\n'
        '5\tThe third test is about something\n'
        '6\tI am watching south park now, test'
    )
    dataset_file = tmpdir.join('wiki_file')
    dataset_file.write(dataset_text)
    loaded = inverted_index.load_documents(dataset_file)
    return loaded
def test_read_docs_sample_v2(tmpdir):
    """
    Load a two-document dataset written to a temporary file and compare it
    with the expected ``{doc_id: text}`` mapping.

    :param tmpdir: temporary directory in which the dataset file is created
    """
    # NOTE(review): the id/text separator is written as a tab here; confirm
    # against the format expected by load_documents.
    dataset_str = dedent("""\
        14\tBOW Bag of words
        1000\tCBOW Continius bag of words
    """)
    dataset_fio = tmpdir.join("light.dataset")
    dataset_fio.write(dataset_str)
    etalon_docs = {
        14: "BOW Bag of words",
        1000: "CBOW Continius bag of words",
    }
    loaded_docs = load_documents(dataset_fio)
    assert loaded_docs == etalon_docs
def test_can_open_file_in_load_documents(self, creat_data_file_wiki_sample):
    """
    Check the ids and per-document word sets returned by
    ``inverted_index.load_documents`` for the sample file.

    :param creat_data_file_wiki_sample: fixture value passed as ``filepath``
    """
    expected_ids = [0, 1, 2]
    expected_words = [
        {'believe', 'in', 'tears'},
        {'the', 'wind', 'is', 'making', 'noise', 'in', 'my', 'head'},
        {'walking', 'with', 'spring'},
    ]
    doc_ids, doc_words = inverted_index.load_documents(
        filepath=creat_data_file_wiki_sample)
    assert doc_ids == expected_ids
    assert doc_words == expected_words
def test_can_load_documents(tiny_dataset_fio):
    """
    Compare the documents loaded from the tiny dataset with the expected
    ``{doc_id: text}`` mapping.

    :param tiny_dataset_fio: fixture value passed to ``load_documents``
    """
    expected_documents = {
        123: "same words A_word and nothing",
        2: "same words B_word in this dataset",
        5: "famous_phrases to be or not to be",
        37: "all words such as A_word and B_word are here",
    }
    loaded = load_documents(tiny_dataset_fio)
    assert expected_documents == loaded, "load_documents work incorrectly"
def test_number_and_no_words_in_document_29(
        create_not_corect_data_with_two_doc_have_one_ndex):
    """
    Check ids and word sets loaded from the fixture file; the second entry
    is expected to have ``None`` in place of a word set.

    :param create_not_corect_data_with_two_doc_have_one_ndex: fixture whose
        value is passed straight to ``load_documents``
    """
    source_file = create_not_corect_data_with_two_doc_have_one_ndex
    doc_ids, doc_words = inverted_index.load_documents(source_file)
    expected_ids = [4, 8, 5, 6, 8]
    expected_words = [
        {'i', 'sit', 'by', 'the', 'window', 'and', 'make', 'a', 'program',
         'test'},
        None,
        {'the', 'third', 'test', 'is', 'about', 'something'},
        {'i', 'am', 'watching', 'south', 'park', 'now', 'test'},
        {'test', 'number', 'tree'},
    ]
    assert expected_words == doc_words
    assert expected_ids == doc_ids
def test_work_function_load_documents_with_lot_doc_5(tmpdir):
    """
    Check ``load_documents`` on a file holding two well-formed documents.

    :param tmpdir: temporary directory in which the test file is created
    """
    dataset_file = tmpdir.join('datatest.txt')
    dataset_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!')
    expected_ids = [0, 1]
    expected_words = [
        {'test', 'text', 'number', 'one'},
        {'test', 'number', 'two', 'text'},
    ]
    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    assert doc_words == expected_words
    assert doc_ids == expected_ids
def creat_not_corect_data(tmpdir):
    """
    Write a dataset in which two of the four lines have no document id, plus
    a stop-words file, and load both through ``inverted_index``.

    Presumably a pytest fixture; the decorator is not visible here.

    :param tmpdir: temporary directory in which both files are created
    :return: tuple ``(indexes, words, stop_words)`` as returned by
        ``load_documents`` and ``load_stop_words``
    """
    dataset_text = (
        '4\tI sit by the window and make a program test\n'
        '\tSasha ate porridge, a little, but it was delicious test\n'
        '5\tThe third test is about something\n'
        '\tI am watching south park now, test'
    )
    dataset_file = tmpdir.join('wiki_file')
    dataset_file.write(dataset_text)

    stop_words_file = tmpdir.join('stop_words')
    stop_words_file.write('the\ni\na\nate\nby\nand\nbut\nit\nis\nam\n')

    loaded_stop_words = inverted_index.load_stop_words(stop_words_file)
    loaded_ids, loaded_words = inverted_index.load_documents(dataset_file)
    return loaded_ids, loaded_words, loaded_stop_words
def test_doc_do_not_contaon_index_25(tmpdir):
    """
    Check how ``load_documents`` handles a document line without an id.

    :param tmpdir: temporary directory for the test file
    """
    dataset_file = tmpdir.join('test_wiki_doc.txt')
    dataset_file.write(
        '\tName Doc this test doc. I will use this text for the test.')
    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    # A missing id is expected to come back as None.
    expected_ids = [None]
    # The text is expected to become a set of lower-cased words.
    expected_words = [
        {'name', 'doc', 'this', 'test', 'i', 'will', 'use', 'text', 'for',
         'the'},
    ]
    assert doc_ids == expected_ids
    assert expected_words == doc_words
def test_work_function_load_documents_with_an_extra_newline_character_6(
        tmpdir):
    """
    Check ``load_documents`` when the file ends with a trailing newline
    character.

    :param tmpdir: temporary directory in which the test file is created
    """
    # Temporary dataset file.
    dataset_file = tmpdir.join('datatest.txt')
    dataset_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n')
    expected_ids = [0, 1]
    expected_words = [
        {'test', 'text', 'number', 'one'},
        {'test', 'number', 'two', 'text'},
    ]
    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    assert doc_ids == expected_ids
    assert doc_words == expected_words
def test_query_inverted_index_with_query_file_utf_8():
    """
    Query the tiny inverted index with every line of the UTF-8 query file.

    The first query line is expected to match documents 12 and 25; every
    following line is expected to match document 25 only.
    """
    documents = load_documents(TINY_DATASET_FPATH)
    tiny_inverted_index = build_inverted_index(documents)
    tiny_inverted_index.dump_binary(TINY_INVERTED_INDEX_STORE_PATH)
    # Decode explicitly as UTF-8: without ``encoding`` the result depends on
    # the platform locale and may fail on non-ASCII queries.
    with open(QUERY_FILE_UTF8_FPATH, encoding="utf-8") as q_file:
        for line_number, line in enumerate(q_file, start=1):
            answer = tiny_inverted_index.query(line.split())
            etalon_answer = [12, 25] if line_number == 1 else [25]
            assert sorted(answer) == sorted(etalon_answer), (
                f"Expected answer is {etalon_answer},but you got {answer}")
def test_can_load_documents(tmpdir):
    """
    Write a three-document dataset to a temporary file and compare the
    loaded documents with the expected ``{doc_id: text}`` mapping.

    :param tmpdir: temporary directory in which the dataset file is created
    """
    dataset_text = dedent("""\
        1\thappy cat
        2\thappy cat good
        3\tgood cat
    """)
    dataset_fio = tmpdir.join("dataset.txt")
    dataset_fio.write(dataset_text)
    etalon_documents = {
        1: "happy cat",
        2: "happy cat good",
        3: "good cat",
    }
    loaded = load_documents(dataset_fio)
    assert etalon_documents == loaded, (
        "load_documents incorrectly loaded dataset")
def test_work_function_load_documents_with_lot_doc_7(tmpdir):
    """
    Check ``load_documents`` on a file holding several documents.

    :param tmpdir: temporary directory in which the test file is created
    """
    # Temporary dataset file.
    dataset_file = tmpdir.join('datatest.txt')
    dataset_file.write(
        '0\tTest text! test number one...\n1\tTest text... number two!\n'
        '2\tKent!!!! red gay\n3\tBoys len lan\n12\tTrest!!! best wreit!')
    expected_ids = [0, 1, 2, 3, 12]
    expected_words = [
        {'test', 'text', 'number', 'one'},
        {'test', 'number', 'two', 'text'},
        {'kent', 'red', 'gay'},
        {'boys', 'len', 'lan'},
        {'trest', 'best', 'wreit'},
    ]
    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    assert doc_ids == expected_ids
    assert doc_words == expected_words
def build_inverted_index_for_creat_data_not_corect(tmpdir):
    """
    Build an inverted index from a dataset whose first line has no document
    id, using a two-word stop list.

    Presumably a pytest fixture; the decorator is not visible here.

    :param tmpdir: temporary directory in which both files are created
    :return: the object returned by ``inverted_index.build_inverted_index``
    """
    dataset_file = tmpdir.join('wiki_doc')
    dataset_file.write(
        '\tName Shasha train this program and work with data like!\n'
        '4\tTest name number two and test, i like programming!')
    stop_words_file = tmpdir.join('stop_words.txt')
    stop_words_file.write('and\ni\n')

    loaded_docs = inverted_index.load_documents(filepath=dataset_file)
    loaded_stop_words = inverted_index.load_stop_words(
        filepath=stop_words_file)
    return inverted_index.build_inverted_index(
        stop_words=loaded_stop_words,
        words=loaded_docs[1],
        indexs=loaded_docs[0])
def test_work_function_load_documents_with_one_doc_4(tmpdir):
    """
    Check ``load_documents`` on a file holding a single well-formed document.

    :param tmpdir: temporary directory in which the test file is created
    """
    # File inside the temporary directory.
    dataset_file = tmpdir.join('datatest.txt')
    # Temporary test data.
    dataset_file.write(
        '0\tTest text testing, my work now! Train testing this, working, bool?'
    )
    doc_ids, doc_words = inverted_index.load_documents(dataset_file)
    # Exactly one document id is expected.
    expected_ids = [0]
    expected_words = [
        {'test', 'text', 'testing', 'my', 'work', 'now', 'train', 'this',
         'working', 'bool'},
    ]
    assert expected_ids == doc_ids
    assert expected_words == doc_words
def test_can_query(tmpdir):
    """
    Build an index from a four-document dataset and check several queries,
    including repeated words and a query with no matching document.

    :param tmpdir: temporary directory in which the dataset file is created
    """
    dataset_text = dedent("""\
        1\thappy cat wow
        2\thappy cat good
        3\tgood cat audi
        4\t audi and bmw
    """)
    dataset_fio = tmpdir.join("dataset.txt")
    dataset_fio.write(dataset_text)

    index = build_inverted_index(load_documents(dataset_fio))

    assert index.query(["happy", "good"]) == [2]
    assert index.query(["happy", "good", "cat"]) == [2]
    assert index.query(["cat"]) == [1, 2, 3]
    assert index.query(["cat", "audi"]) == [3]
    # Repeating a word must not change the answer.
    assert index.query(["cat", "audi", 'cat']) == [3]
    assert index.query(["cat", "audi", 'audi']) == [3]
    assert index.query(["audi", 'bmw']) == [4]
    assert index.query(["audi", 'bmw', 'cat']) == []
def test_index_creation():
    """
    Build an index from ``TINY_SAMPLE_FILEPATH`` and compare it with
    ``TINY_SAMPLE_INV_TABLE``, then compare its ``repr`` with that of
    ``TINY_SAMPLE_WORD_DICT``.
    """
    built_index = build_inverted_index(load_documents(TINY_SAMPLE_FILEPATH))
    assert TINY_SAMPLE_INV_TABLE == built_index
    assert repr(TINY_SAMPLE_WORD_DICT) == repr(built_index)
def test_load_documents():
    """
    Check that ``inverted_index.load_documents`` returns a truthy result for
    the file produced by ``prepare_file``.

    The file is removed in ``finally`` so it is not left behind when loading
    or the assertion fails.
    """
    test_file_name = prepare_file()
    try:
        docs = inverted_index.load_documents(test_file_name)
        assert docs
    finally:
        os.remove(test_file_name)
def test_not_existing_file():
    """
    Check ``FileNotFoundError`` handling for a missing path.

    ``check_filepath_existance`` and ``load_documents`` must both raise for
    ``NOT_REAL_FILENAME``; ``check_filepath_existance`` is also called with
    ``SMALL_SAMPLE_FILEPATH``, where no exception is expected.
    """
    missing_path = NOT_REAL_FILENAME

    with pytest.raises(FileNotFoundError):
        check_filepath_existance(missing_path)

    # An existing path must pass the check without raising.
    check_filepath_existance(SMALL_SAMPLE_FILEPATH)

    with pytest.raises(FileNotFoundError):
        load_documents(missing_path)
def test_can_load_wikipedia_sample():
    """Check that the Wikipedia sample loads as exactly 4100 documents."""
    loaded_count = len(load_documents(WIKIPEDIA_DATASET_FPATH))
    assert loaded_count == 4100, "you incorrectly loaded Wikipedia sample"
def tiny_sample_document():
    """Return the documents loaded from ``TINY_SAMPLE_FILEPATH``.

    Presumably a pytest fixture; the decorator is not visible here.
    """
    return load_documents(TINY_SAMPLE_FILEPATH)
def test_can_load_right_len_document(filepath, document_len):
    """
    Check that ``load_documents`` returns the expected number of documents.

    Presumably parametrized; the decorator is not visible here.

    :param filepath: dataset path passed to ``load_documents``
    :param document_len: expected ``len`` of the loaded result
    """
    loaded_count = len(load_documents(filepath))
    assert loaded_count == document_len, (
        "load_documents work load incorrect len")
def wiki_docs():
    """Return the documents loaded from the ``small_wiki_sample`` file.

    Presumably a pytest fixture; the decorator is not visible here.
    """
    documents = load_documents('small_wiki_sample')
    return documents
def test_load_documents_exception():
    """Check that ``load_documents`` raises ``FileNotFoundError`` for a
    path that does not exist."""
    # Neither the exception info nor the return value is inspected, so
    # nothing is bound to a name here.
    with pytest.raises(FileNotFoundError):
        load_documents('wikipedia_sample/ikipedia_sample')
def get_inverted_index():
    """Build and return the inverted index for ``DATASET_SMALL_FPATH``."""
    return build_inverted_index(load_documents(DATASET_SMALL_FPATH))
def small_dataset_documents():
    """Return the documents loaded from ``DATASET_SMALL_FPATH``.

    Presumably a pytest fixture; the decorator is not visible here.
    """
    return load_documents(DATASET_SMALL_FPATH)
def tiny_dataset_documents():
    """Return the documents loaded from ``DATASET_TINY_FPATH``.

    Presumably a pytest fixture; the decorator is not visible here.
    """
    return load_documents(DATASET_TINY_FPATH)