def get_db_builder(dump_file, stemmer):
    """Creates a DbBuilder and feeds it every cleaned document from the dump."""
    db_builder = DbBuilder(stemmer)
    # (dump_file, keep_sections=False, keep_links=False)
    clean_docs = parse_tools.iterate_wikidocs_from_dump(
        dump_file, cleaner_WikiExtractor)
    for doc in clean_docs:
        db_builder.add_document(doc)
    return db_builder
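A minimal usage sketch for get_db_builder. The dump file name below is hypothetical, and it assumes DbBuilder.build() returns the finished database with the interface the tests further down rely on (get_titles_index, words_index).

if __name__ == "__main__":
    stemmer = stemmers.StopWordsStemmer()
    # hypothetical path; point this at a real Wikipedia dump file
    builder = get_db_builder("enwiki-latest-pages-articles.xml.bz2", stemmer)
    db = builder.build()
    print("Indexed {0} concepts".format(len(db.get_titles_index())))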
def test__advanced_doc(self):
    """Tests that the new (DbBuilder-based) flow reproduces the results of
    the old test__advanced_doc."""
    expected_wf = simple_wf()
    builder = DbBuilder(StopWordsStemmer([]))
    for doc in expected_wf.docs:
        builder.add_document(doc)
    actual_wf = WorkFlow()
    builder.build(wf=actual_wf, normalization=False)
    # workaround to handle dimensions mismatch
    expected_wf.df_vec = matrix(expected_wf.df_vec)
    assert_allclose(actual_wf.df_vec.todense(),
                    expected_wf.df_vec.todense())
    assert_allclose(actual_wf.wieghts_mat.todense(),
                    expected_wf.wieghts_mat.todense())
def add_docs_from_parsed_xml_to_builder(parsed_dump, stemmer=None):
    """Builds the WikiRep database.

    @param parsed_dump: parsed Wikipedia xml (e.g. wikiparsed.xml)
    @return: db_builder
    """
    _log.debug("-" * 80)
    _log.info("Building DB from parsed dump: {0}".format(parsed_dump))
    if stemmer is None:
        stemmer = stemmers.StopWordsStemmer()
    db_builder = DbBuilder(stemmer)

    if not os.path.isfile(parsed_dump):
        raise Exception(
            "Parsed dump does not exist: {0}".format(parsed_dump))

    # TODO: add parsed page reader
    xml_pages = parse_tools.iterate_wiki_doc(parsed_dump)
    doc_count = 0
    for doc in xml_pages:
        doc_count += 1
        _log.debug("Adding document #{0}: {1}".format(doc_count, doc.title))
        db_builder.add_document(doc)
    _log.info("Added {0} documents".format(doc_count))
    return db_builder
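Driving add_docs_from_parsed_xml_to_builder end to end might look like the sketch below; the file name is the docstring's own example, and error handling is omitted.

builder = add_docs_from_parsed_xml_to_builder("wikiparsed.xml")
db = builder.build()
print("DB covers {0} words".format(len(db.words_index)))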
class Test(test_utils.TestBase):
    def setUp(self):
        stemmer = StopWordsStemmer([])
        self.db_builder = DbBuilder(stemmer)

    def test__simple_doc(self):
        doc = test_utils.DocumentStub(title="Testing",
                                      raw_text="The eagle has landed")
        self.db_builder.add_document(doc)

        expected_titles_index = ["Testing"]
        expected_words_index = ["The", "eagle", "has", "landed"]
        # with a single document every idf term is log(1/1) = 0,
        # so all weights are expected to be zero
        expected_weights_matrix = [[0], [0], [0], [0]]

        # create db
        actual_db = self.db_builder.build()
        self.assert_dbs_equal(actual_db, expected_titles_index,
                              expected_words_index, expected_weights_matrix)

    def tfidf(self, counter):
        return (1 + math.log(counter)) if counter else 0

    def test__advanced_doc(self):
        # first doc
        doc = test_utils.DocumentStub(title="Testing advanced",
                                      raw_text="a b c c c d d d d e")
        self.db_builder.add_document(doc)
        # second doc
        doc = test_utils.DocumentStub(title="Testing advanced 2",
                                      raw_text="a a a a a b c c c d d d e e")
        self.db_builder.add_document(doc)
        # third doc
        doc = test_utils.DocumentStub(title="Testing advanced 3",
                                      raw_text="b b b b f f f f")
        self.db_builder.add_document(doc)

        docs_num = 3
        expected_titles_index = ["Testing advanced",
                                 "Testing advanced 2",
                                 "Testing advanced 3"]
        expected_words_index = ['a', 'b', 'c', 'd', 'e', 'f']
        weights_matrix = [
            [self.tfidf(counter) * math.log(docs_num / 2.0) for counter in [1, 5, 0]],  # a
            [self.tfidf(counter) * math.log(docs_num / 3.0) for counter in [1, 1, 4]],  # b
            [self.tfidf(counter) * math.log(docs_num / 2.0) for counter in [3, 3, 0]],  # c
            [self.tfidf(counter) * math.log(docs_num / 2.0) for counter in [4, 3, 0]],  # d
            [self.tfidf(counter) * math.log(docs_num / 2.0) for counter in [1, 2, 0]],  # e
            [self.tfidf(counter) * math.log(docs_num / 1.0) for counter in [0, 0, 4]],  # f
        ]

        # cosine normalization: divide each row by its Euclidean norm
        expected_weights_matrix = []
        for row in weights_matrix:
            norm = math.sqrt(sum(t ** 2 for t in row))
            normalized_row = [t / norm if norm else 0.0 for t in row]
            expected_weights_matrix.append(normalized_row)

        # create db
        actual_db = self.db_builder.build(normalization=False)
        self.assert_dbs_equal(actual_db, expected_titles_index,
                              expected_words_index, weights_matrix)
        # TODO: use the normalized matrix for comparison once db_builder
        # supports normalization
        # self.assert_dbs_equal(actual_db, expected_titles_index,
        #                       expected_words_index, expected_weights_matrix)

    def assert_dbs_equal(self, actual_db, expected_titles_index,
                         expected_words_index, expected_weights_matrix):
        # validate results
        self.assertEqual(set(actual_db.words_index),
                         set(expected_words_index),
                         "Mismatch words index content (not validating order)")
        self.assertEqual(actual_db.get_titles_index(),
                         expected_titles_index,
                         "Mismatch concepts index")

        # validate db_matrix content
        for j in range(len(expected_titles_index)):
            for i in range(len(expected_words_index)):
                title = expected_titles_index[j]
                # extract expected values from index and matrix
                expected_word = expected_words_index[i]
                expected_weight = expected_weights_matrix[i][j]

                actual_word_index = actual_db.words_index.index(expected_word)
                actual_concept_index = actual_db.get_titles_index().index(title)
                actual_weight = actual_db.wieght_matrix[actual_word_index,
                                                        actual_concept_index]
                self.assertAlmostEqual(
                    expected_weight, actual_weight,
                    msg="Wrong table value at word/concept [{}, {}]".format(
                        expected_word, title))

    def test__parsing_fails_on_duplicated_title(self):
        pass
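As a standalone sanity check on the expected values above: the word 'a' appears once in the first document and occurs in 2 of the 3 documents, so its unnormalized weight should be (1 + log 1) * log(3/2) = log 1.5. This mirrors the tf-idf scheme the tests assume (log-scaled tf, log idf, no normalization).

import math

tf_weight = 1 + math.log(1)   # log-scaled term frequency for a count of 1
idf = math.log(3 / 2.0)       # 3 documents, 'a' appears in 2 of them
assert abs(tf_weight * idf - math.log(1.5)) < 1e-9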