def test__same_text_correlation(self):
    """Comparing a text against itself must give a correlation of 1."""
    _log.info('-' * 80)
    # arrange: one sentence compared with itself
    first_text = "love is rain as long story short"
    second_text = first_text
    dump_file = getInputFile("swiki_knowledge_output.xml")
    parsed_file = getOutputFile("swiki_knowledge_output.parsed.xml")
    articles = ['Rain', 'Love', 'Tree']
    # act: build the wiki knowledge base, then compare
    wn.make_dump(dump_file, articles, compress=False)
    wn.parse_dump(dump_file, parsed_file)
    db_wrapper = wn.build_database_wrapper(parsed_file, StopWordsStemmer([]))
    comparer = SemanticComparer(db_wrapper)
    correlation = comparer.compare(first_text, second_text)
    _log.info(test_utils.get_texts_correlation_message(
        first_text, second_text, correlation))
    # assert: identical inputs are perfectly correlated
    self.assertAlmostEqual(correlation, 1.0,
                           msg="for same text correlation should be 1")
def test__same_text_correlation(self):
    """A text compared with itself should correlate to exactly 1."""
    _log.info('-' * 80)
    # arrange
    sample = "love is rain as long story short"
    twin = sample
    dump_file = getInputFile("swiki_knowledge_output.xml")
    parsed_file = getOutputFile("swiki_knowledge_output.parsed.xml")
    articles = ['Rain', 'Love', 'Tree']
    # act: produce the dump, parse it, and wrap it in a database
    wn.make_dump(dump_file, articles, compress=False)
    wn.parse_dump(dump_file, parsed_file)
    db_wrapper = wn.build_database_wrapper(parsed_file, StopWordsStemmer([]))
    correlation = SemanticComparer(db_wrapper).compare(sample, twin)
    _log.info(test_utils.get_texts_correlation_message(sample, twin, correlation))
    # assert
    self.assertAlmostEqual(correlation, 1.0,
                           msg="for same text correlation should be 1")
def test__many_articles(self): wiki_dump_path = getInputFile("many_articles_dump.xml") parsed_xml_path = getOutputFile("many_articles_dump.parsed.xml") wiki_knowledge.parse_dump(wiki_dump_path, parsed_xml_path) db_wrapper = wiki_knowledge.build_database_wrapper(parsed_xml_path, PorterStemmer()) c = db_wrapper.get_readable_centroid(ibm_licence_text) print c
def test__many_articles(self):
    """'Computer' must rank in the top-5 concepts for both licence texts.

    Previously this test only printed the top-5 concepts (duplicating the
    sort logic inline) and asserted nothing; now it uses the shared
    get_top helper and real assertions, consistent with the sibling
    wdb-based tests in this file.
    """
    wdb_path = getInputFile("many_articles_dump.wdb")
    db_wrapper = wiki_knowledge.load_db_wrapper_from_wdb(wdb_path)
    # full licence text
    d = db_wrapper.get_readable_centroid(ibm_licence_text_full)
    top = get_top(d, 5)
    self.assertIn("Computer", dict(top))
    # shorter excerpt
    d = db_wrapper.get_readable_centroid(ibm_licence_text)
    top = get_top(d, 5)
    self.assertIn("Computer", dict(top))
def test__many_articles(self): wiki_dump_path = getInputFile("many_articles_dump.xml") parsed_xml_path = getOutputFile("many_articles_dump.parsed.xml") wiki_knowledge.parse_dump(wiki_dump_path, parsed_xml_path) db_wrapper = wiki_knowledge.build_database_wrapper( parsed_xml_path, PorterStemmer()) c = db_wrapper.get_readable_centroid(ibm_licence_text) print c
def test__many_articles(self):
    """'Computer' should appear in the top-5 concepts of both licence texts."""
    db = wiki_knowledge.load_db_wrapper_from_wdb(
        getInputFile("many_articles_dump.wdb"))
    # check the full text first, then the excerpt
    for text in (ibm_licence_text_full, ibm_licence_text):
        centroid = db.get_readable_centroid(text)
        self.assertIn("Computer", dict(get_top(centroid, 5)))
def test__many_articles(self):
    """Both licence texts must place 'Computer' among their top-5 concepts."""
    wdb_file = getInputFile("many_articles_dump.wdb")
    wrapper = wiki_knowledge.load_db_wrapper_from_wdb(wdb_file)
    # full licence text
    full_centroid = wrapper.get_readable_centroid(ibm_licence_text_full)
    self.assertIn("Computer", dict(get_top(full_centroid, 5)))
    # shorter licence text
    short_centroid = wrapper.get_readable_centroid(ibm_licence_text)
    self.assertIn("Computer", dict(get_top(short_centroid, 5)))
def test_extract_pages(self):
    """Regression check that extract_pages works well.

    The unused 'template' formatting string that the original kept
    around has been removed (dead code).
    """
    expected = [
        (243478, 'Ross Ice Shelf', 13734),
        (18798090, 'Southern Cross Expedition', 39110),
        (343246, 'Ice shelf', 8262),
    ]
    test__parse_tools_xml = getInputFile(FilesList.test__parse_tools)
    # (id, title, wiki-text length) triple for every page in the fixture
    actual = [(wdoc.id, wdoc.title, len(wdoc.wiki_text))
              for wdoc in pt.iterate_wiki_pages(test__parse_tools_xml)]
    self.assertSequenceEqual(
        actual, expected,
        "Assertion failure: \nActual={}\nExpected={}".format(actual, expected))
def test_extract_pages(self):
    """Regression check that extract_pages works well.

    Drops the 'template' local the original assigned but never used.
    """
    expected = [(243478, 'Ross Ice Shelf', 13734),
                (18798090, 'Southern Cross Expedition', 39110),
                (343246, 'Ice shelf', 8262)]
    test__parse_tools_xml = getInputFile(FilesList.test__parse_tools)
    # collect (id, title, length) for each page iterated from the fixture
    actual = [(wdoc.id, wdoc.title, len(wdoc.wiki_text))
              for wdoc in pt.iterate_wiki_pages(test__parse_tools_xml)]
    self.assertSequenceEqual(
        actual, expected,
        "Assertion failure: \nActual={}\nExpected={}".format(actual, expected))
def test_number_of_concepts(self):
    """db builder reads parsed xml properly.

    Fixes the assertion messages: 'tiltes' was a typo for 'titles', and
    the concepts-count failure message wrongly talked about titles.
    """
    _log.info('-' * 80)
    # arrange
    dump_file = getInputFile("wikidump_Knowledge_Love_War.xml")
    parsed_file = getOutputFile("wikidump_Knowledge_Love_War.parsed.xml")
    # act
    wn.parse_dump(dump_file, parsed_file)
    db_wrapper = wn.build_database_wrapper(parsed_file, StopWordsStemmer([]))
    titles_count = len(db_wrapper.title_index)
    concepts_count = len(db_wrapper.concepts_index)
    # assert: the three-article dump yields three titles and three concepts
    self.assertEqual(
        titles_count, 3,
        "number of titles should be 3, got {0}".format(titles_count))
    self.assertEqual(
        concepts_count, 3,
        "number of concepts should be 3, got {0}".format(concepts_count))
def test_number_of_concepts(self):
    """db builder reads parsed xml properly.

    Corrects the failure messages: 'tiltes' -> 'titles', and the
    concepts assertion now mentions concepts instead of titles.
    """
    _log.info('-' * 80)
    # arrange
    dump_file = getInputFile("wikidump_Knowledge_Love_War.xml")
    parsed_file = getOutputFile("wikidump_Knowledge_Love_War.parsed.xml")
    # act
    wn.parse_dump(dump_file, parsed_file)
    db_wrapper = wn.build_database_wrapper(parsed_file, StopWordsStemmer([]))
    titles_count = len(db_wrapper.title_index)
    concepts_count = len(db_wrapper.concepts_index)
    # assert
    self.assertEqual(
        titles_count, 3,
        "number of titles should be 3, got {0}".format(titles_count))
    self.assertEqual(
        concepts_count, 3,
        "number of concepts should be 3, got {0}".format(concepts_count))
def test__parse_dump(self):
    """parse_dump should run cleanly on the parse-tools fixture."""
    source = io_tu.getInputFile(io_tu.FilesList.test__parse_tools)
    target = io_tu.getOutputFile(io_tu.FilesList.test__parse_tools)
    wn.parse_dump(source, target)
def test__many_articles_files(self):
    """File-based lookup should rank 'Computer' in the top-5 concepts."""
    wdb_file = getInputFile("many_articles_dump.wdb")
    text_file = getInputFile("ibm_licence.txt")
    # compute the centroid straight from the files on disk
    centroid = wiki_knowledge.get_value_from_file(wdb_file, text_file)
    self.assertIn("Computer", dict(get_top(centroid, 5)))
def test__parse_dump(self):
    """Smoke test: parsing the parse-tools dump completes without error."""
    input_path = io_tu.getInputFile(io_tu.FilesList.test__parse_tools)
    output_path = io_tu.getOutputFile(io_tu.FilesList.test__parse_tools)
    wn.parse_dump(input_path, output_path)
def test__many_articles_files(self):
    """get_value_from_file must surface 'Computer' among the top-5 concepts."""
    db_path = getInputFile("many_articles_dump.wdb")
    licence_path = getInputFile("ibm_licence.txt")
    scores = wiki_knowledge.get_value_from_file(db_path, licence_path)
    top_concepts = dict(get_top(scores, 5))
    self.assertIn("Computer", top_concepts)