def test_empty_documents(self):
    """Verify the error recorded when every memento in a TimeMap is empty.

    All three mementos are HTML documents without any text.  After running
    the jaccard, cosine, gensim LSI and gensim LDA measures, every memento
    is expected to carry the "first memento in TimeMap is now empty" error
    message for each of those measures.
    """
    working_directory = "/tmp/test_empty_documents"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    # Registered up front so the directory is removed even when an
    # assertion below fails.
    self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2",
    }

    timemap_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    empty_html_document = b"<html><body></body></html>"

    # if the first document is empty and all subsequent docs are empty,
    # then we are still on-topic, but this is to be debated
    cm.addTimeMap("timemap1", timemap_content, headers)
    for urim in ("memento11", "memento12", "memento13"):
        cm.addMemento(urim, empty_html_document, headers)

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(cm, mm, tokenize=None, stemming=True)
    mm = compute_cosine_across_TimeMap(cm, mm, tokenize=None, stemming=True)
    mm = compute_gensim_lsi_across_TimeMap(cm, mm, tokenize=True, stemming=True)
    mm = compute_gensim_lda_across_TimeMap(cm, mm, tokenize=True, stemming=True)

    expected_error = (
        "After processing content, the first memento in TimeMap is now empty, "
        "cannot effectively compare memento content"
    )

    for urit in mm.get_TimeMap_URIs():
        for urim in mm.get_Memento_URIs_in_TimeMap(urit):
            for measurename in ["cosine", "jaccard", "gensim_lda", "gensim_lsi"]:
                # assertEqual, not the deprecated assertEquals alias
                # (removed in Python 3.12).
                self.assertEqual(
                    mm.get_Memento_measurement_error_message(
                        urim, "timemap measures", measurename
                    ),
                    expected_error,
                    msg="unexpected error message for measure {} "
                    "with URI-M {}".format(measurename, urim),
                )
def test_empty_document_in_middle(self):
    """Verify cosine scoring when only the middle memento is empty.

    The first and last mementos share the same text while the second one
    has no text at all.  The cosine measure is expected to score the two
    full mementos at 1.0 and the empty one at 0.0.
    """
    # Own directory for this test; it previously shared
    # "/tmp/test_empty_documents" with test_empty_documents, so the two
    # tests could remove each other's data when run concurrently.
    working_directory = "/tmp/test_empty_document_in_middle"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    # Registered up front so the directory is removed even when an
    # assertion below fails.
    self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2",
    }

    timemap_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    full_html_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
    empty_html_document = b"<html><body></body></html>"

    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", full_html_document, headers)
    cm.addMemento("memento12", empty_html_document, headers)
    cm.addMemento("memento13", full_html_document, headers)

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    # Rather than dealing with empty documents, this throws
    # ValueError: empty vocabulary; perhaps the documents only contain stop words
    # it should handle the error gracefully, and this test confirms that it does
    mm = compute_cosine_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    expected_cosine_scores = (
        ("memento11", 1.0),
        ("memento12", 0.0),
        ("memento13", 1.0),
    )

    for urim, expected_score in expected_cosine_scores:
        self.assertAlmostEqual(
            mm.get_score("timemap1", urim, "timemap measures", "cosine"),
            expected_score,
            msg="unexpected cosine score for URI-M {}".format(urim),
        )
def test_handle_boilerplateremoval_error_due_to_empty_first_document(self):
    """Verify the error recorded when the first memento is zero bytes long.

    The first memento has no content at all (``b""``), while the other two
    contain text.  For both the jaccard and cosine measures, every memento
    in the TimeMap is expected to carry the "Boilerplate removal error with
    first memento in TimeMap" message.
    """
    working_directory = "/tmp/test_handle_boilerplateremoval_error"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    # Registered up front so the directory is removed even when an
    # assertion below fails.
    self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2",
    }

    timemap_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    full_html_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
    really_empty_document = b""

    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", really_empty_document, headers)
    cm.addMemento("memento12", full_html_document, headers)
    cm.addMemento("memento13", full_html_document, headers)

    # TODO: how to handle the empty document?

    expected_error = (
        "Boilerplate removal error with first memento in TimeMap, "
        "cannot effectively compare memento content"
    )
    urims = ("memento11", "memento12", "memento13")

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    # Debug aid: dump the score model once the first measure has run.
    pp.pprint(mm.scoremodel)

    for urim in urims:
        self.assertEqual(
            mm.get_Memento_measurement_error_message(
                urim, "timemap measures", "jaccard"
            ),
            expected_error,
            msg="unexpected jaccard error message for URI-M {}".format(urim),
        )

    mm = compute_cosine_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    for urim in urims:
        self.assertEqual(
            mm.get_Memento_measurement_error_message(
                urim, "timemap measures", "cosine"
            ),
            expected_error,
            msg="unexpected cosine error message for URI-M {}".format(urim),
        )
def test_cosine(self):
    """Regression test for the cosine measure on two differing mementos.

    The first memento holds the first 20 words of ``full_sentence`` and the
    second holds the rest, minus the final word.  The score for the second
    memento must differ from the "same document" score in ``same_scores``
    and must match a previously recorded regression value.
    """
    # Own directory for this test; it previously used
    # "/tmp/test_tf_intersection", a name belonging to a different measure.
    working_directory = "/tmp/test_cosine"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    # Registered up front so the directory is removed even when an
    # assertion below fails.
    self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2",
    }

    full_sentence = [
        'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog',
        'etaoin', 'shrdlu', 'Now', 'is', 'the', 'time', 'for', 'all', 'good',
        'men', 'to', 'come', 'to', 'the', 'aid', 'of', 'their', 'country',
        'Jived', 'fox', 'nymph', 'grabs', 'quick', 'waltz', 'Glib', 'jocks',
        'quiz', 'nymph', 'to', 'vex', 'dwarf', 'Sphinx', 'of', 'black',
        'quartz,', 'judge', 'my', 'vow', 'How', 'vexingly', 'quick', 'daft',
        'zebras', 'jump', 'The', 'five', 'boxing', 'wizards', 'jump',
        'quickly', 'Pack', 'my', 'box', 'with', 'five', 'dozen', 'liquor',
        'jugs',
    ]

    # NOTE: the [20:-1] slice leaves out the final word; the recorded
    # regression value below was produced with exactly these two documents.
    memcontent1 = bytes(
        "<html><body>{}</body></html>".format(" ".join(full_sentence[0:20])),
        "utf8",
    )
    memcontent2 = bytes(
        "<html><body>{}</body></html>".format(" ".join(full_sentence[20:-1])),
        "utf8",
    )

    timemap_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", memcontent1, headers)
    cm.addMemento("memento12", memcontent2, headers)

    mm = MeasureModel()
    mm = compute_cosine_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    memento12_score = mm.get_score(
        "timemap1", "memento12", "timemap measures", "cosine"
    )

    # The two documents differ, so the score must not be the one recorded
    # for identical documents.
    self.assertNotEqual(same_scores['cosine'], memento12_score)

    # The expected value for memento12 is a recorded regression value, not
    # a hand-derived one.  (An earlier comment here explained a score of 9
    # from overlapping terms; that described a different measure.)
    expected_scores = {
        'timemaps': {
            'timemap1': {
                'memento11': {
                    'timemap measures': {
                        'cosine': {
                            'comparison score': 1.0,
                        },
                    },
                },
                'memento12': {
                    'timemap measures': {
                        'cosine': {
                            'comparison score': 0.12882843018556128,
                        },
                    },
                },
            },
        },
    }

    # for regression
    self.assertAlmostEqual(
        expected_scores['timemaps']['timemap1']['memento12']['timemap measures']['cosine']['comparison score'],
        memento12_score,
    )
def test_all_mementos_same(self):
    """Verify every measure's score when all mementos in a TimeMap match.

    Two TimeMaps are built; within each one all three mementos have the
    same content.  After running every measure, each memento's score is
    compared against the matching entry in ``same_scores``.
    """
    working_directory = "/tmp/test_all_mementos_same"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    # Registered up front so the directory is removed even when an
    # assertion below fails.
    self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2",
    }

    # One document per TimeMap; every memento of a TimeMap gets the same one.
    contents = [
        b"<html><body>Content1 is wonderful</body></html>",
        b"<html><body>Content2 is great</body></html>",
    ]

    timemap1_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    timemap2_content = """<original1>; rel="original", <timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT", <memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT", <memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT" """

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # Pair each TimeMap with its document in list order.
    for urit, content in zip(urits, contents):
        timemap = cm.getTimeMap(urit)

        for memento in timemap["mementos"]["list"]:
            cm.addMemento(memento["uri"], content, headers)

    # (compute function, tokenize, stemming), run in this order.
    measure_runs = (
        (compute_bytecount_across_TimeMap, False, False),
        (compute_wordcount_across_TimeMap, True, True),
        (compute_jaccard_across_TimeMap, True, True),
        (compute_cosine_across_TimeMap, True, True),
        (compute_sorensen_across_TimeMap, True, True),
        (compute_levenshtein_across_TimeMap, True, True),
        (compute_nlevenshtein_across_TimeMap, True, True),
        (compute_tfintersection_across_TimeMap, True, True),
        (compute_rawsimhash_across_TimeMap, False, False),
        (compute_tfsimhash_across_TimeMap, True, True),
        (compute_gensim_lsi_across_TimeMap, True, True),
        (compute_gensim_lda_across_TimeMap, True, True),
    )

    mm = MeasureModel()
    for compute_measure, tokenize, stemming in measure_runs:
        mm = compute_measure(cm, mm, tokenize=tokenize, stemming=stemming)

    expected_urims = (
        ("timemap1", ("memento11", "memento12", "memento13")),
        ("timemap2", ("memento21", "memento22", "memento23")),
    )

    for urit, _ in expected_urims:
        self.assertIn(urit, mm.get_TimeMap_URIs())

    for urit, urims in expected_urims:
        for urim in urims:
            self.assertIn(urim, mm.get_Memento_URIs_in_TimeMap(urit))

    for measure in same_scores:

        print("evaluating measure {}".format(measure))

        for urit in mm.get_TimeMap_URIs():

            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                # LDA does not appear to be deterministic
                if measure == "gensim_lda":
                    self.assertGreaterEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        msg="measure {} does not compute the correct score "
                        "for document sameness with URI-M {}".format(measure, urim)
                    )
                else:
                    self.assertAlmostEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        msg="measure {} does not compute the correct score "
                        "for document sameness with URI-M {}".format(measure, urim)
                    )