def test_string_not_bytes_memento(self):
    """Store a memento keyed by a plain-string URI-M, then verify the
    stored bytes round-trip and the expected backing files appear."""
    working_directory = "/tmp/collectionmodel_test/test_mementos"
    memento_directory = "{}/mementos/".format(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)
    self.assertIsNotNone(cm, "CollectionModel failed to instantiate")

    urim = "testing-storage:memento1"
    body = b"<html><body>mementotext</body></html>"
    request_headers = {
        "header1": "value1",
        "header2": "value2",
        "memento-datetime": "value3"
    }

    cm.addMemento(urim, body, request_headers)
    self.assertEqual(cm.getMementoContent(urim), body)

    # On-disk file names derive from the SHA3-256 digest of the URI-M.
    digest = hashlib.sha3_256(bytes(urim, "utf8")).hexdigest()
    self.check_fileobjects_exist([
        "{}/{}_headers.json".format(memento_directory, digest),
        "{}/{}.orig".format(memento_directory, digest)
    ])

    shutil.rmtree(working_directory)
def test_empty_documents(self):
    """Every memento in the TimeMap is empty after processing.

    Each TimeMap-based measure should record a measurement error message
    for every memento rather than crash or emit a score.

    Fix: ``assertEquals`` (a deprecated alias removed in Python 3.12) is
    replaced with ``assertEqual``.
    """
    working_directory = "/tmp/test_empty_documents"

    # start from a clean slate in case a previous run left data behind
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = { "key1": "value1", "key2": "value2" }

    timemap_content ="""<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    empty_html_document = b"<html><body></body></html>"

    # if the first document is empty and all subsequent docs are empty,
    # then we are still on-topic, but this is to be debated
    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", empty_html_document, headers)
    cm.addMemento("memento12", empty_html_document, headers)
    cm.addMemento("memento13", empty_html_document, headers)

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)
    mm = compute_cosine_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)
    mm = compute_gensim_lsi_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_gensim_lda_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    # Every memento, for every measure, should carry the empty-first-memento
    # error message.
    for urit in mm.get_TimeMap_URIs():
        for urim in mm.get_Memento_URIs_in_TimeMap(urit):
            for measurename in ["cosine", "jaccard", "gensim_lda", "gensim_lsi"]:
                # was assertEquals: deprecated, removed in Python 3.12
                self.assertEqual(
                    mm.get_Memento_measurement_error_message(urim, "timemap measures", measurename),
                    "After processing content, the first memento in TimeMap is now empty, cannot effectively compare memento content"
                )

    shutil.rmtree(working_directory)
def test_empty_document_in_middle(self):
    """One empty memento sandwiched between two identical full mementos.

    compute_cosine_across_TimeMap used to fail with
    ``ValueError: empty vocabulary; perhaps the documents only contain stop words``
    on empty documents; this confirms it now handles the error gracefully,
    scoring the empty memento 0.0 and the identical ones 1.0.
    """
    workdir = "/tmp/test_empty_documents"
    if os.path.exists(workdir):
        shutil.rmtree(workdir)

    cm = collectionmodel.CollectionModel(working_directory=workdir)

    request_headers = { "key1": "value1", "key2": "value2" }

    timemap_content ="""<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """
    cm.addTimeMap("timemap1", timemap_content, request_headers)

    filled = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
    hollow = b"<html><body></body></html>"
    for urim, document in (("memento11", filled),
                           ("memento12", hollow),
                           ("memento13", filled)):
        cm.addMemento(urim, document, request_headers)

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)
    mm = compute_cosine_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)

    # first and last mementos match the first memento exactly; the
    # middle, empty one does not
    for urim, expected in (("memento11", 1.0),
                           ("memento12", 0.0),
                           ("memento13", 1.0)):
        self.assertAlmostEqual(
            mm.get_score("timemap1", urim, "timemap measures", "cosine"),
            expected
        )

    shutil.rmtree(workdir)
def test_all_measures_same(self):
    """Run the collection-wide jaccard and sorensen measures over two
    TimeMaps whose mementos are identical within each TimeMap.

    Fix: this test previously never removed its working directory,
    leaking ``/tmp/test_all_mementos_same`` between runs — a path it also
    shares with test_all_mementos_same. It now cleans up like every
    sibling test.
    """
    working_directory = "/tmp/test_all_mementos_same"

    # start from a clean slate in case a previous run left data behind
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(
        working_directory=working_directory)

    headers = {"key1": "value1", "key2": "value2"}

    contents = []
    contents.append(b"<html><body>Content1 is wonderful</body></html>")
    contents.append(b"<html><body>Content2 is great</body></html>")

    timemap1_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    timemap2_content = """<original1>; rel="original", <timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT", <memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT", <memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT" """

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # give every memento within a TimeMap identical content
    for i in range(0, 2):
        timemap = cm.getTimeMap(urits[i])
        for memento in timemap["mementos"]["list"]:
            urim = memento["uri"]
            cm.addMemento(urim, contents[i], headers)

    mm = MeasureModel()

    # NOTE: "accross" is the spelling used by the library's public API;
    # do not "fix" it here.
    mm = compute_jaccard_accross_collection(cm, mm)
    mm = compute_sorensen_accross_collection(cm, mm)

    pp.pprint(mm.generate_dict())

    # clean up so repeated runs (and tests sharing this path) start fresh
    shutil.rmtree(working_directory)
def test_missing_memento(self):
    """Requesting content for a URI-M that was never stored raises
    CollectionModelNoSuchMementoException."""
    workdir = "/tmp/collectionmodel_test/test_missing_memento"
    cm = collectionmodel.CollectionModel(working_directory=workdir)

    with self.assertRaises(
            collectionmodel.CollectionModelNoSuchMementoException):
        cm.getMementoContent("testing-storage:bad-memento")

    shutil.rmtree(workdir)
def test_memento_error_path(self):
    """Mementos recorded via addMementoError must raise on content/header
    access, expose their stored error information, and write the error
    files to disk; successfully stored mementos report no error.

    Fix: ``assertEquals`` (deprecated alias removed in Python 3.12)
    replaced with ``assertIsNone`` for the None comparison; stale
    commented-out debug logging removed.
    """
    working_directory = "/tmp/collectionmodel_test/test_memento_errors"
    memento_error_directory = "{}/memento_errors".format(working_directory)

    uri = "testing-storage:bad-memento1"
    # error-store file names derive from the SHA3-256 digest of the URI
    filename_digest = hashlib.sha3_256(bytes(uri, "utf8")).hexdigest()

    cm = collectionmodel.CollectionModel(
        working_directory=working_directory)

    headers = {"key1": "value1", "key2": "value2"}
    content = b"<html><body>404 Not Found</body></html>"
    errorinformation = b"ERROR MESSAGE"

    cm.addMementoError(uri, content, headers, errorinformation)

    files_to_check = [
        "{}/{}_error_info.txt".format(memento_error_directory, filename_digest),
        "{}/{}_headers.json".format(memento_error_directory, filename_digest),
        "{}/{}.orig".format(memento_error_directory, filename_digest)
    ]

    # accessing an errored memento's content or headers must raise
    self.assertRaises(collectionmodel.CollectionModelMementoErrorException,
        cm.getMementoContent, uri)
    self.assertRaises(collectionmodel.CollectionModelMementoErrorException,
        cm.getMementoHeaders, uri)

    self.assertEqual(cm.getMementoErrorInformation(uri), errorinformation)

    # a successfully stored memento carries no error information
    uri = "testing-storage:good-memento1"
    content = b"<html><body>It works!</body></html>"
    cm.addMemento(uri, content, headers)
    # was assertEquals(..., None): deprecated alias, removed in Python 3.12
    self.assertIsNone(cm.getMementoErrorInformation(uri))

    self.check_fileobjects_exist(files_to_check)

    shutil.rmtree(working_directory)
def test_directory_creation_happy_path(self):
    """Instantiating a CollectionModel must create the on-disk storage
    layout: the working directory, the timemaps/mementos subdirectories,
    and their metadata.csv files."""
    workdir = "/tmp/collectionmodel_test/test_directory_creation"

    collectionmodel.CollectionModel(working_directory=workdir)

    expected_paths = [workdir]
    for subpath in ("timemaps", "mementos",
                    "timemaps/metadata.csv", "mementos/metadata.csv"):
        expected_paths.append("{}/{}".format(workdir, subpath))

    self.check_fileobjects_exist(expected_paths)

    shutil.rmtree(workdir)
def test_mementos_happy_path(self):
    """Round-trip a memento through storage as bytes.

    Input and output are both bytes so the libraries consuming
    CollectionModel never hit:

        TypeError: write() argument must be str, not bytes

    Also verifies boilerplate removal and the on-disk file layout.
    """
    workdir = "/tmp/collectionmodel_test/test_mementos"
    store_dir = "{}/mementos/".format(workdir)

    cm = collectionmodel.CollectionModel(working_directory=workdir)
    self.assertIsNotNone(cm, "CollectionModel failed to instantiate")

    urim = "testing-storage:memento1"
    body = b"<html><body>mementotext</body></html>"
    request_headers = {
        "header1": "value1",
        "header2": "value2",
        "memento-datetime": "value3"
    }

    cm.addMemento(urim, body, request_headers)

    self.assertEqual(cm.getMementoContent(urim), body)
    self.assertEqual(
        cm.getMementoContentWithoutBoilerplate(urim), b"mementotext\n")

    # on-disk file names derive from the SHA3-256 digest of the URI-M
    digest = hashlib.sha3_256(bytes(urim, "utf8")).hexdigest()
    self.check_fileobjects_exist([
        "{}/{}_headers.json".format(store_dir, digest),
        "{}/{}.orig".format(store_dir, digest)
    ])

    shutil.rmtree(workdir)
def test_single_memento(self):
    """A TimeMap containing a single memento: first, last, and the list
    entry must all refer to that one memento."""
    workdir = "/tmp/collectionmodel_test/test_single_memento"
    if os.path.exists(workdir):
        shutil.rmtree(workdir)

    cm = collectionmodel.CollectionModel(working_directory=workdir)

    request_headers = {"key1": "value1", "key2": "value2"}
    timemap_content = """<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first last memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT" """
    cm.addTimeMap("timemap1", timemap_content, request_headers)

    urit_list = cm.getTimeMapURIList()
    pp.pprint(urit_list)
    self.assertEqual(len(urit_list), 1)
    self.assertTrue("timemap1" in urit_list)

    mementos = cm.getTimeMap("timemap1")["mementos"]
    self.assertEqual("memento11", mementos["first"]["uri"])
    self.assertEqual("memento11", mementos["last"]["uri"])
    self.assertEqual(len(mementos["list"]), 1)
    self.assertEqual(mementos["list"][0]["uri"], "memento11")

    shutil.rmtree(workdir)
def test_all_mementos_same(self):
    """Run every TimeMap measure over two TimeMaps whose mementos are
    identical within each TimeMap, and verify each measure reports the
    expected "documents are the same" score from the module-level
    ``same_scores`` table."""
    working_directory = "/tmp/test_all_mementos_same"

    # start from a clean slate in case a previous run left data behind
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = { "key1": "value1", "key2": "value2" }

    contents = []
    contents.append(b"<html><body>Content1 is wonderful</body></html>")
    contents.append(b"<html><body>Content2 is great</body></html>")

    timemap1_content ="""<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    timemap2_content ="""<original1>; rel="original", <timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT", <memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT", <memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT" """

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # give every memento within a TimeMap identical content, so every
    # comparison against the first memento should score as "same"
    for i in range(0, 2):
        timemap = cm.getTimeMap(urits[i])
        for memento in timemap["mementos"]["list"]:
            urim = memento["uri"]
            cm.addMemento(urim, contents[i], headers)

    mm = MeasureModel()

    mm = compute_bytecount_across_TimeMap(
        cm, mm, tokenize=False, stemming=False
    )
    mm = compute_wordcount_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_cosine_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_sorensen_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_levenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_nlevenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_tfintersection_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_rawsimhash_across_TimeMap(
        cm, mm, tokenize=False, stemming=False
    )
    mm = compute_tfsimhash_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_gensim_lsi_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_gensim_lda_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    # sanity-check that the MeasureModel recorded every URI-T and URI-M
    self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
    self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )
    self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

    for measure in same_scores:

        print("evaluating measure {}".format(measure))

        for urit in mm.get_TimeMap_URIs():
            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                # LDA does not appear to be deterministic
                if measure == "gensim_lda":
                    self.assertGreaterEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        msg="measure {} does not compute the correct score "
                        "for document sameness with URI-M {}".format(measure, urim)
                    )
                else:
                    self.assertAlmostEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        msg="measure {} does not compute the correct score "
                        "for document sameness with URI-M {}".format(measure, urim)
                    )

    shutil.rmtree(working_directory)
def test_all_mementos_different(self):
    """Run the deterministic TimeMap measures over two TimeMaps whose
    mementos all differ, checking that (a) only first-memento
    self-comparisons score as "same" and (b) every comparison matches the
    recorded regression values in ``expected_scores``."""
    working_directory = "/tmp/test_all_mementos_different"

    # start from a clean slate in case a previous run left data behind
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = { "key1": "value1", "key2": "value2" }

    timemap1_content ="""<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """

    timemap2_content ="""<original1>; rel="original", <timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT", <memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT", <memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT" """

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # see: https://en.wikipedia.org/wiki/Pangram
    full_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over',
        'the', 'lazy', 'dog', 'etaoin', 'shrdlu', 'Now','is', 'the',
        'time', 'for', 'all', 'good', 'men', 'to', 'come', 'to', 'the',
        'aid', 'of', 'their', 'country', 'Jived', 'fox', 'nymph',
        'grabs', 'quick', 'waltz', 'Glib', 'jocks', 'quiz', 'nymph',
        'to', 'vex', 'dwarf', 'Sphinx', 'of', 'black', 'quartz,',
        'judge', 'my', 'vow', 'How', 'vexingly', 'quick', 'daft',
        'zebras', 'jump', 'The', 'five', 'boxing', 'wizards', 'jump',
        'quickly', 'Pack', 'my', 'box', 'with', 'five', 'dozen',
        'liquor', 'jugs' ]

    # build a distinct document for each memento: its URI-M, then a
    # growing number of progressively shorter suffixes of the pangram
    # list, then its memento-datetime — so no two documents are identical
    for i in range(0, 2):

        timemap = cm.getTimeMap(urits[i])

        index = i + 1

        for memento in timemap["mementos"]["list"]:

            index += 1

            urim = memento["uri"]
            mdt = memento["datetime"]

            innercontent = urim

            for j in range(0, index):
                innercontent += "\n" + " ".join(full_sentence[(i + j + index):]) + " "

            innercontent += "\n" + str(mdt)

            content = "<html><body>{}</body></html>".format(innercontent)

            cm.addMemento(urim, bytes(content, "utf8"), headers)

    mm = MeasureModel()

    mm = compute_bytecount_across_TimeMap(
        cm, mm, tokenize=False, stemming=False
    )
    mm = compute_wordcount_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    # mm = compute_cosine_across_TimeMap(
    #     cm, scores=scores, stemming=True
    # )
    mm = compute_sorensen_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_levenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    mm = compute_nlevenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )
    # mm = compute_tfintersection_across_TimeMap(
    #     cm, scores=scores, tokenize=True, stemming=True
    # )
    # mm = compute_rawsimhash_across_TimeMap(
    #     cm, mm, tokenize=False, stemming=False
    # )

    # sanity-check that the MeasureModel recorded every URI-T and URI-M
    self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
    self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )
    self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

    # regression scores recorded from a known-good run
    expected_scores = { 'timemaps': {
        'timemap1': {
            'memento11': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.0,
                               'individual score': 723},
                'jaccard': { 'comparison score': 0.0},
                'levenshtein': { 'comparison score': 0},
                'nlevenshtein': { 'comparison score': 0.0},
                'sorensen': { 'comparison score': 0.0},
                'wordcount': { 'comparison score': 0.0,
                               'individual score': 94}}},
            'memento12': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.43015214384508993,
                               'individual score': 1034},
                'jaccard': { 'comparison score': 0.11363636363636365},
                'levenshtein': { 'comparison score': 45},
                'nlevenshtein': { 'comparison score': 0.3333333333333333},
                'sorensen': { 'comparison score': 0.06024096385542166},
                'wordcount': { 'comparison score': 0.43617021276595747,
                               'individual score': 135}}},
            'memento13': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.8409405255878284,
                               'individual score': 1331},
                'jaccard': { 'comparison score': 0.15555555555555556},
                'levenshtein': { 'comparison score': 86},
                'nlevenshtein': { 'comparison score': 0.48863636363636365},
                'sorensen': { 'comparison score': 0.08433734939759041},
                'wordcount': { 'comparison score': 0.8723404255319149,
                               'individual score': 176}}}},
        'timemap2': {
            'memento21': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.0,
                               'individual score': 1019},
                'jaccard': { 'comparison score': 0.0},
                'levenshtein': { 'comparison score': 0},
                'nlevenshtein': { 'comparison score': 0.0},
                'sorensen': { 'comparison score': 0.0},
                'wordcount': { 'comparison score': 0.0,
                               'individual score': 133}}},
            'memento22': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.28655544651619236,
                               'individual score': 1311},
                'jaccard': { 'comparison score': 0.09302325581395354},
                'levenshtein': { 'comparison score': 45},
                'nlevenshtein': { 'comparison score': 0.25862068965517243},
                'sorensen': { 'comparison score': 0.04878048780487809},
                'wordcount': { 'comparison score': 0.30827067669172936,
                               'individual score': 174}}},
            'memento23': { 'timemap measures': {
                'bytecount': { 'comparison score': 0.5593719332679097,
                               'individual score': 1589},
                'jaccard': { 'comparison score': 0.13636363636363635},
                'levenshtein': { 'comparison score': 86},
                'nlevenshtein': { 'comparison score': 0.4056603773584906},
                'sorensen': { 'comparison score': 0.07317073170731703},
                'wordcount': { 'comparison score': 0.593984962406015,
                               'individual score': 212}}}}}}

    for measure in same_scores:

        # we'll have to test TF intersection separately,
        # the way that I build the sentences does not
        # have enough different words
        if measure == "tfintersection" or measure == "cosine" or \
            measure == "raw_simhash" or measure == "tf_simhash" or \
            measure == "gensim_lda" or measure == "gensim_lsi":
            continue

        for urit in mm.get_TimeMap_URIs():
            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                # comparisons with themselves should match
                if urim == "memento11" or urim == "memento21":
                    self.assertEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        "measure {} does not compute the correct score "
                        "for document sameness".format(measure)
                    )
                else:
                    self.assertNotEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        "measure {} does not compute the correct score "
                        "for document differentness for URI-M {}".format(
                            measure, urim)
                    )

                # for regression
                self.assertAlmostEqual(
                    mm.get_score(urit, urim, "timemap measures", measure),
                    expected_scores["timemaps"][urit][urim]["timemap measures"][measure]["comparison score"],
                    msg="measure {} does not compute the expected score "
                    "for URI-M {}".format(measure, urim)
                )

    shutil.rmtree(working_directory)
def test_timemaps_happy_path(self):
    """Store a raw link-format TimeMap and verify that it parses into the
    expected dictionary and that the backing files are written."""
    working_directory = "/tmp/collectionmodel_test/test_timemaps"
    timemap_directory = "{}/timemaps/".format(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)
    self.assertIsNotNone(cm, "CollectionModel failed to instantiate")

    testtimemapheaders = {"header1": "value1", "header2": "value2"}

    # raw link-format TimeMap as it would arrive over HTTP
    testtimemap2 = """<http://a.example.org>;rel="original", <http://arxiv.example.net/timemap/http://a.example.org> ; rel="self";type="application/link-format" ; from="Tue, 20 Jun 2000 18:02:59 GMT" ; until="Wed, 09 Apr 2008 20:30:51 GMT", <http://arxiv.example.net/timegate/http://a.example.org> ; rel="timegate", <http://arxiv.example.net/web/20000620180259/http://a.example.org> ; rel="first memento";datetime="Tue, 20 Jun 2000 18:02:59 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/", <http://arxiv.example.net/web/20091027204954/http://a.example.org> ; rel="last memento";datetime="Tue, 27 Oct 2009 20:49:54 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/", <http://arxiv.example.net/web/20000621011731/http://a.example.org> ; rel="memento";datetime="Wed, 21 Jun 2000 01:17:31 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/", <http://arxiv.example.net/web/20000621044156/http://a.example.org> ; rel="memento";datetime="Wed, 21 Jun 2000 04:41:56 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/", """

    # the parsed form expected back from getTimeMap
    testtimemap2dict = {
        "original_uri": "http://a.example.org",
        "timegate_uri": "http://arxiv.example.net/timegate/http://a.example.org",
        "timemap_uri": {
            "link_format": "http://arxiv.example.net/timemap/http://a.example.org"
        },
        "mementos": {
            "first": {
                "datetime": datetime(2000, 6, 20, 18, 2, 59),
                "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
            },
            "last": {
                "datetime": datetime(2009, 10, 27, 20, 49, 54),
                "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
            },
            "list": [{
                "datetime": datetime(2000, 6, 20, 18, 2, 59),
                "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
            },
            {
                "datetime": datetime(2009, 10, 27, 20, 49, 54),
                "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
            },
            {
                "datetime": datetime(2000, 6, 21, 1, 17, 31),
                "uri": "http://arxiv.example.net/web/20000621011731/http://a.example.org"
            },
            {
                "datetime": datetime(2000, 6, 21, 4, 41, 56),
                "uri": "http://arxiv.example.net/web/20000621044156/http://a.example.org"
            }]
        }
    }

    testurit2 = "testing-storage:timemap2"
    # on-disk file names derive from the SHA3-256 digest of the URI-T
    testurit2filename_digest = hashlib.sha3_256(bytes(testurit2, "utf8")).hexdigest()

    files_to_check = [
        "{}/{}_headers.json".format(timemap_directory, testurit2filename_digest),
        "{}/{}.json".format(timemap_directory, testurit2filename_digest),
        "{}/{}.orig".format(timemap_directory, testurit2filename_digest)
    ]

    cm.addTimeMap(testurit2, testtimemap2, testtimemapheaders)

    self.assertEqual(cm.getTimeMap(testurit2), testtimemap2dict)

    self.check_fileobjects_exist(files_to_check)

    shutil.rmtree(working_directory)
def test_problematic_timemap(self):
    """Regression test: this particular Archive-It TimeMap previously
    caused parsing trouble; it must parse into the expected dictionary."""
    # raw link-format TimeMap captured from Archive-It
    timemapcontent = """<http://digitalinnovations.ucla.edu/>; rel="original", <http://wayback.archive-it.org/7877/timemap/link/http://digitalinnovations.ucla.edu/>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2017 15:45:06 GMT"; until="Tue, 21 Mar 2017 15:45:12 GMT", <http://wayback.archive-it.org/7877/http://digitalinnovations.ucla.edu/>; rel="timegate", <http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/>; rel="first memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT", <http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/>; rel="last memento"; datetime="Tue, 21 Mar 2017 15:45:12 GMT" """

    # the parsed form expected back from getTimeMap
    expectedtimemapdict = {
        "original_uri": "http://digitalinnovations.ucla.edu/",
        "timegate_uri": "http://wayback.archive-it.org/7877/http://digitalinnovations.ucla.edu/",
        "timemap_uri": {
            "link_format": "http://wayback.archive-it.org/7877/timemap/link/http://digitalinnovations.ucla.edu/"
        },
        "mementos": {
            "first": {
                "datetime": datetime(2017, 3, 21, 15, 45, 6),
                "uri": "http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/"
            },
            "last": {
                "datetime": datetime(2017, 3, 21, 15, 45, 12),
                "uri": "http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/"
            },
            "list": [{
                "datetime": datetime(2017, 3, 21, 15, 45, 6),
                "uri": "http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/"
            },
            {
                "datetime": datetime(2017, 3, 21, 15, 45, 12),
                "uri": "http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/"
            }]
        }
    }

    test_directory = "/tmp/collectionmodel_test"

    if not os.path.exists(test_directory):
        os.makedirs(test_directory)

    working_directory = "{}/test_problematic_timemap".format(
        test_directory)

    cm = collectionmodel.CollectionModel(
        working_directory=working_directory)

    urit = "testing1"

    testtimemapheaders = {"header1": "value1", "header2": "value2"}

    cm.addTimeMap(urit, timemapcontent, testtimemapheaders)

    self.assertEqual(expectedtimemapdict, cm.getTimeMap(urit))

    shutil.rmtree(test_directory)
def test_data_load(self):
    """Unpack a pre-built collection from a zip fixture and verify that a
    CollectionModel pointed at it loads memento content, headers,
    TimeMaps, and memento error information correctly."""
    # working_directory="{}/testdata/test_loaddata".format(
    #     os.path.dirname(os.path.realpath(__file__))
    # )

    # the zip fixture lives next to this test module
    testdatafile = "{}/testdata/test_loaddata.zip".format(
        os.path.dirname(os.path.realpath(__file__)))

    test_directory = "/tmp/collectionmodel_test"

    if not os.path.exists(test_directory):
        os.makedirs(test_directory)

    working_directory = "{}/test_loaddata".format(test_directory)

    # extract the fixture collection into the working area
    zipref = zipfile.ZipFile(testdatafile, 'r')
    zipref.extractall(test_directory)
    zipref.close()

    badurim = "testing-storage:bad-memento1"
    # badurim_headers = {
    #     "key1": "value1",
    #     "key2": "value2"
    # }
    # badurim_content = b"<html><body>404 Not Found</body></html>"
    errorinformation = b"ERROR MESSAGE"

    testurim = "testing-storage:memento1"
    testmemheaders = {
        "header1": "value1",
        "header2": "value2",
        "memento-datetime": "value3"
    }
    testmemcontent = b"<html><body>mementotext</body></html>"

    testtimemapheaders = {"header1": "value1", "header2": "value2"}
    testurit = "testing-storage:timemap2"

    # the parsed TimeMap expected from the fixture data
    testtimemapdict = {
        "original_uri": "http://a.example.org",
        "timegate_uri": "http://arxiv.example.net/timegate/http://a.example.org",
        "timemap_uri": {
            "link_format": "http://arxiv.example.net/timemap/http://a.example.org"
        },
        "mementos": {
            "first": {
                "datetime": datetime(2000, 6, 20, 18, 2, 59),
                "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
            },
            "last": {
                "datetime": datetime(2009, 10, 27, 20, 49, 54),
                "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
            },
            "list": [{
                "datetime": datetime(2000, 6, 20, 18, 2, 59),
                "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
            },
            {
                "datetime": datetime(2009, 10, 27, 20, 49, 54),
                "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
            },
            {
                "datetime": datetime(2000, 6, 21, 1, 17, 31),
                "uri": "http://arxiv.example.net/web/20000621011731/http://a.example.org"
            },
            {
                "datetime": datetime(2000, 6, 21, 4, 41, 56),
                "uri": "http://arxiv.example.net/web/20000621044156/http://a.example.org"
            }]
        }
    }

    cm = collectionmodel.CollectionModel(
        working_directory=working_directory)

    self.assertEqual(testmemcontent, cm.getMementoContent(testurim))
    self.assertEqual(testmemheaders, cm.getMementoHeaders(testurim))
    self.assertEqual([testurim], cm.getMementoURIList())

    self.maxDiff = None  # show complete diffs for the large TimeMap dict
    self.assertEqual(testtimemapdict, cm.getTimeMap(testurit))
    self.assertEqual(testtimemapheaders, cm.getTimeMapHeaders(testurit))
    self.assertEqual([testurit], cm.getTimeMapURIList())

    self.assertEqual(cm.getMementoErrorInformation(badurim), errorinformation)

    shutil.rmtree(test_directory)
def test_handle_boilerplateremoval_error_due_to_empty_first_document(self):
    """The first memento is a zero-byte document, so boilerplate removal
    fails on it; every memento in the TimeMap should then carry the
    boilerplate-removal error message for both jaccard and cosine."""
    workdir = "/tmp/test_handle_boilerplateremoval_error"
    if os.path.exists(workdir):
        shutil.rmtree(workdir)

    cm = collectionmodel.CollectionModel(working_directory=workdir)

    request_headers = { "key1": "value1", "key2": "value2" }

    timemap_content ="""<original1>; rel="original", <timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT", <timegate1>; rel="timegate", <memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT", <memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT", <memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT" """
    cm.addTimeMap("timemap1", timemap_content, request_headers)

    filled_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
    zero_byte_document = b""
    cm.addMemento("memento11", zero_byte_document, request_headers)
    cm.addMemento("memento12", filled_document, request_headers)
    cm.addMemento("memento13", filled_document, request_headers)

    # TODO: how to handle the empty document?
    expected_message = "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
    urims = ("memento11", "memento12", "memento13")

    mm = MeasureModel()
    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)
    pp.pprint(mm.scoremodel)

    for urim in urims:
        self.assertEqual(
            mm.get_Memento_measurement_error_message(urim, "timemap measures", "jaccard"),
            expected_message
        )

    mm = compute_cosine_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)

    for urim in urims:
        self.assertEqual(
            mm.get_Memento_measurement_error_message(urim, "timemap measures", "cosine"),
            expected_message
        )

    shutil.rmtree(workdir)
def test_raw_simhash(self):
    """Exercise compute_rawsimhash_across_TimeMap on two mementos with
    different content.

    The first memento in a TimeMap is compared against itself, so its
    raw_simhash score is 0 by construction; the second memento holds
    different text and must score differently.  The second memento's
    score is also pinned to the regression value 36.
    """

    working_directory = "/tmp/test_raw_simhash"

    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    headers = {
        "key1": "value1",
        "key2": "value2"
    }

    full_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the',
        'lazy', 'dog', 'etaoin', 'shrdlu', 'Now', 'is', 'the', 'time', 'for',
        'all', 'good', 'men', 'to', 'come', 'to', 'the', 'aid', 'of', 'their',
        'country', 'Jived', 'fox', 'nymph', 'grabs', 'quick', 'waltz', 'Glib',
        'jocks', 'quiz', 'nymph', 'to', 'vex', 'dwarf', 'Sphinx', 'of',
        'black', 'quartz,', 'judge', 'my', 'vow', 'How', 'vexingly', 'quick',
        'daft', 'zebras', 'jump', 'The', 'five', 'boxing', 'wizards', 'jump',
        'quickly', 'Pack', 'my', 'box', 'with', 'five', 'dozen', 'liquor',
        'jugs'
    ]

    # NOTE(review): the second slice is [20:-1], which silently drops the
    # final word ('jugs'); changing it to [20:] would alter the pinned
    # regression score below, so it is kept as-is.
    memcontent1 = bytes("<html><body>{}</body></html>".format(
        " ".join(full_sentence[0:20])), "utf8")
    memcontent2 = bytes("<html><body>{}</body></html>".format(
        " ".join(full_sentence[20:-1])), "utf8")

    timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", memcontent1, headers)
    cm.addMemento("memento12", memcontent2, headers)

    mm = MeasureModel()
    mm = compute_rawsimhash_across_TimeMap(cm, mm, tokenize=None, stemming=True)

    # BUG FIX: the original asserted against `same_scores['raw_simhash']`,
    # a name never defined in this test (a copy-paste leftover), which
    # raised NameError.  The intent was that two different documents must
    # not score like identical ones; the first memento is compared against
    # itself (score 0), so use its score as the "identical" baseline.
    self.assertNotEqual(
        mm.get_score("timemap1", "memento11", "timemap measures", "raw_simhash"),
        mm.get_score("timemap1", "memento12", "timemap measures", "raw_simhash")
    )

    # The value 36 is a pinned regression score (the simhash Hamming
    # distance between the two documents), not derived from a word count.
    expected_scores = {
        'timemaps': {
            'timemap1': {
                'memento11': {
                    'timemap measures': {
                        'raw_simhash': {
                            'comparison score': 0}}},
                'memento12': {
                    'timemap measures': {
                        'raw_simhash': {
                            'comparison score': 36}}}}}}

    # for regression; scores are integers, so exact equality is correct
    self.assertEqual(
        expected_scores['timemaps']['timemap1']['memento12']['timemap measures']['raw_simhash']['comparison score'],
        mm.get_score("timemap1", "memento12", "timemap measures", "raw_simhash")
    )

    shutil.rmtree(working_directory)
def test_boilerplate_problem(self):
    """Boilerplate removal should succeed for a non-empty memento
    (yielding b"hi\\n") and raise
    CollectionModelBoilerPlateRemovalFailureException for a memento whose
    stored content is empty."""

    working_directory = "/tmp/collectionmodel_test/test_boilerplate_problem"

    # CONSISTENCY FIX: the sibling tests in this file remove a leftover
    # working directory before instantiating the model; this test did not,
    # so a failed earlier run could leave stale state behind and make
    # reruns nondeterministic.
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    testtimemap2 = """<http://a.example.org>;rel="original",
<http://arxiv.example.net/timemap/http://a.example.org> ; rel="self";type="application/link-format" ; from="Tue, 20 Jun 2000 18:02:59 GMT" ; until="Wed, 09 Apr 2008 20:30:51 GMT",
<http://arxiv.example.net/timegate/http://a.example.org> ; rel="timegate",
<http://arxiv.example.net/web/20000620180259/http://a.example.org> ; rel="first memento";datetime="Tue, 20 Jun 2000 18:02:59 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/",
<http://arxiv.example.net/web/20091027204954/http://a.example.org> ; rel="last memento";datetime="Tue, 27 Oct 2009 20:49:54 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/",
<http://arxiv.example.net/web/20000621011731/http://a.example.org> ; rel="memento";datetime="Wed, 21 Jun 2000 01:17:31 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/",
<http://arxiv.example.net/web/20000621044156/http://a.example.org> ; rel="memento";datetime="Wed, 21 Jun 2000 04:41:56 GMT" ; license="http://creativecommons.org/publicdomain/zero/1.0/",
"""

    testheaders = {"header1": "value1", "header2": "value2"}

    testurit2 = "testing-storage:timemap2"

    cm.addTimeMap(testurit2, testtimemap2, testheaders)

    testcontent = b"<html><body>hi</body></html>"

    cm.addMemento(
        "http://arxiv.example.net/web/20000620180259/http://a.example.org",
        testcontent, testheaders)
    cm.addMemento(
        "http://arxiv.example.net/web/20091027204954/http://a.example.org",
        testcontent, testheaders)
    cm.addMemento(
        "http://arxiv.example.net/web/20000621011731/http://a.example.org",
        testcontent, testheaders)
    # empty content deliberately stored to trigger the boilerplate failure
    cm.addMemento(
        "http://arxiv.example.net/web/20000621044156/http://a.example.org",
        b"", testheaders)

    self.assertEqual(
        cm.getMementoContentWithoutBoilerplate(
            "http://arxiv.example.net/web/20000620180259/http://a.example.org"
        ),
        b"hi\n")

    # no need to bind the return value just to placate a linter; the call
    # inside the assertRaises block is sufficient on its own
    with self.assertRaises(
            collectionmodel.CollectionModelBoilerPlateRemovalFailureException):
        cm.getMementoContentWithoutBoilerplate(
            "http://arxiv.example.net/web/20000621044156/http://a.example.org"
        )

    shutil.rmtree(working_directory)