def test_string_not_bytes_memento(self):
        """Store a bytes memento and verify it round-trips and reaches disk.

        The memento content is handed over as ``bytes`` and is expected to
        come back unchanged from ``getMementoContent``.  The test also expects
        a headers JSON file and an ``.orig`` file, both named after the
        SHA3-256 hex digest of the URI-M, inside the ``mementos`` directory.
        """

        base_dir = "/tmp/collectionmodel_test/test_mementos"
        mementos_dir = f"{base_dir}/mementos/"

        model = collectionmodel.CollectionModel(working_directory=base_dir)
        self.assertIsNotNone(model, "CollectionModel failed to instantiate")

        urim = "testing-storage:memento1"
        body = b"<html><body>mementotext</body></html>"
        response_headers = {
            "header1": "value1",
            "header2": "value2",
            "memento-datetime": "value3",
        }

        model.addMemento(urim, body, response_headers)
        self.assertEqual(model.getMementoContent(urim), body)

        # file names on disk are expected to be keyed by the URI-M digest
        digest = hashlib.sha3_256(urim.encode("utf8")).hexdigest()
        expected_paths = [
            f"{mementos_dir}/{digest}{suffix}"
            for suffix in ("_headers.json", ".orig")
        ]
        self.check_fileobjects_exist(expected_paths)

        shutil.rmtree(base_dir)
    def test_empty_documents(self):

        working_directory = "/tmp/test_empty_documents"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        empty_html_document = b"<html><body></body></html>"

        # if the first document is empty and all subsequent docs are empty, 
        # then we are still on-topic, but this is to be debated
        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", empty_html_document, headers)
        cm.addMemento("memento12", empty_html_document, headers)
        cm.addMemento("memento13", empty_html_document, headers)

        mm = MeasureModel()

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        mm = compute_gensim_lsi_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lda_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )
        
        for urit in mm.get_TimeMap_URIs():

            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                for measurename in ["cosine", "jaccard", "gensim_lda", "gensim_lsi"]:

                    self.assertEquals(
                        mm.get_Memento_measurement_error_message(urim, "timemap measures", measurename),
                        "After processing content, the first memento in TimeMap is now empty, cannot effectively compare memento content"
                    )

        shutil.rmtree(working_directory)
    def test_empty_document_in_middle(self):

        working_directory = "/tmp/test_empty_documents"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        full_html_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
        empty_html_document = b"<html><body></body></html>"

        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", full_html_document, headers)
        cm.addMemento("memento12", empty_html_document, headers)
        cm.addMemento("memento13", full_html_document, headers)

        mm = MeasureModel()

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        # Rather than dealing with empty documents, this throws
        # ValueError: empty vocabulary; perhaps the documents only contain stop words
        # it should handle the error gracefully, and this test confirms that it does
        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        self.assertAlmostEqual(
            mm.get_score("timemap1", "memento11", "timemap measures", "cosine"),
            1.0
        )

        self.assertAlmostEqual(
            mm.get_score("timemap1", "memento12", "timemap measures", "cosine"),
            0.0
        )

        self.assertAlmostEqual(
            mm.get_score("timemap1", "memento13", "timemap measures", "cosine"),
            1.0
        )

        shutil.rmtree(working_directory)
    # Example no. 4
    # 0
    def test_all_measures_same(self):

        working_directory = "/tmp/test_all_mementos_same"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        headers = {"key1": "value1", "key2": "value2"}

        contents = []

        contents.append(b"<html><body>Content1 is wonderful</body></html>")
        contents.append(b"<html><body>Content2 is great</body></html>")

        timemap1_content = """<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        timemap2_content = """<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

        cm.addTimeMap("timemap1", timemap1_content, headers)
        cm.addTimeMap("timemap2", timemap2_content, headers)

        urits = cm.getTimeMapURIList()

        for i in range(0, 2):

            timemap = cm.getTimeMap(urits[i])

            for memento in timemap["mementos"]["list"]:

                urim = memento["uri"]

                cm.addMemento(urim, contents[i], headers)

        mm = MeasureModel()

        mm = compute_jaccard_accross_collection(cm, mm)

        mm = compute_sorensen_accross_collection(cm, mm)

        pp.pprint(mm.generate_dict())
    def test_missing_memento(self):
        """Asking for the content of a URI-M that was never added must raise.

        Expects ``CollectionModelNoSuchMementoException`` from
        ``getMementoContent`` on a freshly created collection model.
        """

        scratch_dir = "/tmp/collectionmodel_test/test_missing_memento"

        model = collectionmodel.CollectionModel(working_directory=scratch_dir)

        expected_error = collectionmodel.CollectionModelNoSuchMementoException

        with self.assertRaises(expected_error):
            model.getMementoContent("testing-storage:bad-memento")

        shutil.rmtree(scratch_dir)
    def test_memento_error_path(self):
        """Check storage and retrieval of a memento recorded as an error.

        After ``addMementoError`` the test expects that:

        * ``getMementoContent`` and ``getMementoHeaders`` raise
          ``CollectionModelMementoErrorException`` for that URI-M,
        * ``getMementoErrorInformation`` returns the stored error bytes,
        * a successfully added memento reports no error information, and
        * the error info, headers and ``.orig`` files exist in the
          ``memento_errors`` directory, named after the SHA3-256 hex digest
          of the failing URI-M.
        """

        working_directory = "/tmp/collectionmodel_test/test_memento_errors"
        memento_error_directory = "{}/memento_errors".format(working_directory)

        bad_uri = "testing-storage:bad-memento1"

        filename_digest = hashlib.sha3_256(bytes(bad_uri, "utf8")).hexdigest()

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        headers = {"key1": "value1", "key2": "value2"}

        error_content = b"<html><body>404 Not Found</body></html>"

        errorinformation = b"ERROR MESSAGE"

        cm.addMementoError(bad_uri, error_content, headers, errorinformation)

        files_to_check = [
            "{}/{}_error_info.txt".format(memento_error_directory,
                                          filename_digest),
            "{}/{}_headers.json".format(memento_error_directory,
                                        filename_digest),
            "{}/{}.orig".format(memento_error_directory, filename_digest)
        ]

        self.assertRaises(collectionmodel.CollectionModelMementoErrorException,
                          cm.getMementoContent, bad_uri)

        self.assertRaises(collectionmodel.CollectionModelMementoErrorException,
                          cm.getMementoHeaders, bad_uri)

        self.assertEqual(
            cm.getMementoErrorInformation(bad_uri), errorinformation)

        good_uri = "testing-storage:good-memento1"
        good_content = b"<html><body>It works!</body></html>"

        cm.addMemento(good_uri, good_content, headers)

        # assertIsNone replaces assertEquals(..., None): the assertEquals
        # alias is deprecated and absent from newer Python releases
        self.assertIsNone(cm.getMementoErrorInformation(good_uri))

        self.check_fileobjects_exist(files_to_check)

        shutil.rmtree(working_directory)
    def test_directory_creation_happy_path(self):
        """Instantiating a CollectionModel should lay out its storage tree.

        Expects the working directory itself, the ``timemaps`` and
        ``mementos`` subdirectories, and a ``metadata.csv`` inside each of
        them to exist after construction.
        """

        root = "/tmp/collectionmodel_test/test_directory_creation"

        collectionmodel.CollectionModel(working_directory=root)

        relative_entries = (
            "timemaps",
            "mementos",
            "timemaps/metadata.csv",
            "mementos/metadata.csv",
        )

        expected_paths = [root]
        expected_paths.extend(f"{root}/{entry}" for entry in relative_entries)

        self.check_fileobjects_exist(expected_paths)

        shutil.rmtree(root)
    def test_mementos_happy_path(self):
        """
            Guard against this failure:

            TypeError: write() argument must be str, not bytes

            Memento content goes in as bytes and must come back as bytes,
            matching what the libraries that use CollectionModel expect.
        """

        base_dir = "/tmp/collectionmodel_test/test_mementos"
        mementos_dir = f"{base_dir}/mementos/"

        model = collectionmodel.CollectionModel(working_directory=base_dir)
        self.assertIsNotNone(model, "CollectionModel failed to instantiate")

        urim = "testing-storage:memento1"
        body = b"<html><body>mementotext</body></html>"
        response_headers = {
            "header1": "value1",
            "header2": "value2",
            "memento-datetime": "value3",
        }

        model.addMemento(urim, body, response_headers)

        # raw content round-trips unchanged ...
        self.assertEqual(model.getMementoContent(urim), body)

        # ... and the boilerplate-free variant is expected to be the bare
        # text followed by a newline
        self.assertEqual(
            model.getMementoContentWithoutBoilerplate(urim),
            b"mementotext\n")

        # file names on disk are expected to be keyed by the URI-M digest
        digest = hashlib.sha3_256(urim.encode("utf8")).hexdigest()
        expected_paths = [
            f"{mementos_dir}/{digest}{suffix}"
            for suffix in ("_headers.json", ".orig")
        ]
        self.check_fileobjects_exist(expected_paths)

        shutil.rmtree(base_dir)
    def test_single_memento(self):

        working_directory = "/tmp/collectionmodel_test/test_single_memento"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        headers = {"key1": "value1", "key2": "value2"}

        timemap_content = """<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first last memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT"
"""

        cm.addTimeMap("timemap1", timemap_content, headers)

        pp.pprint(cm.getTimeMapURIList())

        self.assertEqual(len(cm.getTimeMapURIList()), 1)

        self.assertTrue("timemap1" in cm.getTimeMapURIList())

        timemap = cm.getTimeMap("timemap1")

        self.assertEqual("memento11", timemap["mementos"]["first"]["uri"])
        self.assertEqual("memento11", timemap["mementos"]["last"]["uri"])

        self.assertEqual(len(timemap["mementos"]["list"]), 1)

        self.assertEqual(timemap["mementos"]["list"][0]["uri"], "memento11")

        shutil.rmtree(working_directory)
    def test_all_mementos_same(self):

        working_directory = "/tmp/test_all_mementos_same"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        contents = []

        contents.append(b"<html><body>Content1 is wonderful</body></html>")
        contents.append(b"<html><body>Content2 is great</body></html>")

        timemap1_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        timemap2_content ="""<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

        cm.addTimeMap("timemap1", timemap1_content, headers)
        cm.addTimeMap("timemap2", timemap2_content, headers)

        urits = cm.getTimeMapURIList()

        for i in range(0, 2):

            timemap = cm.getTimeMap(urits[i])

            for memento in timemap["mementos"]["list"]:
            
                urim = memento["uri"]

                cm.addMemento(urim, contents[i], headers)

        mm = MeasureModel()

        mm = compute_bytecount_across_TimeMap(
            cm, mm, tokenize=False, stemming=False
        )

        mm = compute_wordcount_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_sorensen_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_levenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_nlevenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_tfintersection_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_rawsimhash_across_TimeMap(
            cm, mm, tokenize=False, stemming=False
        )

        mm = compute_tfsimhash_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lsi_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lda_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
        self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )

        self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )

        self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

        for measure in same_scores:

            print("evaluating measure {}".format(measure))

            for urit in mm.get_TimeMap_URIs():

                for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                    # LDA does not appear to be deterministic
                    if measure == "gensim_lda":

                        self.assertGreaterEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            msg="measure {} does not compute the correct score "
                            "for document sameness with URI-M {}".format(measure, urim)                            
                        )

                    else:

                            self.assertAlmostEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            msg="measure {} does not compute the correct score "
                            "for document sameness with URI-M {}".format(measure, urim)
                        )

        shutil.rmtree(working_directory)
    def test_all_mementos_different(self):
        """Mementos with differing content must yield differing scores.

        Builds two TimeMaps of three mementos each and generates distinct
        content for every memento from a fixed word list.  The bytecount,
        wordcount, Jaccard, Sorensen, Levenshtein and normalized Levenshtein
        TimeMap measures are computed, then for each of those measures:

        * the first memento of each TimeMap (``memento11``, ``memento21``)
          must score exactly ``same_scores[measure]``,
        * every other memento must score something else, and
        * every score must match the hard-coded regression value in
          ``expected_scores`` to ``assertAlmostEqual`` precision.

        ``same_scores`` is defined outside this method.
        """

        working_directory = "/tmp/test_all_mementos_different"

        # start from a clean slate in case an earlier run left data behind
        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap1_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        timemap2_content ="""<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

        cm.addTimeMap("timemap1", timemap1_content, headers)
        cm.addTimeMap("timemap2", timemap2_content, headers)

        urits = cm.getTimeMapURIList()

        # word list used to synthesize memento content
        # see: https://en.wikipedia.org/wiki/Pangram
        full_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 
            'the', 'lazy', 'dog', 'etaoin', 'shrdlu', 'Now','is', 'the', 
            'time', 'for', 'all', 'good', 'men', 'to', 'come', 'to', 'the', 
            'aid', 'of', 'their', 'country',
            'Jived', 'fox', 'nymph', 'grabs', 'quick', 'waltz',
            'Glib', 'jocks', 'quiz', 'nymph', 'to', 'vex', 'dwarf',
            'Sphinx', 'of', 'black', 'quartz,', 'judge', 'my', 'vow',
            'How', 'vexingly', 'quick', 'daft', 'zebras', 'jump',
            'The', 'five', 'boxing', 'wizards', 'jump', 'quickly',
            'Pack', 'my', 'box', 'with', 'five', 'dozen', 'liquor', 'jugs'
            ]

        # NOTE: the regression values in expected_scores below depend on the
        # exact bytes generated here; changing this loop changes the scores.
        for i in range(0, 2):

            timemap = cm.getTimeMap(urits[i])
            # index is bumped before each memento is built, so the first
            # memento of TimeMap i is generated with index == i + 2
            index = i + 1

            for memento in timemap["mementos"]["list"]:

                index += 1

                urim = memento["uri"]
                mdt = memento["datetime"]

                # content starts with the URI-M itself ...
                innercontent = urim

                # ... followed by `index` lines, each a tail of the word list
                # that starts one word later than the previous line ...
                for j in range(0, index):
                    innercontent += "\n" + " ".join(full_sentence[(i + j + index):]) + " "

                # ... and ends with the memento's datetime
                innercontent += "\n" + str(mdt)

                content = "<html><body>{}</body></html>".format(innercontent)

                cm.addMemento(urim, bytes(content, "utf8"), headers)

        mm = MeasureModel()

        mm = compute_bytecount_across_TimeMap(
            cm, mm, tokenize=False, stemming=False
        )

        mm = compute_wordcount_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        # mm = compute_cosine_across_TimeMap(
        #     cm, scores=scores, stemming=True
        # )

        mm = compute_sorensen_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_levenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_nlevenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        # mm = compute_tfintersection_across_TimeMap(
        #     cm, scores=scores, tokenize=True, stemming=True
        # )

        # mm = compute_rawsimhash_across_TimeMap(
        #     cm, mm, tokenize=False, stemming=False
        # )

        # both TimeMaps and all six mementos must appear in the results
        self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
        self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )

        self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )

        self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

        # hard-coded regression values; only the 'comparison score' entries
        # are consulted by the checks below
        expected_scores = {   'timemaps': {   'timemap1': {   'memento11': {   'timemap measures': {   'bytecount': {   'comparison score': 0.0,
                                                                                              'individual score': 723},
                                                                             'jaccard': {   'comparison score': 0.0},
                                                                             'levenshtein': {   'comparison score': 0},
                                                                             'nlevenshtein': {   'comparison score': 0.0},
                                                                             'sorensen': {   'comparison score': 0.0},
                                                                             'wordcount': {   'comparison score': 0.0,
                                                                                              'individual score': 94}}},
                                                                             
                                    'memento12': {   'timemap measures': {   'bytecount': {   'comparison score': 0.43015214384508993,
                                                                                              'individual score': 1034},
                                                                             'jaccard': {   'comparison score': 0.11363636363636365},
                                                                             'levenshtein': {   'comparison score': 45},
                                                                             'nlevenshtein': {   'comparison score': 0.3333333333333333},
                                                                             'sorensen': {   'comparison score': 0.06024096385542166},
                                                                             'wordcount': {   'comparison score': 0.43617021276595747,
                                                                                              'individual score': 135}}},
                                    'memento13': {   'timemap measures': {   'bytecount': {   'comparison score': 0.8409405255878284,
                                                                                              'individual score': 1331},
                                                                             'jaccard': {   'comparison score': 0.15555555555555556},
                                                                             'levenshtein': {   'comparison score': 86},
                                                                             'nlevenshtein': {   'comparison score': 0.48863636363636365},
                                                                             'sorensen': {   'comparison score': 0.08433734939759041},
                                                                             'wordcount': {   'comparison score': 0.8723404255319149,
                                                                                              'individual score': 176}}}},
                    'timemap2': {   'memento21': {   'timemap measures': {   'bytecount': {   'comparison score': 0.0,
                                                                                              'individual score': 1019},
                                                                             'jaccard': {   'comparison score': 0.0},
                                                                             'levenshtein': {   'comparison score': 0},
                                                                             'nlevenshtein': {   'comparison score': 0.0},
                                                                             'sorensen': {   'comparison score': 0.0},
                                                                             'wordcount': {   'comparison score': 0.0,
                                                                                              'individual score': 133}}},
                                    'memento22': {   'timemap measures': {   'bytecount': {   'comparison score': 0.28655544651619236,
                                                                                              'individual score': 1311},
                                                                             'jaccard': {   'comparison score': 0.09302325581395354},
                                                                             'levenshtein': {   'comparison score': 45},
                                                                             'nlevenshtein': {   'comparison score': 0.25862068965517243},
                                                                             'sorensen': {   'comparison score': 0.04878048780487809},
                                                                             'wordcount': {   'comparison score': 0.30827067669172936,
                                                                                              'individual score': 174}}},
                                    'memento23': {   'timemap measures': {   'bytecount': {   'comparison score': 0.5593719332679097,
                                                                                              'individual score': 1589},
                                                                             'jaccard': {   'comparison score': 0.13636363636363635},
                                                                             'levenshtein': {   'comparison score': 86},
                                                                             'nlevenshtein': {   'comparison score': 0.4056603773584906},
                                                                             'sorensen': {   'comparison score': 0.07317073170731703},
                                                                             'wordcount': {   'comparison score': 0.593984962406015,
                                                                                              'individual score': 212}}}}}}

        for measure in same_scores:

            # we'll have to test TF intersection separately,
            # the way that I build the sentences does not
            # have enough different words
            # (the other measures skipped here were not computed above or
            # have no entry in expected_scores)
            if measure == "tfintersection" or measure == "cosine" or \
                measure == "raw_simhash" or measure == "tf_simhash" or \
                measure == "gensim_lda" or measure == "gensim_lsi":
                continue

            for urit in mm.get_TimeMap_URIs():

                for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                    # comparisons with themselves should match
                    if urim == "memento11" or urim == "memento21":
                        self.assertEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            "measure {} does not compute the correct score "
                            "for document sameness".format(measure)
                        )
                    else:
                        # any later memento has different content, so its
                        # score must differ from the "same" score
                        self.assertNotEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            "measure {} does not compute the correct score "
                            "for document differentness for URI-M {}".format(
                                measure, urim)
                        )

                    # for regression
                    self.assertAlmostEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            expected_scores["timemaps"][urit][urim]["timemap measures"][measure]["comparison score"],
                            msg="measure {} does not compute the expected score "
                            "for URI-M {}".format(measure, urim)
                    )

        shutil.rmtree(working_directory)
    def test_timemaps_happy_path(self):
        """Store a Link-format TimeMap and read it back as a parsed dict.

        Adds one TimeMap to a CollectionModel, then checks that
        ``getTimeMap`` returns the expected dictionary and that the
        ``_headers.json``, ``.json`` and ``.orig`` files named after the
        SHA3-256 digest of the URI-T exist in the ``timemaps`` directory.
        """

        working_directory = "/tmp/collectionmodel_test/test_timemaps"
        timemap_directory = "{}/timemaps/".format(working_directory)

        # Drop leftovers from an earlier aborted run, and register the removal
        # as a cleanup so it also happens when an assertion below fails.
        shutil.rmtree(working_directory, ignore_errors=True)
        self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        self.assertIsNotNone(cm, "CollectionModel failed to instantiate")

        testtimemapheaders = {"header1": "value1", "header2": "value2"}

        testtimemap2 = """<http://a.example.org>;rel="original",
        <http://arxiv.example.net/timemap/http://a.example.org>
        ; rel="self";type="application/link-format"
        ; from="Tue, 20 Jun 2000 18:02:59 GMT"
        ; until="Wed, 09 Apr 2008 20:30:51 GMT",
        <http://arxiv.example.net/timegate/http://a.example.org>
        ; rel="timegate",
        <http://arxiv.example.net/web/20000620180259/http://a.example.org>
        ; rel="first memento";datetime="Tue, 20 Jun 2000 18:02:59 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20091027204954/http://a.example.org>
        ; rel="last memento";datetime="Tue, 27 Oct 2009 20:49:54 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20000621011731/http://a.example.org>
        ; rel="memento";datetime="Wed, 21 Jun 2000 01:17:31 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20000621044156/http://a.example.org>
        ; rel="memento";datetime="Wed, 21 Jun 2000 04:41:56 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        """

        # The expected "list" follows the order of the entries in the TimeMap
        # text above (first, last, then the two plain mementos), which is not
        # chronological order.
        testtimemap2dict = {
            "original_uri": "http://a.example.org",
            "timegate_uri":
                "http://arxiv.example.net/timegate/http://a.example.org",
            "timemap_uri": {
                "link_format":
                    "http://arxiv.example.net/timemap/http://a.example.org"
            },
            "mementos": {
                "first": {
                    "datetime": datetime(2000, 6, 20, 18, 2, 59),
                    "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
                },
                "last": {
                    "datetime": datetime(2009, 10, 27, 20, 49, 54),
                    "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
                },
                "list": [
                    {
                        "datetime": datetime(2000, 6, 20, 18, 2, 59),
                        "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2009, 10, 27, 20, 49, 54),
                        "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2000, 6, 21, 1, 17, 31),
                        "uri": "http://arxiv.example.net/web/20000621011731/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2000, 6, 21, 4, 41, 56),
                        "uri": "http://arxiv.example.net/web/20000621044156/http://a.example.org"
                    },
                ]
            }
        }

        testurit2 = "testing-storage:timemap2"

        # The files checked below are named after the SHA3-256 hex digest of
        # the URI-T.
        testurit2filename_digest = hashlib.sha3_256(
            bytes(testurit2, "utf8")).hexdigest()

        files_to_check = [
            "{}/{}_headers.json".format(timemap_directory,
                                        testurit2filename_digest),
            "{}/{}.json".format(timemap_directory, testurit2filename_digest),
            "{}/{}.orig".format(timemap_directory, testurit2filename_digest)
        ]

        cm.addTimeMap(testurit2, testtimemap2, testtimemapheaders)

        self.assertEqual(cm.getTimeMap(testurit2), testtimemap2dict)

        self.check_fileobjects_exist(files_to_check)
    def test_problematic_timemap(self):

        timemapcontent = """<http://digitalinnovations.ucla.edu/>; rel="original",
<http://wayback.archive-it.org/7877/timemap/link/http://digitalinnovations.ucla.edu/>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2017 15:45:06 GMT"; until="Tue, 21 Mar 2017 15:45:12 GMT",
<http://wayback.archive-it.org/7877/http://digitalinnovations.ucla.edu/>; rel="timegate",
<http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/>; rel="first memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/>; rel="last memento"; datetime="Tue, 21 Mar 2017 15:45:12 GMT"
"""

        expectedtimemapdict = {
            "original_uri": "http://digitalinnovations.ucla.edu/",
            "timegate_uri":
            "http://wayback.archive-it.org/7877/http://digitalinnovations.ucla.edu/",
            "timemap_uri": {
                "link_format":
                "http://wayback.archive-it.org/7877/timemap/link/http://digitalinnovations.ucla.edu/"
            },
            "mementos": {
                "first": {
                    "datetime":
                    datetime(2017, 3, 21, 15, 45, 6),
                    "uri":
                    "http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/"
                },
                "last": {
                    "datetime":
                    datetime(2017, 3, 21, 15, 45, 12),
                    "uri":
                    "http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/"
                },
                "list": [{
                    "datetime":
                    datetime(2017, 3, 21, 15, 45, 6),
                    "uri":
                    "http://wayback.archive-it.org/7877/20170321154506/http://digitalinnovations.ucla.edu/"
                }, {
                    "datetime":
                    datetime(2017, 3, 21, 15, 45, 12),
                    "uri":
                    "http://wayback.archive-it.org/7877/20170321154512/http://digitalinnovations.ucla.edu/"
                }]
            }
        }

        test_directory = "/tmp/collectionmodel_test"

        if not os.path.exists(test_directory):
            os.makedirs(test_directory)

        working_directory = "{}/test_problematic_timemap".format(
            test_directory)

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        urit = "testing1"

        testtimemapheaders = {"header1": "value1", "header2": "value2"}

        cm.addTimeMap(urit, timemapcontent, testtimemapheaders)

        self.assertEqual(expectedtimemapdict, cm.getTimeMap(urit))

        shutil.rmtree(test_directory)
    def test_data_load(self):
        """Read back a collection that already exists on disk.

        Extracts the ``testdata/test_loaddata.zip`` fixture (located next to
        this test module) into the test directory, creates a CollectionModel
        on ``<test directory>/test_loaddata`` without adding anything to it,
        and checks the memento content and headers, the TimeMap and its
        headers, both URI lists and the stored memento error information.

        NOTE(review): assumes the archive unpacks into a ``test_loaddata``
        directory -- confirm against the fixture.
        """

        # show complete diffs if one of the dictionary comparisons fails
        self.maxDiff = None

        testdatafile = "{}/testdata/test_loaddata.zip".format(
            os.path.dirname(os.path.realpath(__file__)))

        test_directory = "/tmp/collectionmodel_test"
        working_directory = "{}/test_loaddata".format(test_directory)

        # Drop leftovers from an earlier aborted run so only fixture data is
        # loaded, and make sure the extraction target exists.
        shutil.rmtree(working_directory, ignore_errors=True)
        os.makedirs(test_directory, exist_ok=True)

        # As before, the whole test directory is removed afterwards; doing it
        # through addCleanup means it also happens when an assertion fails.
        self.addCleanup(shutil.rmtree, test_directory, ignore_errors=True)

        # the context manager closes the archive even if extraction raises
        with zipfile.ZipFile(testdatafile, 'r') as zipref:
            zipref.extractall(test_directory)

        badurim = "testing-storage:bad-memento1"

        errorinformation = b"ERROR MESSAGE"

        testurim = "testing-storage:memento1"

        testmemheaders = {
            "header1": "value1",
            "header2": "value2",
            "memento-datetime": "value3"
        }

        testmemcontent = b"<html><body>mementotext</body></html>"

        testtimemapheaders = {"header1": "value1", "header2": "value2"}

        testurit = "testing-storage:timemap2"

        testtimemapdict = {
            "original_uri": "http://a.example.org",
            "timegate_uri":
                "http://arxiv.example.net/timegate/http://a.example.org",
            "timemap_uri": {
                "link_format":
                    "http://arxiv.example.net/timemap/http://a.example.org"
            },
            "mementos": {
                "first": {
                    "datetime": datetime(2000, 6, 20, 18, 2, 59),
                    "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
                },
                "last": {
                    "datetime": datetime(2009, 10, 27, 20, 49, 54),
                    "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
                },
                "list": [
                    {
                        "datetime": datetime(2000, 6, 20, 18, 2, 59),
                        "uri": "http://arxiv.example.net/web/20000620180259/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2009, 10, 27, 20, 49, 54),
                        "uri": "http://arxiv.example.net/web/20091027204954/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2000, 6, 21, 1, 17, 31),
                        "uri": "http://arxiv.example.net/web/20000621011731/http://a.example.org"
                    },
                    {
                        "datetime": datetime(2000, 6, 21, 4, 41, 56),
                        "uri": "http://arxiv.example.net/web/20000621044156/http://a.example.org"
                    },
                ]
            }
        }

        # Nothing is added through the API here: everything asserted below
        # has to come from what is already in the working directory.
        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        self.assertEqual(testmemcontent, cm.getMementoContent(testurim))

        self.assertEqual(testmemheaders, cm.getMementoHeaders(testurim))

        self.assertEqual([testurim], cm.getMementoURIList())

        self.assertEqual(testtimemapdict, cm.getTimeMap(testurit))

        self.assertEqual(testtimemapheaders, cm.getTimeMapHeaders(testurit))

        self.assertEqual([testurit], cm.getTimeMapURIList())

        self.assertEqual(cm.getMementoErrorInformation(badurim),
                         errorinformation)
    def test_handle_boilerplateremoval_error_due_to_empty_first_document(self):

        working_directory = "/tmp/test_handle_boilerplateremoval_error"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        full_html_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
        really_empty_document = b""

        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", really_empty_document, headers)
        cm.addMemento("memento12", full_html_document, headers)
        cm.addMemento("memento13", full_html_document, headers)

        # TODO: how to handle the empty document?

        mm = MeasureModel()

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        pp.pprint(mm.scoremodel)

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento11", "timemap measures", "jaccard"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento12", "timemap measures", "jaccard"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento13", "timemap measures", "jaccard"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento11", "timemap measures", "cosine"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento12", "timemap measures", "cosine"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento13", "timemap measures", "cosine"),
            "Boilerplate removal error with first memento in TimeMap, cannot effectively compare memento content"
        )


        shutil.rmtree(working_directory)
    def test_raw_simhash(self):

        working_directory = "/tmp/test_raw_simhash"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        full_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 
            'the', 'lazy', 'dog', 'etaoin', 'shrdlu', 'Now','is', 'the', 
            'time', 'for', 'all', 'good', 'men', 'to', 'come', 'to', 'the', 
            'aid', 'of', 'their', 'country',
            'Jived', 'fox', 'nymph', 'grabs', 'quick', 'waltz',
            'Glib', 'jocks', 'quiz', 'nymph', 'to', 'vex', 'dwarf',
            'Sphinx', 'of', 'black', 'quartz,', 'judge', 'my', 'vow',
            'How', 'vexingly', 'quick', 'daft', 'zebras', 'jump',
            'The', 'five', 'boxing', 'wizards', 'jump', 'quickly',
            'Pack', 'my', 'box', 'with', 'five', 'dozen', 'liquor', 'jugs'
            ]

        memcontent1 = bytes("<html><body>{}</body></html>".format(" ".join(full_sentence[0:20])), "utf8")
        memcontent2 = bytes("<html><body>{}</body></html>".format(" ".join(full_sentence[20:-1])), "utf8")

        timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", memcontent1, headers)
        cm.addMemento("memento12", memcontent2, headers)

        mm = MeasureModel()

        mm = compute_rawsimhash_across_TimeMap(cm, mm, tokenize=None, stemming=True)

        self.assertNotEqual(
            same_scores['raw_simhash'],
            mm.get_score("timemap1", "memento12", "timemap measures", "raw_simhash")
        )

        # after removing stop words, the first document consists of 11 words
        # the comparison document consists of more than 20 words
        # the terms 'quick' and 'jump' overlap, giving 2 overlapping terms
        # 11 - 2 = 9, hence the comparison score of 9
        expected_scores = {   'timemaps': {   'timemap1': {   'memento11': {   'timemap measures': {   'raw_simhash': {   'comparison score': 0}}},
                                    'memento12': {   'timemap measures': {   'raw_simhash': {   'comparison score': 36}}}}}}

        # for regression
        self.assertAlmostEqual(
            expected_scores['timemaps']['timemap1']['memento12']['timemap measures']['raw_simhash']['comparison score'],
            mm.get_score("timemap1", "memento12", "timemap measures", "raw_simhash")
        )

        shutil.rmtree(working_directory)
    def test_boilerplate_problem(self):
        """Boilerplate removal works for real content and raises for none.

        Stores a TimeMap and four mementos, three with a small HTML document
        and one with empty content. ``getMementoContentWithoutBoilerplate``
        must return ``b"hi\\n"`` for the first memento and raise
        ``CollectionModelBoilerPlateRemovalFailureException`` for the empty
        one.
        """

        working_directory = "/tmp/collectionmodel_test/test_boilerplate_problem"

        # Drop leftovers from an earlier aborted run, and register the removal
        # as a cleanup so it also happens when an assertion below fails.
        shutil.rmtree(working_directory, ignore_errors=True)
        self.addCleanup(shutil.rmtree, working_directory, ignore_errors=True)

        cm = collectionmodel.CollectionModel(
            working_directory=working_directory)

        testtimemap2 = """<http://a.example.org>;rel="original",
        <http://arxiv.example.net/timemap/http://a.example.org>
        ; rel="self";type="application/link-format"
        ; from="Tue, 20 Jun 2000 18:02:59 GMT"
        ; until="Wed, 09 Apr 2008 20:30:51 GMT",
        <http://arxiv.example.net/timegate/http://a.example.org>
        ; rel="timegate",
        <http://arxiv.example.net/web/20000620180259/http://a.example.org>
        ; rel="first memento";datetime="Tue, 20 Jun 2000 18:02:59 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20091027204954/http://a.example.org>
        ; rel="last memento";datetime="Tue, 27 Oct 2009 20:49:54 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20000621011731/http://a.example.org>
        ; rel="memento";datetime="Wed, 21 Jun 2000 01:17:31 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        <http://arxiv.example.net/web/20000621044156/http://a.example.org>
        ; rel="memento";datetime="Wed, 21 Jun 2000 04:41:56 GMT"
        ; license="http://creativecommons.org/publicdomain/zero/1.0/",
        """

        testheaders = {"header1": "value1", "header2": "value2"}

        testurit2 = "testing-storage:timemap2"

        cm.addTimeMap(testurit2, testtimemap2, testheaders)

        testcontent = b"<html><body>hi</body></html>"

        first_urim = \
            "http://arxiv.example.net/web/20000620180259/http://a.example.org"
        empty_urim = \
            "http://arxiv.example.net/web/20000621044156/http://a.example.org"

        urims_with_content = (
            first_urim,
            "http://arxiv.example.net/web/20091027204954/http://a.example.org",
            "http://arxiv.example.net/web/20000621011731/http://a.example.org",
        )

        for urim in urims_with_content:
            cm.addMemento(urim, testcontent, testheaders)

        # this memento has no content at all
        cm.addMemento(empty_urim, b"", testheaders)

        self.assertEqual(
            cm.getMementoContentWithoutBoilerplate(first_urim), b"hi\n")

        # Only the call matters here, so its result is not bound to a name.
        with self.assertRaises(
                collectionmodel.
                CollectionModelBoilerPlateRemovalFailureException):
            cm.getMementoContentWithoutBoilerplate(empty_urim)