def test_empty_documents(self):
        """Verify the error recorded when every memento in a TimeMap is empty.

        Three mementos, each holding an HTML document with an empty body, are
        added to a single TimeMap.  After running the Jaccard, cosine, LSI and
        LDA measures, every memento must carry the "first memento in TimeMap
        is now empty" error message for each of those measures.
        """

        working_directory = "/tmp/test_empty_documents"

        # start from a clean slate in case an earlier run left files behind
        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        # registered as a cleanup so the directory is removed even when an
        # assertion below fails, not only on the success path
        self.addCleanup(shutil.rmtree, working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap_content = """<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        empty_html_document = b"<html><body></body></html>"

        # If the first document and all subsequent documents are empty, one
        # could argue the collection is still on-topic; this is open to
        # debate.  For now each measure is expected to record an error
        # message, which is what the assertions below check.
        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", empty_html_document, headers)
        cm.addMemento("memento12", empty_html_document, headers)
        cm.addMemento("memento13", empty_html_document, headers)

        mm = MeasureModel()

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        mm = compute_gensim_lsi_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lda_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        expected_error = (
            "After processing content, the first memento in TimeMap is now empty, cannot effectively compare memento content"
        )

        for urit in mm.get_TimeMap_URIs():

            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                for measurename in ["cosine", "jaccard", "gensim_lda", "gensim_lsi"]:

                    # assertEqual, not the deprecated assertEquals alias,
                    # which no longer exists in Python 3.12+
                    self.assertEqual(
                        mm.get_Memento_measurement_error_message(urim, "timemap measures", measurename),
                        expected_error
                    )
    def test_handle_boilerplateremoval_error_due_to_empty_document(self):

        working_directory = "/tmp/test_handle_boilerplateremoval_error"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        full_html_document = b"<html>The quick brown fox jumps over the lazy dog<body></html>"
        really_empty_document = b""

        cm.addTimeMap("timemap1", timemap_content, headers)
        cm.addMemento("memento11", full_html_document, headers)
        cm.addMemento("memento12", really_empty_document, headers)
        cm.addMemento("memento13", full_html_document, headers)

        # TODO: how to handle the empty document?

        mm = MeasureModel()

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        pp.pprint(mm.scoremodel)

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento12", "timemap measures", "jaccard"),
            "CollectionModelBoilerPlateRemovalFailureException('XMLSyntaxError(None)')"
        )

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=None, stemming=True)

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento12", "timemap measures", "cosine"),
            "CollectionModelBoilerPlateRemovalFailureException('XMLSyntaxError(None)')"
        )

        mm = compute_gensim_lda_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        self.assertEqual(
            mm.get_Memento_measurement_error_message("memento12", "timemap measures", "gensim_lda"),
            "CollectionModelBoilerPlateRemovalFailureException('XMLSyntaxError(None)')"
        )

        shutil.rmtree(working_directory)
    def test_all_mementos_same(self):

        working_directory = "/tmp/test_all_mementos_same"

        if os.path.exists(working_directory):
            shutil.rmtree(working_directory)

        cm = collectionmodel.CollectionModel(working_directory=working_directory)

        headers = {
            "key1": "value1",
            "key2": "value2"
        }

        contents = []

        contents.append(b"<html><body>Content1 is wonderful</body></html>")
        contents.append(b"<html><body>Content2 is great</body></html>")

        timemap1_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

        timemap2_content ="""<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

        cm.addTimeMap("timemap1", timemap1_content, headers)
        cm.addTimeMap("timemap2", timemap2_content, headers)

        urits = cm.getTimeMapURIList()

        for i in range(0, 2):

            timemap = cm.getTimeMap(urits[i])

            for memento in timemap["mementos"]["list"]:
            
                urim = memento["uri"]

                cm.addMemento(urim, contents[i], headers)

        mm = MeasureModel()

        mm = compute_bytecount_across_TimeMap(
            cm, mm, tokenize=False, stemming=False
        )

        mm = compute_wordcount_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_jaccard_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_cosine_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_sorensen_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_levenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_nlevenshtein_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_tfintersection_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_rawsimhash_across_TimeMap(
            cm, mm, tokenize=False, stemming=False
        )

        mm = compute_tfsimhash_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lsi_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        mm = compute_gensim_lda_across_TimeMap(
            cm, mm, tokenize=True, stemming=True
        )

        self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
        self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )

        self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
        self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )

        self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
        self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

        for measure in same_scores:

            print("evaluating measure {}".format(measure))

            for urit in mm.get_TimeMap_URIs():

                for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                    # LDA does not appear to be deterministic
                    if measure == "gensim_lda":

                        self.assertGreaterEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            msg="measure {} does not compute the correct score "
                            "for document sameness with URI-M {}".format(measure, urim)                            
                        )

                    else:

                            self.assertAlmostEqual(
                            mm.get_score(urit, urim, "timemap measures", measure),
                            same_scores[measure],
                            msg="measure {} does not compute the correct score "
                            "for document sameness with URI-M {}".format(measure, urim)
                        )

        shutil.rmtree(working_directory)