예제 #1
0
 def test_basic_webpage(self):
     """Check the shingles extracted from a minimal HTML page.

     ``extract_shingle_set`` is called on the page with a window of 2 and
     the content of the first five returned shingles is compared, in
     order, with the expected pairs.
     """
     basic_page = Webpage(
         "boh", "<html><head></head><body>hello world</body></html>")
     expected_contents = [
         ["html", "head"],
         ["head", "head"],
         ["head", "body"],
         ["body", "body"],
         ["body", "html"],
     ]
     shingle = extract_shingle_set(basic_page, 2)
     # Same positional checks as before, driven by the table above.
     for index, expected in enumerate(expected_contents):
         self.assertEqual(shingle[index].getContent(), expected)
예제 #2
0
    def passo3(self, hash_table, pages, hash_module, window_size):
        """Group pages into clusters keyed by the entries of ``hash_table``.

        Every key of ``hash_table`` starts with an empty list. For each page
        its shingle set is extracted and turned into a shingle vector, then
        ``maximum_count_covering`` picks the key the page is filed under.
        Pages for which it returns ``None`` are left out of every cluster.

        Args:
            hash_table: Mapping whose keys become the cluster keys.
            pages: Iterable of pages to assign.
            hash_module: Forwarded to ``create_shingle_vector``.
            window_size: Forwarded to ``extract_shingle_set``.

        Returns:
            dict mapping each key of ``hash_table`` to the (possibly empty)
            list of pages assigned to it.
        """
        # One fresh list per key; dict.fromkeys would share a single list.
        cluster = {key: [] for key in hash_table}
        for page in pages:
            shingle_set = extract_shingle_set(page, window_size)
            vector = create_shingle_vector(shingle_set, hash_module)
            best_key = maximum_count_covering(hash_table, vector.getContent())
            if best_key is not None:
                cluster[best_key].append(page)

        return cluster
예제 #3
0
    def passo1(self, pages, window_size=10, hash_module=256):
        """Count how often each masked shingle vector occurs across ``pages``.

        For every page the shingle set is extracted, converted into a
        shingle vector and expanded with ``k_shingle_cover``; the content of
        each resulting masked vector is used as a key in the returned table.

        Args:
            pages: Iterable of pages; each must expose a ``name`` attribute,
                which is logged while the page is processed.
            window_size: Forwarded to ``extract_shingle_set``.
            hash_module: Forwarded to ``create_shingle_vector``.

        Returns:
            dict mapping the content of each masked shingle vector to the
            number of times it was produced.
        """
        logger = Logger.get_instance()
        hash_table = {}
        for page in pages:
            logger.print("Processing page: " + page.name, 2)
            shingle_set = extract_shingle_set(page, window_size)
            shingle_vector = create_shingle_vector(shingle_set, hash_module)
            for masked_vector in k_shingle_cover(shingle_vector, 6):
                # Fetch the key once; get() with a default replaces the
                # membership test and the repeated getContent() calls.
                key = masked_vector.getContent()
                hash_table[key] = hash_table.get(key, 0) + 1

        return hash_table
예제 #4
0
 def test_window_equal_than_page(self):
     """A window of 4 over the markup below must yield exactly one shingle."""
     markup = "<html><body></body></html>"
     shingles = extract_shingle_set(Webpage("boh", markup), 4)
     self.assertEqual(len(shingles), 1)
예제 #5
0
 def test_void_webpage(self):
     """A page built from two empty strings must produce no shingles."""
     shingles = extract_shingle_set(Webpage("", ""), 5)
     self.assertEqual(len(shingles), 0)
예제 #6
0
    def setUp(self):
        """Build the shared fixtures: a page, its shingle set and its vector.

        The shingle set is extracted with a window of 2 and the vector is
        created from that set with ``create_shingle_vector``'s defaults.
        """
        markup = "<html><head></head><body>hello world</body></html>"
        page = Webpage("boh", markup)
        self.basic_page = page

        shingles = extract_shingle_set(page, 2)
        self.shingle_set = shingles

        self.shingle_vector = create_shingle_vector(shingles)