def test_basic_webpage(self):
    """Check the first five window-2 shingles of a minimal HTML page."""
    basic_page = Webpage(
        "boh", "<html><head></head><body>hello world</body></html>")
    # Expected content of the shingles at positions 0..4, in order.
    expected_contents = [
        ["html", "head"],
        ["head", "head"],
        ["head", "body"],
        ["body", "body"],
        ["body", "html"],
    ]

    shingle = extract_shingle_set(basic_page, 2)

    for index, expected in enumerate(expected_contents):
        self.assertEqual(shingle[index].getContent(), expected)
def passo3(self, hash_table, pages, hash_module, window_size):
    """Assign each page to a cluster keyed by a ``hash_table`` entry.

    Every key of ``hash_table`` starts with an empty cluster. For each
    page, a shingle vector is built and ``maximum_count_covering`` is
    asked for the matching key; the page is appended to that key's
    cluster. Pages for which it returns ``None`` are left unclustered.

    Args:
        hash_table: Mapping whose keys name the candidate clusters.
        pages: Iterable of pages to distribute among the clusters.
        hash_module: Passed through to ``create_shingle_vector``.
        window_size: Passed through to ``extract_shingle_set``.

    Returns:
        A dict mapping every key of ``hash_table`` to the list of pages
        assigned to it (possibly empty).

    Raises:
        KeyError: If ``maximum_count_covering`` returns a value that is
            not a key of ``hash_table``.
    """
    cluster = {key: [] for key in hash_table}
    for page in pages:
        shingle_set = extract_shingle_set(page, window_size)
        v = create_shingle_vector(shingle_set, hash_module)
        v_primo = maximum_count_covering(hash_table, v.getContent())
        # Identity test: None is a singleton and "!=" could be overridden
        # by the returned object's own comparison methods.
        if v_primo is not None:
            cluster[v_primo].append(page)
    return cluster
def passo1(self, pages, window_size=10, hash_module=256):
    """Count occurrences of each masked shingle vector across ``pages``.

    For every page, a shingle set and shingle vector are built, the
    vector is expanded by ``k_shingle_cover`` and the content of each
    resulting masked vector is tallied.

    Args:
        pages: Iterable of pages; each must expose a ``name`` attribute
            (it is concatenated into the log message).
        window_size: Passed through to ``extract_shingle_set``.
        hash_module: Passed through to ``create_shingle_vector``.

    Returns:
        A dict mapping each masked vector's content to the number of
        times it was produced over all pages.
    """
    logger = Logger.get_instance()
    hash_table = {}
    for page in pages:
        logger.print("Processing page: " + page.name, 2)
        shingle_set = extract_shingle_set(page, window_size)
        shingle_vector = create_shingle_vector(shingle_set, hash_module)
        # The literal 6 is forwarded as-is; its meaning is defined by
        # k_shingle_cover.
        for masked_shingle_vector in k_shingle_cover(shingle_vector, 6):
            # Fetch the content once and use it as the dictionary key,
            # so it must be hashable.
            key = masked_shingle_vector.getContent()
            hash_table[key] = hash_table.get(key, 0) + 1
    return hash_table
def test_window_equal_than_page(self):
    """A window of 4 over this four-tag page must give exactly one shingle."""
    markup = "<html><body></body></html>"
    shingle = extract_shingle_set(Webpage("boh", markup), 4)

    shingle_count = len(shingle)
    self.assertEqual(shingle_count, 1)
def test_void_webpage(self):
    """A page with empty name and empty markup must produce no shingles."""
    empty_name = ""
    empty_markup = ""
    shingle = extract_shingle_set(Webpage(empty_name, empty_markup), 5)

    shingle_count = len(shingle)
    self.assertEqual(shingle_count, 0)
def setUp(self):
    """Build the shared fixtures: a basic page, its shingle set and vector."""
    markup = "<html><head></head><body>hello world</body></html>"
    self.basic_page = Webpage("boh", markup)

    # Window size 2 for the shingle set; the vector uses the helper's
    # default arguments.
    window = 2
    self.shingle_set = extract_shingle_set(self.basic_page, window)
    self.shingle_vector = create_shingle_vector(self.shingle_set)