def test_word_count(self):
    encoding = cc.get_encoding(page_test)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    tree = etree.fromstring(page_test, parser).getroottree()
    eu.clean_tree(tree)
    # The first <h1> heading of the test page should contain six words.
    h1 = eu.SingleXPath("//h1[1]")
    h1_element = h1.get_element(tree)
    wc = eu.get_word_count(h1.get_element_as_list(h1_element))
    print(h1.get_text(h1_element))
    self.assertEqual(6, wc)
    # The full <body> of the test page should contain eleven words.
    body = eu.SingleXPath("//body[1]")
    body_element = body.get_element(tree)
    wc = eu.get_word_count(body.get_element_as_list(body_element))
    print(body.get_text(body_element))
    self.assertEqual(11, wc)
def _process_page(self, page, load):
    # Read the page source from disk.
    page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
    with open(page_path) as page_file:
        content = page_file.read()
    # Parse the HTML into an lxml tree, stripping comments, then clean it.
    encoding = get_encoding(content)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    load.tree = etree.fromstring(content, parser).getroottree()
    clean_tree(load.tree)
    # Extract the title, body word count, and body XPath, then persist the page.
    page.title = self._process_page_title(page, load)
    body = self.xbody.get_element(load.tree)
    body_elements = self.xbody.get_element_as_list(body)
    page.word_count = get_word_count(body_elements)
    page.xpath = load.tree.getpath(body)
    page.save()
    # Initialise the page, verify the parser output, and process its sections.
    self._process_init_page(page, load)
    check = self._check_parser(page, load)
    if not check:
        return
    self._process_sections(page, load)