Пример #1
0
 def setUpClass(self):
     """Load the ``connmgmt.html`` fixture and parse it into an lxml tree.

     Sets ``self.test_doc`` (fixture path under ``settings.TESTDATA``),
     ``self.parser`` (an ``etree.HTMLParser`` that drops comments and uses
     the encoding reported by ``cc.get_encoding``) and ``self.tree`` (the
     root tree of the parsed document).
     """
     self.test_doc = os.path.join(
         settings.TESTDATA, "httpclient402doc", "connmgmt.html")
     # The context manager closes the fixture even if read() raises.
     with open(self.test_doc) as page:
         content = page.read()
     encoding = cc.get_encoding(content)
     self.parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
     self.tree = etree.fromstring(content, self.parser).getroottree()
Пример #2
0
 def test_encoding(self):
     # Fetch a page through cc.get_file_from and check that
     # cc.get_encoding reports "utf-8" for its non-empty content.
     url = "http://www.infobart.com/index.php/about/"
     file_from = cc.get_file_from(url)
     # try/finally releases the handle even when an assertion fails.
     try:
         content = file_from.read()
         encoding = cc.get_encoding(content)
         self.assertEqual(encoding, "utf-8")
         self.assertTrue(len(content) > 0)
     finally:
         file_from.close()
Пример #3
0
 def setUpClass(self):
     """Load the ``connmgmt.html`` fixture and parse it into an lxml tree.

     Sets ``self.test_doc`` (fixture path under ``settings.TESTDATA``),
     ``self.parser`` (an ``etree.HTMLParser`` that drops comments and uses
     the encoding reported by ``cc.get_encoding``) and ``self.tree`` (the
     root tree of the parsed document).
     """
     self.test_doc = os.path.join(settings.TESTDATA, 'httpclient402doc',
                                  'connmgmt.html')
     # The context manager closes the fixture even if read() raises.
     with open(self.test_doc) as page:
         content = page.read()
     encoding = cc.get_encoding(content)
     self.parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
     self.tree = etree.fromstring(content, self.parser).getroottree()
Пример #4
0
    def test_get_text_context(self):
        # Parse page_test2, clean the tree, and verify the text context
        # that eu.get_text_context computes for the first <tt> element.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        first_tt = document.xpath("//tt[1]")[0]
        self.assertEqual(
            "Hello World foobar. This is nice. Yo.",
            eu.get_text_context(first_tt),
        )
Пример #5
0
 def test_encoding(self):
     # Fetch a page through cc.get_file_from and check that
     # cc.get_encoding reports 'utf-8' for its non-empty content.
     url = 'http://www.infobart.com/index.php/about/'
     file_from = cc.get_file_from(url)
     # try/finally releases the handle even when an assertion fails.
     try:
         content = file_from.read()
         encoding = cc.get_encoding(content)
         self.assertEqual(encoding, 'utf-8')
         self.assertTrue(len(content) > 0)
     finally:
         file_from.close()
Пример #6
0
    def test_get_text_context(self):
        # Parse page_test2, clean the tree, and verify the text context
        # that eu.get_text_context computes for the first <tt> element.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        first_tt = document.xpath('//tt[1]')[0]
        self.assertEqual(
            'Hello World foobar. This is nice. Yo.',
            eu.get_text_context(first_tt),
        )
Пример #7
0
    def process_page(self, url):
        """Copy the page at ``url`` locally, parse it, and process its content.

        The URL (with its hash fragment stripped by
        ``get_url_without_hash``) is passed to ``self.make_copy``; the
        resulting local URL is fetched and parsed with an lxml HTML parser
        whose encoding comes from ``get_encoding``. Links and images are
        then handled by ``process_page_links`` and ``process_page_imgs``.

        Returns:
            A ``DocumentPage`` built from ``url``, the local URL and the
            links returned by ``process_page_links``.
        """
        self.logger.info("Processing page: " + url)
        local_url = self.make_copy(get_url_without_hash(url))

        local_page = urllib2.urlopen(local_url)
        # try/finally guarantees the response is closed even if read() raises.
        try:
            content = local_page.read()
        finally:
            local_page.close()
        parser = etree.HTMLParser(encoding=get_encoding(content))
        tree = etree.fromstring(content, parser)

        links = self.process_page_links(tree, local_url, url)
        self.process_page_imgs(tree, url)

        return DocumentPage(url, local_url, links)
Пример #8
0
    def process_page(self, url):
        """Copy the page at ``url`` locally, parse it, and process its content.

        The URL (with its hash fragment stripped by
        ``get_url_without_hash``) is passed to ``self.make_copy``; the
        resulting local URL is fetched and parsed with an lxml HTML parser
        whose encoding comes from ``get_encoding``. Links and images are
        then handled by ``process_page_links`` and ``process_page_imgs``.

        Returns:
            A ``DocumentPage`` built from ``url``, the local URL and the
            links returned by ``process_page_links``.
        """
        self.logger.info("Processing page: " + url)
        local_url = self.make_copy(get_url_without_hash(url))

        local_page = urllib2.urlopen(local_url)
        # try/finally guarantees the response is closed even if read() raises.
        try:
            content = local_page.read()
        finally:
            local_page.close()
        parser = etree.HTMLParser(encoding=get_encoding(content))
        tree = etree.fromstring(content, parser)

        links = self.process_page_links(tree, local_url, url)
        self.process_page_imgs(tree, url)

        return DocumentPage(url, local_url, links)
Пример #9
0
    def test_word_count(self):
        # Parse page_test, clean the tree, then check the word count that
        # eu.get_word_count reports for the first <h1> and the first <body>.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test),
        )
        document = etree.fromstring(page_test, html_parser).getroottree()
        eu.clean_tree(document)

        # (xpath expression, expected word count), checked in this order.
        expectations = (
            ("//h1[1]", 6),
            ("//body[1]", 11),
        )
        for expression, expected in expectations:
            selector = eu.SingleXPath(expression)
            element = selector.get_element(document)
            count = eu.get_word_count(selector.get_element_as_list(element))
            # Print the element text before asserting, to aid debugging.
            print(selector.get_text(element))
            self.assertEqual(expected, count)
Пример #10
0
    def test_word_count(self):
        # Parse page_test, clean the tree, then check the word count that
        # eu.get_word_count reports for the first <h1> and the first <body>.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test),
        )
        document = etree.fromstring(page_test, html_parser).getroottree()
        eu.clean_tree(document)

        # (xpath expression, expected word count), checked in this order.
        expectations = (
            ('//h1[1]', 6),
            ('//body[1]', 11),
        )
        for expression, expected in expectations:
            selector = eu.SingleXPath(expression)
            element = selector.get_element(document)
            count = eu.get_word_count(selector.get_element_as_list(element))
            # Print the element text before asserting, to aid debugging.
            print(selector.get_text(element))
            self.assertEqual(expected, count)
Пример #11
0
 def load_snippets(self):
     """Create and save a ``CodeSnippet`` for each ``.java`` test file.

     Files are read from the ``snippets`` directory under
     ``settings.TESTDATA`` in sorted name order. Each file's text is
     decoded with the encoding reported by ``get_encoding`` and stored as
     a snippet attached to ``self.project``.

     Returns:
         The list of saved ``CodeSnippet`` objects, in processing order.
     """
     snippet_dir = os.path.join(settings.TESTDATA, 'snippets')
     loaded = []
     for position, name in enumerate(sorted(os.listdir(snippet_dir))):
         # The index counts every directory entry, not only .java files.
         if not name.endswith('.java'):
             continue
         with open(os.path.join(snippet_dir, name)) as source_file:
             raw = source_file.read()
             decoded = unicode(raw, get_encoding(raw))
             snippet = CodeSnippet(
                 index=position,
                 project=self.project,
                 snippet_text=decoded,
                 language='j',
                 source='d',
             )
             snippet.save()
             loaded.append(snippet)

     return loaded
Пример #12
0
    def load_snippets(self):
        """Create and save a ``CodeSnippet`` for each ``.java`` test file.

        Files are read from the ``snippets`` directory under
        ``settings.TESTDATA`` in sorted name order. Each file's text is
        decoded with the encoding reported by ``get_encoding`` and stored as
        a snippet attached to ``self.project``.

        Returns:
            The list of saved ``CodeSnippet`` objects, in processing order.
        """
        snippet_dir = os.path.join(settings.TESTDATA, 'snippets')
        loaded = []
        for position, name in enumerate(sorted(os.listdir(snippet_dir))):
            # The index counts every directory entry, not only .java files.
            if not name.endswith('.java'):
                continue
            with open(os.path.join(snippet_dir, name)) as source_file:
                raw = source_file.read()
                decoded = unicode(raw, get_encoding(raw))
                snippet = CodeSnippet(
                    index=position,
                    project=self.project,
                    snippet_text=decoded,
                    language='j',
                    source='d',
                )
                snippet.save()
                loaded.append(snippet)

        return loaded
Пример #13
0
    def test_get_sentence(self):
        # Parse page_test2, clean the tree, then check the sentence that
        # eu.get_sentence extracts around a word for several elements.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        # (xpath expression, word to locate, expected sentence)
        cases = (
            ("//tt[1]", "foobar", "Hello World foobar."),
            # Test when there are more than one match!
            ("//code[2]", "foo", "This is foo."),
            # Test when there are more than one match, but wrong markup
            # (sorry...)
            ("//b[1]", "foo", "Hello World foo."),
        )
        for expression, word, expected in cases:
            element = document.xpath(expression)[0]
            context = eu.get_text_context(element)
            found = eu.get_sentence(element, word, context)
            self.assertEqual(expected, found)
Пример #14
0
    def test_get_sentence(self):
        # Parse page_test2, clean the tree, then check the sentence that
        # eu.get_sentence extracts around a word for several elements.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        first_tt = document.xpath('//tt[1]')[0]
        tt_context = eu.get_text_context(first_tt)
        self.assertEqual(
            'Hello World foobar.',
            eu.get_sentence(first_tt, 'foobar', tt_context),
        )

        # Test when there are more than one match!
        second_code = document.xpath('//code[2]')[0]
        code_context = eu.get_text_context(second_code)
        self.assertEqual(
            'This is foo.',
            eu.get_sentence(second_code, 'foo', code_context),
        )

        # Test when there are more than one match, but wrong markup (sorry...)
        first_b = document.xpath('//b[1]')[0]
        b_context = eu.get_text_context(first_b)
        self.assertEqual(
            'Hello World foo.',
            eu.get_sentence(first_b, 'foo', b_context),
        )
Пример #15
0
    def _process_page(self, page, load):
        """Parse a stored page, record its metadata, and process its sections.

        The file at ``page.file_path`` (relative to
        ``settings.PROJECT_FS_ROOT``) is read and parsed into ``load.tree``
        with comments removed, then cleaned with ``clean_tree``. The page's
        ``title``, ``word_count`` and ``xpath`` are set from the parsed
        document and the page is saved.

        Args:
            page: page object exposing ``file_path``, ``title``,
                ``word_count``, ``xpath`` and ``save()``.
            load: object that receives the parsed tree as ``load.tree``.

        Returns:
            None. Section processing is skipped when ``_check_parser``
            returns a falsy value.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()
        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        check = self._check_parser(page, load)
        if not check:
            return
        self._process_sections(page, load)
Пример #16
0
    def _process_page(self, page, load):
        """Parse a stored page, record its metadata, and process its sections.

        The file at ``page.file_path`` (relative to
        ``settings.PROJECT_FS_ROOT``) is read and parsed into ``load.tree``
        with comments removed, then cleaned with ``clean_tree``. The page's
        ``title``, ``word_count`` and ``xpath`` are set from the parsed
        document and the page is saved.

        Args:
            page: page object exposing ``file_path``, ``title``,
                ``word_count``, ``xpath`` and ``save()``.
            load: object that receives the parsed tree as ``load.tree``.

        Returns:
            None. Section processing is skipped when ``_check_parser``
            returns a falsy value.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()
        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        check = self._check_parser(page, load)
        if not check:
            return
        self._process_sections(page, load)