Python PositionContentExtractor 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: position_content_extractor

hotexamples.com에서의 예제들: 8

Python PositionContentExtractor - 8개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python position_content_extractor.PositionContentExtractor의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

PositionContentExtractor(2)

_get_content(1)

예제 #1

파일 보기

파일: test_content_extractor.py 프로젝트: EuanCockburn/ifind

 def setUp(self):
     """Build the shared fixture: a logger and an extractor loaded with a fixed HTML page.

     The extractor is created with an empty ``div_ids`` list and then fed
     the page through ``process_html_page``.
     """
     self.logger = logging.getLogger("TestStructuredExtractor")
     # The page holds "header", "content", "footer" and "post" divs (the
     # last one wraps a nested "sub-post" div) plus a <footer> element
     # with class "myfoot".
     fragments = (
         ' <html> <div id="header"><h1>hello world</h1>',
         '</div><div id="content"><p>this is important</p>',
         '<p> study computing it is fun</p></div>',
         '<div id="footer"> <h2>byes</h2></div> ',
         '<div id="post"> stay <div id="sub-post">should be gone</div>',
         '</div><footer class="myfoot">at the bottom</footer></html> ',
     )
     page = ''.join(fragments)
     self.extractor = PositionContentExtractor(div_ids=[])
     self.extractor.process_html_page(page)

예제 #2

파일 보기

파일: test_content_extractor.py 프로젝트: Loptr250/ifind

 def setUp(self):
     """Prepare a logger and an extractor that has processed a small fixed HTML page.

     The extractor starts with an empty ``div_ids`` list.
     """
     self.logger = logging.getLogger("TestStructuredExtractor")
     # Adjacent literals are concatenated by the parser, so the page text
     # is the same single string the tests rely on.
     markup = (
         ' <html> <div id="header"><h1>hello world</h1>'
         '</div><div id="content"><p>this is important</p>'
         '<p> study computing it is fun</p></div>'
         '<div id="footer"> <h2>byes</h2></div> '
         '<div id="post"> stay <div id="sub-post">should be gone</div>'
         '</div><footer class="myfoot">at the bottom</footer></html> '
     )
     extractor = PositionContentExtractor(div_ids=[])
     self.extractor = extractor
     extractor.process_html_page(markup)

예제 #3

파일 보기

파일: test_content_extractor.py 프로젝트: EuanCockburn/ifind

class WebTestPositionExtractor(unittest.TestCase):
    """Runs PositionContentExtractor over the source of a gov.uk page."""

    def setUp(self):
        """
        Capture the source of the page under test and start with no div ids.
        """
        self.logger = logging.getLogger("TestStructuredExtractor")
        # NOTE(review): PageCapture presumably fetches the URL over the
        # network, which would make this test depend on connectivity - confirm.
        self.html = PageCapture(
            'https://www.gov.uk/vehicles-you-can-drive'
        ).get_page_sourcecode()
        self.div_ids = []
        # A disabled debugging aid used to live here: it parsed the page
        # with BeautifulSoup and printed soup.findAll(text=True).

    def test_extract_from_bad_page(self):
        """Process the captured page, then set two div ids on the extractor."""
        extractor = PositionContentExtractor(div_ids=self.div_ids)
        self.extractor = extractor
        extractor.process_html_page(self.html)
        # TODO: no assertion yet - should this simply pass if nothing raises?
        extractor.set_div_ids(['related', 'skiplink-container'])

예제 #4

파일 보기

파일: test_content_extractor.py 프로젝트: Loptr250/ifind

class WebTestPositionExtractor(unittest.TestCase):
    """Exercises PositionContentExtractor with the source of a gov.uk page."""

    def setUp(self):
        """
        Store the page source and an empty div id list for the test to use.
        """
        self.logger = logging.getLogger("TestStructuredExtractor")
        page_url = 'https://www.gov.uk/vehicles-you-can-drive'
        # NOTE(review): PageCapture presumably downloads the page, so this
        # fixture likely needs network access - confirm.
        capture = PageCapture(page_url)
        self.html = capture.get_page_sourcecode()
        self.div_ids = []
        # Previously a commented-out BeautifulSoup snippet here printed
        # every text node of the page (soup.findAll(text=True)) for debugging.

    def test_extract_from_bad_page(self):
        """Run the extractor on the stored page and then set two div ids."""
        self.extractor = PositionContentExtractor(div_ids=self.div_ids)
        self.extractor.process_html_page(self.html)
        # TODO: decide whether "no exception raised" is enough to pass.
        self.extractor.set_div_ids(['related', 'skiplink-container'])

예제 #5

파일 보기

파일: test_content_extractor.py 프로젝트: Loptr250/ifind

 def test_extract_from_bad_page(self):
     """Process ``self.html`` with a fresh extractor, then set two div ids.

     There is no assertion; the test only fails if one of the calls raises.
     """
     extractor = PositionContentExtractor(div_ids=self.div_ids)
     self.extractor = extractor
     extractor.process_html_page(self.html)
     # TODO: should this simply pass if no errors occur?
     extractor.set_div_ids(['related', 'skiplink-container'])

예제 #6

파일 보기

파일: test_content_extractor.py 프로젝트: Loptr250/ifind

class TestPositionExtractor(unittest.TestCase):
    """Tests PositionContentExtractor against a small fixed HTML page."""

    def setUp(self):
        """Create an extractor with an empty div id list and load the fixture page."""
        self.logger = logging.getLogger("TestStructuredExtractor")
        html = ' <html> <div id="header"><h1>hello world</h1>' \
               '</div><div id="content"><p>this is important</p>' \
               '<p> study computing it is fun</p></div>' \
               '<div id="footer"> <h2>byes</h2></div> ' \
               '<div id="post"> stay <div id="sub-post">should be gone</div>' \
               '</div><footer class="myfoot">at the bottom</footer></html> '
        div_ids = []
        self.extractor = PositionContentExtractor(div_ids=div_ids)
        self.extractor.process_html_page(html)

    def test_remove_div_content(self):
        """Text of the divs passed to set_div_ids is absent from extractor.text."""
        div_id = ["header"]
        self.extractor.set_div_ids(div_id)
        expected = "this is important study computing it is fun byes stay should be gone at the bottom"

        self.process_test_equals(expected, self.extractor.text)

        div_id = ["content"]
        expected = "hello world byes stay should be gone at the bottom"
        self.extractor.set_div_ids(div_id)

        self.process_test_equals(expected, self.extractor.text)
        # test multiple div removal
        ignore_divs = ['header', 'footer']
        self.extractor.set_div_ids(ignore_divs)

        expected = 'this is important study computing it is fun stay should be gone at the bottom'
        self.process_test_equals(expected, self.extractor.text)

        # test remove div within a div: dropping "post" also drops the
        # nested "sub-post" text, leaving only the <footer> element's text
        expected = "at the bottom"
        ignore_divs = ['header', 'footer', 'content', 'post']
        self.extractor.set_div_ids(ignore_divs)
        self.process_test_equals(expected, self.extractor.text)

    def test_get_subtext(self):
        """get_subtext returns the first num_words words, or all text if fewer exist."""
        self.extractor.text = "this is a sentence which has some words in it"
        result = self.extractor.get_subtext(num_words=2)
        expected = "this is"
        self.process_test_equals(expected, result)
        # test greater than length returns whole text
        result = self.extractor.get_subtext(num_words=12)
        self.process_test_equals(self.extractor.text, result)

    def test_get_content(self):
        """_get_content finds the <footer> element by tag name and class."""
        expected = "at the bottom"
        result = self.extractor._get_content("footer", tag_class="myfoot")
        self.process_test_equals(expected, result)

    def test_set_all_content(self):
        """set_all_content restricts extractor.text to the listed div ids."""
        included_ids = ['content']
        expected = "this is important study computing it is fun"
        self.extractor.set_all_content(included_ids, "div")
        result = self.extractor.text
        self.process_test_equals(expected, result)

    def process_test_equals(self, expected, result):
        """Assert that ``result`` equals ``expected`` with a readable failure message.

        The message used to be built with commas, which produced a tuple
        instead of a string; it is now a single formatted string.
        """
        msg = 'Expected {0!r} but got: {1!r}'.format(expected, result)
        self.assertEqual(expected, result, msg)

예제 #7

파일 보기

파일: test_content_extractor.py 프로젝트: EuanCockburn/ifind

 def test_extract_from_bad_page(self):
     """Feed ``self.html`` to a new extractor and afterwards set two div ids.

     Nothing is asserted; a raised exception is the only way this can fail.
     """
     self.extractor = PositionContentExtractor(div_ids=self.div_ids)
     self.extractor.process_html_page(self.html)
     # TODO: pass if no errors?
     self.extractor.set_div_ids(['related', 'skiplink-container'])

예제 #8

파일 보기

파일: test_content_extractor.py 프로젝트: EuanCockburn/ifind

class TestPositionExtractor(unittest.TestCase):
    """Checks PositionContentExtractor behaviour on a small hand-written HTML page."""

    def setUp(self):
        """Build an extractor with no div ids set and let it process the fixture page."""
        self.logger = logging.getLogger("TestStructuredExtractor")
        html = ' <html> <div id="header"><h1>hello world</h1>' \
               '</div><div id="content"><p>this is important</p>' \
               '<p> study computing it is fun</p></div>' \
               '<div id="footer"> <h2>byes</h2></div> ' \
               '<div id="post"> stay <div id="sub-post">should be gone</div>' \
               '</div><footer class="myfoot">at the bottom</footer></html> '
        div_ids = []
        self.extractor = PositionContentExtractor(div_ids=div_ids)
        self.extractor.process_html_page(html)

    def test_remove_div_content(self):
        """After set_div_ids, extractor.text no longer contains the named divs' text."""
        div_id = ["header"]
        self.extractor.set_div_ids(div_id)
        expected = "this is important study computing it is fun byes stay should be gone at the bottom"

        self.process_test_equals(expected, self.extractor.text)

        div_id = ["content"]
        expected = "hello world byes stay should be gone at the bottom"
        self.extractor.set_div_ids(div_id)

        self.process_test_equals(expected, self.extractor.text)
        # test multiple div removal
        ignore_divs = ['header', 'footer']
        self.extractor.set_div_ids(ignore_divs)

        expected = 'this is important study computing it is fun stay should be gone at the bottom'
        self.process_test_equals(expected, self.extractor.text)

        # test remove div within a div: removing "post" takes the nested
        # "sub-post" div with it, so only the <footer> element's text remains
        expected = "at the bottom"
        ignore_divs = ['header', 'footer', 'content', 'post']
        self.extractor.set_div_ids(ignore_divs)
        self.process_test_equals(expected, self.extractor.text)

    def test_get_subtext(self):
        """get_subtext yields the leading num_words words, capped at the full text."""
        self.extractor.text = "this is a sentence which has some words in it"
        result = self.extractor.get_subtext(num_words=2)
        expected = "this is"
        self.process_test_equals(expected, result)
        # test greater than length returns whole text
        result = self.extractor.get_subtext(num_words=12)
        self.process_test_equals(self.extractor.text, result)

    def test_get_content(self):
        """_get_content returns the text of the <footer> tag with class "myfoot"."""
        expected = "at the bottom"
        result = self.extractor._get_content("footer", tag_class="myfoot")
        self.process_test_equals(expected, result)

    def test_set_all_content(self):
        """set_all_content keeps only the text of the listed div ids in extractor.text."""
        included_ids = ['content']
        expected = "this is important study computing it is fun"
        self.extractor.set_all_content(included_ids, "div")
        result = self.extractor.text
        self.process_test_equals(expected, result)

    def process_test_equals(self, expected, result):
        """Compare ``expected`` with ``result`` and fail with a formatted message.

        Building the message with commas created a tuple rather than a
        string, so it is now formatted into one string before being passed
        to assertEqual.
        """
        msg = 'Expected {0!r} but got: {1!r}'.format(expected, result)
        self.assertEqual(expected, result, msg)