예제 #1
0
 def get_file(location):
     """Load an HTML document from a local path or from a URL.

     A ``location`` that starts with ``/`` is opened as a local file (in
     text mode, default encoding); any other value is fetched with
     ``requests.get``.

     Returns a 2-tuple ``(fromstring(content), get_clean_body_content(content))``
     where ``content`` is the raw document that was read or downloaded.
     """
     if location.startswith('/'):
         # Local file: read it directly. No HTTP session object is needed
         # just to carry the content around (and none is left unclosed).
         with open(location) as f:
             content = f.read()
     else:
         # Remote document: use the response body as returned by requests.
         content = requests.get(location).content
     return fromstring(content), get_clean_body_content(content)
예제 #2
0
 def get_file(location):
     """Load an HTML document from a local path or from a URL.

     A ``location`` that starts with ``/`` is opened as a local file (in
     text mode, default encoding); any other value is fetched with
     ``requests.get``.

     Returns a 2-tuple ``(fromstring(content), get_clean_body_content(content))``
     where ``content`` is the raw document that was read or downloaded.
     """
     if location.startswith('/'):
         # Local file: read it directly. No HTTP session object is needed
         # just to carry the content around (and none is left unclosed).
         with open(location) as f:
             content = f.read()
     else:
         # Remote document: use the response body as returned by requests.
         content = requests.get(location).content
     return fromstring(content), get_clean_body_content(content)
예제 #3
0
파일: ill.py 프로젝트: enyst/juriscraper
        def fetcher(url):
            """Download ``url`` and return the cleaned HTML of its summary paragraphs.

            The page is requested without following redirects and with a
            ``Juriscraper`` User-Agent; ``raise_for_status()`` is called so a
            bad HTTP status raises instead of being parsed.  The ``<p>``
            elements selected by the XPath below are serialized to HTML,
            concatenated in document order, and passed through
            ``get_clean_body_content`` with ``span`` as an extra tag to remove.
            """
            r = requests.get(url,
                             allow_redirects=False,
                             headers={'User-Agent': 'Juriscraper'})
            # Throw an error if a bad status code is returned.
            r.raise_for_status()

            html_tree = html.fromstring(r.text)
            # NOTE(review): links are made absolute against self.url from the
            # enclosing scope, not against the `url` argument -- confirm this
            # is intended.
            html_tree.make_links_absolute(self.url)

            path = '//p[contains(@style, "justify")]/span[@style="font-weight: bold" ]/../following-sibling::p[not(contains(@style, "justify"))][position()=2]/following-sibling::p'
            # Join the serialized fragments in one pass instead of growing a
            # string with += inside a loop.
            summary_string = "".join(
                html.tostring(e, method='html', encoding='unicode')
                for e in html_tree.xpath(path)
            )
            return get_clean_body_content(summary_string, remove_extra_tags=['span'])
예제 #4
0
        def fetcher(url):
            """Download ``url`` and return the cleaned HTML of its summary paragraphs.

            The page is requested without following redirects and with a
            ``Juriscraper`` User-Agent; ``raise_for_status()`` is called so a
            bad HTTP status raises instead of being parsed.  The ``<p>``
            elements selected by the XPath below are serialized to HTML,
            concatenated in document order, and passed through
            ``get_clean_body_content`` with ``span`` as an extra tag to remove.
            """
            r = requests.get(url,
                             allow_redirects=False,
                             headers={'User-Agent': 'Juriscraper'})
            # Throw an error if a bad status code is returned.
            r.raise_for_status()

            html_tree = html.fromstring(r.text)
            # NOTE(review): links are made absolute against self.url from the
            # enclosing scope, not against the `url` argument -- confirm this
            # is intended.
            html_tree.make_links_absolute(self.url)

            path = '//p[contains(@style, "justify")]/span[@style="font-weight: bold" ]/../following-sibling::p[not(contains(@style, "justify"))][position()=2]/following-sibling::p'
            # Join the serialized fragments in one pass instead of growing a
            # string with += inside a loop.
            summary_string = "".join(
                html.tostring(e, method='html', encoding='unicode')
                for e in html_tree.xpath(path)
            )
            return get_clean_body_content(summary_string,
                                          remove_extra_tags=['span'])