예제 #1
0
    def _download(self, request_dict={}):
        """Download and parse the pages linked from the listing tree.

        The listing tree is obtained from ``OpinionSite._download``. The
        hrefs found in the cell immediately preceding a cell whose text
        contains 'Opinion' or 'PER CURIAM' are collected, and the first two
        of them are fetched.

        :param request_dict: Extra keyword arguments forwarded to every
            ``get`` call on the session. It is only read here.
        :return: A list of HTML trees, one per fetched page, with links made
            absolute against ``self.url`` and ``#fragment`` parts removed.
        :raises requests.HTTPError: If a fetched page answers with an HTTP
            error status (raised by ``raise_for_status``).
        """
        html_l = OpinionSite._download(self)
        path = ("//td[contains(./text(),'Opinion') or contains(./text(), 'PER CURIAM')]"
                "/preceding-sibling::td[1]//@href")

        def remove_anchors(link):
            # Keep only the part of the link before any '#'.
            return link.split('#')[0]

        html_trees = []
        # Using the session as a context manager closes it when the block
        # exits, including when one of the requests raises.
        with requests.session() as s:
            for url in html_l.xpath(path)[:2]:
                r = s.get(
                    url,
                    headers={'User-Agent': 'Juriscraper'},
                    verify=certifi.where(),
                    **request_dict
                )
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees
예제 #2
0
파일: iowa.py 프로젝트: m4h7/juriscraper
    def _download(self, request_dict={}):
        """Return a list of HTML trees for this site.

        When ``self.method`` is "LOCAL" the parent implementation is called
        and its single result is wrapped in a list. Otherwise the tree
        returned by ``OpinionSite._download`` is searched for the links in
        the fifth cell of rows containing ``', <self.year>'``, and every
        linked page is fetched and parsed.

        :param request_dict: Extra keyword arguments forwarded to the parent
            implementation (LOCAL) or to every ``get`` call on the session.
        :return: A list of HTML trees. Fetched trees have their links made
            absolute against ``self.url`` and ``#fragment`` parts removed.
        :raises requests.HTTPError: If a fetched page answers with an HTTP
            error status (raised by ``raise_for_status``).
        """
        if self.method == "LOCAL":
            # Note that this is returning a list of HTML trees.
            return [super(Site, self)._download(request_dict=request_dict)]

        html_l = OpinionSite._download(self)
        path = "//td[@width='49%']//tr[contains(., ', {year}')]/td[5]/a/@href".format(year=self.year)

        def remove_anchors(link):
            # Keep only the part of the link before any "#".
            return link.split("#")[0]

        html_trees = []
        # Using the session as a context manager closes it when the block
        # exits, including when one of the requests raises.
        with requests.session() as s:
            for url in html_l.xpath(path):
                r = s.get(url, headers={"User-Agent": "Juriscraper"}, verify=certifi.where(), **request_dict)
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a
                # superset)
                if r.encoding == "ISO-8859-1":
                    r.encoding = "cp1252"

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees
예제 #3
0
파일: kan.py 프로젝트: brianwc/juriscraper
    def _download(self, request_dict={}):
        """Return a list of HTML trees for this site.

        When ``self.method`` is 'LOCAL' the parent implementation is called
        and its single result is wrapped in a list. Otherwise the tree
        returned by ``OpinionSite._download`` is searched for links in the
        first ``ul`` following the ``h3`` that mentions ``self.date.year``
        inside the ``self.court_index``-th half-width cell, and the first
        four linked pages are fetched and parsed.

        :param request_dict: Extra keyword arguments forwarded to the parent
            implementation (LOCAL) or to every ``get`` call on the session.
        :return: A list of HTML trees. Fetched trees have their links made
            absolute against the URL they were fetched from and
            ``#fragment`` parts removed.
        :raises requests.HTTPError: If a fetched page answers with an HTTP
            error status (raised by ``raise_for_status``).
        """
        if self.method == 'LOCAL':
            # Note that this is returning a list of HTML trees.
            return [super(Site, self)._download(request_dict=request_dict)]

        html_l = OpinionSite._download(self)
        path = "//td[@width='50%'][{court_index}]/h3[contains(., '{year}')]/following::ul[1]//a/@href".format(
            court_index=self.court_index,
            year=self.date.year,
        )

        def remove_anchors(link):
            # Keep only the part of the link before any '#'.
            return link.split('#')[0]

        html_trees = []
        # Using the session as a context manager closes it when the block
        # exits, including when one of the requests raises.
        with requests.session() as s:
            # Only the first 4 matching urls are fetched.
            for url in html_l.xpath(path)[0:4]:
                logger.info("Downloading Kansas page at: {url}".format(url=url))
                r = s.get(url,
                          headers={'User-Agent': 'Juriscraper'},
                          **request_dict)
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                # Links are resolved against the page just fetched.
                html_tree.make_links_absolute(url)
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees
예제 #4
0
    def _download(self, request_dict={}):
        """Return a list of HTML trees for this site.

        When ``self.method`` is 'LOCAL' the parent implementation is called
        and its single result is wrapped in a list. Otherwise the tree
        returned by ``OpinionSite._download`` is searched for the links in
        the fifth cell of rows containing ``', <self.year>'``, and every
        linked page is fetched and parsed.

        :param request_dict: Extra keyword arguments forwarded to the parent
            implementation (LOCAL) or to every ``get`` call on the session.
        :return: A list of HTML trees. Fetched trees have their links made
            absolute against ``self.url`` and ``#fragment`` parts removed.
        :raises requests.HTTPError: If a fetched page answers with an HTTP
            error status (raised by ``raise_for_status``).
        """
        if self.method == 'LOCAL':
            # Note that this is returning a list of HTML trees.
            return [
                super(Site, self)._download(request_dict=request_dict)
            ]

        html_l = OpinionSite._download(self)
        path = (
            "//td[@width='49%']//tr[contains(., ', {year}')]/td[5]/a/@href"
            .format(year=self.year))

        def remove_anchors(link):
            # Keep only the part of the link before any '#'.
            return link.split('#')[0]

        html_trees = []
        # Using the session as a context manager closes it when the block
        # exits, including when one of the requests raises.
        with requests.session() as s:
            for url in html_l.xpath(path):
                r = s.get(url,
                          headers={'User-Agent': 'Juriscraper'},
                          **request_dict)
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a
                # superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees
예제 #5
0
 def _download(self, request_dict={}):
     """Fetch the first two pages linked from the listing tree.

     The listing tree comes from ``OpinionSite._download``. Each selected
     href is handed to ``self._get_html_tree_by_url`` together with
     ``request_dict``.

     :param request_dict: Passed through unchanged to
         ``self._get_html_tree_by_url`` for every link.
     :return: A list with one result of ``self._get_html_tree_by_url`` per
         selected link, in document order.
     """
     listing = OpinionSite._download(self)
     # hrefs in the cell just before an 'Opinion' / 'PER CURIAM' cell
     href_xpath = (
         "//td[contains(./text(),'Opinion') or contains(./text(), 'PER CURIAM')]"
         "/preceding-sibling::td[1]//@href"
     )
     return [
         self._get_html_tree_by_url(link, request_dict)
         for link in listing.xpath(href_xpath)[:2]
     ]
예제 #6
0
 def _download(self, request_dict={}):
     """Return a list of HTML trees for this site.

     When ``self.method`` is 'LOCAL' the parent implementation is called
     and its single result is wrapped in a list. Otherwise every link found
     in the fifth cell of rows containing ``', <self.year>'`` on the tree
     from ``OpinionSite._download`` is handed to
     ``self._get_html_tree_by_url``.

     :param request_dict: Passed through unchanged to the parent
         implementation or to ``self._get_html_tree_by_url``.
     :return: A list of trees, one per link (or exactly one when LOCAL).
     """
     if self.method == 'LOCAL':
         # The single local tree is wrapped so callers always get a list.
         return [super(Site, self)._download(request_dict=request_dict)]

     listing = OpinionSite._download(self)
     href_xpath = "//td[@width='49%']//tr[contains(., ', {year}')]/td[5]/a/@href".format(year=self.year)
     return [
         self._get_html_tree_by_url(link, request_dict)
         for link in listing.xpath(href_xpath)
     ]
예제 #7
0
    def _download(self, request_dict={}):
        """Return a list of HTML trees for this site.

        When ``self.method`` is 'LOCAL' the parent implementation is called
        and its single result is wrapped in a list. Otherwise the tree from
        ``OpinionSite._download`` is searched for links in the first ``ul``
        following the ``h3`` that mentions ``self.date.year`` inside the
        ``self.court_index``-th half-width cell. The first seven links are
        logged and handed to ``self._get_html_tree_by_url``.

        :param request_dict: Passed through unchanged to the parent
            implementation or to ``self._get_html_tree_by_url``.
        :return: A list of trees, one per link (or exactly one when LOCAL).
        """
        if self.method == 'LOCAL':
            # The single local tree is wrapped so callers always get a list.
            return [super(Site, self)._download(request_dict=request_dict)]

        listing = OpinionSite._download(self)
        href_xpath = "//td[@width='50%'][{court_index}]/h3[contains(., '{year}')]/following::ul[1]//a/@href".format(
            court_index=self.court_index,
            year=self.date.year,
        )

        trees = []
        # At most seven links are followed; each is logged before fetching.
        for link in listing.xpath(href_xpath)[:7]:
            logger.info("Downloading Kansas page at: {url}".format(url=link))
            trees.append(self._get_html_tree_by_url(link, request_dict))
        return trees
예제 #8
0
    def _download(self, request_dict={}):
        """Return a list of HTML trees for this site.

        When ``self.test_mode_enabled()`` is true the parent implementation
        is called and its single result is wrapped in a list. Otherwise the
        tree from ``OpinionSite._download`` is searched for links in the
        first ``ul`` following the ``h3`` that mentions ``self.date.year``
        inside the ``self.court_index``-th half-width cell. The first seven
        links are logged and handed to ``self._get_html_tree_by_url``.

        :param request_dict: Passed through unchanged to the parent
            implementation or to ``self._get_html_tree_by_url``.
        :return: A list of trees, one per link (or exactly one in test mode).
        """
        if self.test_mode_enabled():
            # The single tree is wrapped so callers always get a list.
            return [super(Site, self)._download(request_dict=request_dict)]

        listing = OpinionSite._download(self)
        href_xpath = "//td[@width='50%'][{court_index}]/h3[contains(., '{year}')]/following::ul[1]//a/@href".format(
            court_index=self.court_index,
            year=self.date.year,
        )

        trees = []
        # At most seven links are followed; each is logged before fetching.
        for link in listing.xpath(href_xpath)[:7]:
            logger.info("Downloading Kansas page at: {url}".format(url=link))
            trees.append(self._get_html_tree_by_url(link, request_dict))
        return trees
예제 #9
0
파일: iowa.py 프로젝트: enyst/juriscraper
    def _download(self, request_dict={}):
        """Download and parse the pages linked from the listing tree.

        The listing tree is obtained from ``OpinionSite._download``. Every
        link in the fifth cell of rows containing ``', <self.year>'`` is
        fetched and parsed.

        :param request_dict: Extra keyword arguments forwarded to every
            ``get`` call on the session. It is only read here.
        :return: A list of HTML trees, one per fetched page, with links made
            absolute against ``self.url`` and ``#fragment`` parts removed.
        :raises requests.HTTPError: If a fetched page answers with an HTTP
            error status (raised by ``raise_for_status``).
        """
        html_l = OpinionSite._download(self)
        path = "//td[@width='49%']//tr[contains(., ', {year}')]/td[5]/a/@href".format(year=self.year)

        def remove_anchors(link):
            # Keep only the part of the link before any '#'.
            return link.split('#')[0]

        html_trees = []
        # Using the session as a context manager closes it when the block
        # exits, including when one of the requests raises.
        with requests.session() as s:
            for url in html_l.xpath(path):
                r = s.get(url,
                          headers={'User-Agent': 'Juriscraper'},
                          **request_dict)
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(self.url)
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees