Example #1
    def _get_download_urls(self):
        """Links from the root page go to a second page where the real links
        are posted.
        """
        def fetcher(seed_url):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                """Goes to second page, grabs the link and returns it."""
                r = requests.get(
                    seed_url,
                    allow_redirects=False,
                    headers={'User-Agent': 'Juriscraper'},
                    verify=certifi.where(),
                )
                r.raise_for_status()
                html_tree = html.fromstring(r.text)
                html_tree.make_links_absolute(self.url)

                path_to_audio_file = "//*[@class='padboxauto_MediaContent']//a/@href"
                try:
                    url = html_tree.xpath(path_to_audio_file)[0]
                except IndexError:
                    # The URL wasn't found, so something is wrong and we'll have to
                    # fix it in the _post_parse() method.
                    url = ''
                return url

        path = "//tr[@class='dg_tr']/td[6]//@href"
        seed_urls = self.html.xpath(path)
        return DeferringList(seed=seed_urls, fetcher=fetcher)
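Every example on this page follows the same pattern: scrape cheap seed values from the index page, then wrap them in a DeferringList whose fetcher resolves a seed into the real value only when that item is first read. Below is a minimal sketch of that contract, assuming nothing beyond what the fetchers above rely on; it is not juriscraper's actual implementation.

    # Minimal sketch of the seed/fetcher contract. Each item is resolved on
    # first access, so no network request is made for rows nobody reads.
    class DeferringList(object):
        def __init__(self, seed, fetcher):
            self._data = list(seed)        # cheap values from the index page
            self._fetcher = fetcher        # resolves one seed into the real value
            self._resolved = [False] * len(self._data)

        def __len__(self):
            return len(self._data)

        def __getitem__(self, i):
            if not self._resolved[i]:
                self._data[i] = self._fetcher(self._data[i])
                self._resolved[i] = True
            return self._data[i]

        def __iter__(self):
            for i in range(len(self._data)):
                yield self[i]

    # Usage: only the accessed item triggers the fetcher.
    names = DeferringList(seed=['a', 'b', 'c'], fetcher=str.upper)
    print(names[1])  # -> 'B'; 'a' and 'c' are never fetched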
Example #2
    def _get_download_urls(self):
        """We use a fetcher and a DeferringList object and a HEAD request
        to test whether the wpd exists for a case"""
        def fetcher(html_link):
            if self.test_mode_enabled():
                return html_link  # Can't fetch remote during tests
            case_number = re.search(r"(\d+)", html_link).group(0)
            wpd_link = "http://www.ndcourts.gov/wp/%s.wpd" % case_number
            r = requests.head(
                wpd_link,
                allow_redirects=False,
                headers={"User-Agent": "Juriscraper"},
            )
            if r.status_code == 200:
                return wpd_link
            else:
                return html_link

        if self.crawl_date >= date(1998, 10, 1):
            path = '//a/@href[contains(., "/court/opinions/")]'
            seed = list(self.html.xpath(path))
        else:
            path = "//ul//a[text()]/@href"
            seed = list(self.html.xpath(path))
        return DeferringList(seed=seed, fetcher=fetcher)
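Example #2's HEAD request is worth isolating as a technique: a HEAD response carries only headers, so the scraper can test whether the .wpd file exists without downloading it. A hypothetical standalone helper (the timeout and exception handling are additions, not part of the scraper above):

    import requests

    def resource_exists(url, timeout=10):
        """Return True iff the server answers a HEAD request with 200."""
        try:
            r = requests.head(url, allow_redirects=False, timeout=timeout)
        except requests.RequestException:
            return False
        return r.status_code == 200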
Example #3
    def _get_summaries(self):
        def fetcher(url):
            r = requests.get(url,
                             allow_redirects=False,
                             headers={'User-Agent': 'Juriscraper'})
            # Throw an error if a bad status code is returned.
            r.raise_for_status()

            html_tree = html.fromstring(r.text)
            html_tree.make_links_absolute(self.url)

            path = '//p[contains(@style, "justify")]/span[@style="font-weight: bold" ]/../following-sibling::p[not(contains(@style, "justify"))][position()=2]/following-sibling::p'
            summary_string = ""
            for e in html_tree.xpath(path):
                s = html.tostring(e, method='html', encoding='unicode')
                summary_string += s
            return get_clean_body_content(summary_string,
                                          remove_extra_tags=['span'])

        path = "//td[@class='center']/table[3]//tr/td[6]/div/a/@href"
        seed_urls = self.html.xpath(path)
        if seed_urls:
            return DeferringList(seed=seed_urls, fetcher=fetcher)
        else:
            return []
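get_clean_body_content comes from juriscraper's HTML utilities; with remove_extra_tags=['span'] it appears to drop the span tags while keeping their text, so the court's styling spans don't survive into the summary. As a rough stand-in, lxml's Cleaner does the same thing (assuming an lxml build that still bundles lxml.html.clean; newer releases split it into the lxml_html_clean package):

    from lxml.html.clean import Cleaner

    cleaner = Cleaner(remove_tags=['span'])  # drop the tags, keep their text
    print(cleaner.clean_html('<p>Held: <span style="x">reversed</span>.</p>'))
    # -> '<p>Held: reversed.</p>'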
Example #4
    def _get_case_names(self):
        """The case names on the main page only show the first half of long
        case names. As a result, we browse to the pages they link to and
        compile those pages using Selenium and PhantomJS. Normally we wouldn't
        do the compilation step, but, alas, these pages put all their data
        into JavaScript functions, which are then executed to create the page.

        One other note:
         1. When developing, if you stop this after driver.get(), you can get
            the content of the page by doing this:
              https://stackoverflow.com/questions/22739514
        """
        def fetcher(html_link):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                full_url = 'http://2.alalinc.net/library/view/file/?lib=SUPREME&file={seed}'.format(
                    seed=html_link)
                driver = webdriver.PhantomJS(
                    executable_path='/usr/local/phantomjs/phantomjs',
                    service_log_path=os.path.devnull,  # Disable ghostdriver.log
                )

                r = requests.get(
                    full_url,
                    headers={'User-Agent': 'Juriscraper'},
                    cookies=self._cookies,
                )
                r.raise_for_status()

                # Create a fake HTML page from r.text that can be requested by
                # selenium. See: https://stackoverflow.com/questions/24834838/
                driver.get('data:text/html,' + r.text)
                case_name = driver.find_element_by_xpath(
                    "//table[contains(descendant::text(), 'Description')]//tr[2]"
                ).text
                case_name = ' '.join(case_name.split())
                case_name = case_name.split('(')[0]
                case_name = case_name.split('PETITION')[0]
                return case_name

        seed = list(
            self.html.xpath(
                "//value[2]/text()[not(contains(../../value[7]/text(), 'list of decisions'))]"
            ))
        logger.info(
            "Getting {count} pages and rendering them using Selenium browser PhantomJS..."
            .format(count=len(seed)))
        return DeferringList(seed=seed, fetcher=fetcher)
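The data: URL trick in Example #4 hands Selenium a page that was actually fetched with requests (here because the fetch needed self._cookies), and the browser then executes the page's JavaScript. PhantomJS has since been abandoned, so this sketch substitutes headless Chrome; the quote() call is also an addition, since HTML embedded in a data: URL should be percent-encoded:

    from urllib.parse import quote
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        raw = "<body><script>document.write('<h1>built by JS</h1>')</script></body>"
        driver.get('data:text/html,' + quote(raw))  # the script runs on load
        print(driver.find_element('xpath', '//h1').text)  # -> built by JS
    finally:
        driver.quit()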
Example #5
File: ky.py  Project: brianwc/juriscraper
    def _get_case_names(self):
        def fetcher(e):
            """This reaches out to a secondary system and scrapes the correct
             info.
             """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                url = 'http://162.114.92.78/dockets/SearchCaseDetail.asp'
                anchor_text = html.tostring(e,
                                            method='text',
                                            encoding='unicode')
                m = self.docket_number_regex.search(anchor_text)

                r = requests.post(
                    url,
                    headers={'User-Agent': 'Juriscraper'},
                    data={
                        'txtyear': m.group('year'),
                        'txtcasenumber': m.group('docket_num').lstrip('0'),  # leading zeros only
                        'cmdnamesearh': 'Search',
                    },
                )

                # Throw an error if a bad status code is returned.
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)

                # And finally, we parse out the good stuff.
                parties_path = "//tr[descendant::text()[contains(., 'Appell')]]//td[3]//text()"
                case_name_parts = []
                for s in html_tree.xpath(parties_path):
                    if s.strip():
                        case_name_parts.append(titlecase(s.strip().lower()))
                    if len(case_name_parts) == 2:
                        break
                return ' v. '.join(case_name_parts)

        # Get the docket numbers to use for queries.
        path = "//a[@href[contains(., '{m}')]]".format(m=self.hrefs_contain)
        elements = filter(self._has_valid_docket_number, self.html.xpath(path))
        return DeferringList(seed=elements, fetcher=fetcher)
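The fetcher above leans on self.docket_number_regex, defined elsewhere in ky.py, exposing named groups 'year' and 'docket_num'. A hypothetical pattern of the same shape (the real one may differ) shows how the POST fields get filled:

    import re

    # Hypothetical; the actual regex in ky.py may differ.
    docket_number_regex = re.compile(r'(?P<year>\d{4})-(?P<docket_num>\d+)')

    m = docket_number_regex.search('Docket 2012-000470')
    print(m.group('year'))                    # -> 2012
    print(m.group('docket_num').lstrip('0'))  # -> 470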
Example #6
    def _get_case_names(self):
        def fetcher(url):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                r = requests.get(
                    url,
                    allow_redirects=True,
                    headers={'User-Agent': 'Juriscraper'},
                    verify=certifi.where(),
                )
                r.raise_for_status()

                html_tree = html.fromstring(r.text)
                html_tree.make_links_absolute(self.url)
                plaintiff = ''
                defendant = ''
                try:
                    plaintiff = html_tree.xpath(
                        "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                    defendant = html_tree.xpath(
                        "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                except IndexError:
                    logger.warning(
                        "No title or defendant found for {}".format(url))

                if defendant.strip():
                    # If there's a defendant
                    return titlecase('%s v. %s' % (plaintiff, defendant))
                else:
                    return titlecase(plaintiff)

        seed_urls = []
        for html_tree in self.html:
            page_records_count = self._get_opinion_count(html_tree)
            for record in range(page_records_count):
                path = "id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')/td[5]//@href".format(
                    n=record)
                seed_urls.append(html_tree.xpath(path)[0])
        if seed_urls:
            return DeferringList(seed=seed_urls, fetcher=fetcher)
        else:
            return []
Example #7
    def _get_case_names(self):
        def fetcher(url):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                r = requests.get(
                    url,
                    allow_redirects=False,
                    headers={'User-Agent': 'Juriscraper'},
                )
                r.raise_for_status()

                html_tree = html.fromstring(r.text)
                html_tree.make_links_absolute(self.url)
                plaintiff = html_tree.xpath(
                    "//text()[contains(., 'Style')]/ancestor::tr[1]/td[2]/text()"
                )[0]
                defendant = html_tree.xpath(
                    "//text()[contains(., 'v.:')]/ancestor::tr[1]/td[2]/text()"
                )[0]

                if defendant.strip():
                    # If there's a defendant
                    return titlecase('%s v. %s' % (plaintiff, defendant))
                else:
                    return titlecase(plaintiff)

        seed_urls = []
        if isinstance(self.html, list):
            for html_tree in self.html:
                page_records_count = self._get_opinion_count(html_tree)
                for record in xrange(page_records_count):
                    path = "id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')/td[5]//@href".format(
                        n=record)
                    seed_urls.append(html_tree.xpath(path)[0])
        else:
            seed_urls = map(self._return_seed_url, range(self.records_nr))
        if seed_urls:
            return DeferringList(seed=seed_urls, fetcher=fetcher)
        else:
            return []
Example #8
    def _get_download_urls(self):
        """Links from the root page go to a second page where the real links
        are posted.
        """
        def fetcher(seed_url):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                # Goes to second page, grabs the link and returns it.
                html_tree = self._get_html_tree_by_url(seed_url)
                path_to_audio_file = "//*[@class='padboxauto_MediaContent']//a/@href"
                try:
                    url = html_tree.xpath(path_to_audio_file)[0]
                except IndexError:
                    # The URL wasn't found, so something is wrong and we'll have to
                    # fix it in the _post_parse() method.
                    url = ''
                return url

        path = "//tr[@class='dg_tr']/td[6]//@href"
        seed_urls = self.html.xpath(path)
        return DeferringList(seed=seed_urls, fetcher=fetcher)
Example #9
File: tex.py  Project: Ro5s/juriscraper
    def _get_case_names(self):
        def fetcher(url):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                plaintiff = ''
                defendant = ''
                try:
                    plaintiff = html_tree.xpath(
                        "//text()[contains(., 'Style:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                    defendant = html_tree.xpath(
                        "//text()[contains(., 'v.:')]/ancestor::div[@class='span2']/following-sibling::div/text()"
                    )[0]
                except IndexError:
                    logger.warning("No title or defendant found for {}".format(url))

                if defendant.strip():
                    # If there's a defendant
                    return titlecase('%s v. %s' % (plaintiff, defendant))
                else:
                    return titlecase(plaintiff)

        seed_urls = []
        for html_tree in self.html:
            page_records_count = self._get_opinion_count(html_tree)
            for record in range(page_records_count):
                path = "id('ctl00_ContentPlaceHolder1_grdDocuments_ctl00__{n}')/td[5]//@href".format(
                    n=record
                )
                seed_urls.append(html_tree.xpath(path)[0])
        if seed_urls:
            return DeferringList(seed=seed_urls, fetcher=fetcher)
        else:
            return []
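Examples #6, #7, and #9 all end with the same party-joining logic, shown standalone below. titlecase is the PyPI titlecase package these scrapers import; it keeps legal small words such as 'v.' lowercase, which is why the raw '%s v. %s' join is safe to run through it:

    from titlecase import titlecase

    def join_case_name(plaintiff, defendant):
        # Standalone version of the shared tail of the fetchers above.
        if defendant.strip():
            return titlecase('%s v. %s' % (plaintiff, defendant))
        return titlecase(plaintiff)

    print(join_case_name('smith, john', 'state of texas'))
    # -> Smith, John v. State of Texas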
Example #10
    def _get_docket_numbers(self):
        path = "//a[@href[contains(., '{m}')]]".format(
            m=self.hrefs_contain)
        elements = filter(self._has_valid_docket_number, self.html.xpath(path))
        return map(self._return_docket_number_from_str, elements)

    def _has_valid_docket_number(self, e):
        text = html.tostring(e, method='text', encoding='unicode')
        return bool(self.docket_number_regex.search(text))

    def _return_docket_number_from_str(self, e):