def parse_and_scrape_site(self, mod, full_crawl):
        court_str = mod.__name__.split('.')[-1].split('_')[0]
        logger.info("Using court_str: \"%s\"" % court_str)

        for site in site_yielder(mod.Site().back_scrape_iterable, mod):
            site.parse()
            self.scrape_court(site, full_crawl=True)
    def _download(self, request_dict={}):
        """This is another of the cursed MS asp.net pages with damned POST parameters like __EVENTVALIDATION.
         These are near impossible to scrape without using Selenium.
        """
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        driver.implicitly_wait(30)
        logger.info("Now downloading case page at: %s" % self.url)
        driver.get(self.url)

        # Select the correct drop downs, then submit.
        path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
            type=self.opinion_type)
        driver.find_element_by_xpath(path_to_opinion_type).click()
        path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
            d=self.release_date)
        driver.find_element_by_xpath(path_to_date).click()
        path_to_submit = "//input[@id='cmdSearch']"
        driver.find_element_by_xpath(path_to_submit).click()

        # Selenium doesn't give us the actual status code; we have to hope.
        self.status = 200

        text = self._clean_text(driver.page_source)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Example No. 3
    def _download_backwards(self, page_year):
        logger.info("Running PhantomJS with params: %s" % (page_year,))
        driver = webdriver.PhantomJS(
            executable_path=phantomjs_executable_path,
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        driver.implicitly_wait(30)
        driver.get(self.url)

        # Select the year (this won't trigger a GET unless it's changed)
        path = "//*[@id='ContentPlaceHolder1_PageContent_OpinionYears']/option[@value={year}]".format(year=page_year[1])
        option = driver.find_element_by_xpath(path)
        option.click()

        if page_year[0] != 0:
            # Not the first page; go to the one desired.
            links = driver.find_elements_by_xpath("//a[@href[contains(., 'Page')]]")
            links[page_year[0] - 1].click()

        text = self._clean_text(driver.page_source)
        driver.quit()
        html_tree = html.fromstring(text)

        html_tree.rewrite_links(fix_links_in_lxml_tree,
                                base_href=self.request['url'])
        self.html = html_tree
        self.status = 200
Example No. 4
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            # Note that this is returning a list of HTML trees.
            html_trees = [super(Site, self)._download(request_dict=request_dict)]
        else:
            html_l = OpinionSite._download(self)
            s = requests.session()
            html_trees = []
            # The latest urls on the page (the slice below takes the first four).
            path = "//td[@width='50%'][{court_index}]/h3[contains(., '{year}')]/following::ul[1]//a/@href".format(
                court_index=self.court_index,
                year=self.date.year,
            )
            for url in html_l.xpath(path)[0:4]:
                logger.info("Downloading Kansas page at: {url}".format(url=url))
                r = s.get(url,
                          headers={'User-Agent': 'Juriscraper'},
                          **request_dict)
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(url)

                remove_anchors = lambda url: url.split('#')[0]
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
        return html_trees
Example No. 5
    def _download_backwards(self, page_year):
        logger.info("Running PhantomJS with params: %s" % (page_year, ))
        driver = webdriver.PhantomJS(
            executable_path='/usr/local/phantomjs/phantomjs',
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        driver.implicitly_wait(30)
        driver.get(self.url)

        # Select the year (this won't trigger a GET unless it's changed)
        path = "//*[@id='ContentPlaceHolder1_PageContent_OpinionYears']/option[@value={year}]".format(
            year=page_year[1])
        option = driver.find_element_by_xpath(path)
        option.click()

        if page_year[0] != 0:
            # Not the first page; go to the one desired.
            links = driver.find_elements_by_xpath(
                "//a[@href[contains(., 'Page')]]")
            links[page_year[0] - 1].click()

        text = self._clean_text(driver.page_source)
        driver.quit()
        html_tree = html.fromstring(text)

        html_tree.rewrite_links(self._link_repl)
        self.html = html_tree
        self.status = 200
Example No. 6
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            return super(Site, self)._download(request_dict=request_dict)
        else:
            html_l = super(Site, self)._download(request_dict)
            s = requests.session()
            html_trees = []
            for url in html_l.xpath("//*[@class='cen']/a/@href"):
                logger.info("Getting sub-url: {url}".format(url=url))
                r = s.get(
                    url,
                    headers={'User-Agent': 'Juriscraper'},
                    verify=certifi.where(),
                    **request_dict
                )
                r.raise_for_status()

                # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
                if r.encoding == 'ISO-8859-1':
                    r.encoding = 'cp1252'

                # Grab the content
                text = self._clean_text(r.text)
                html_tree = html.fromstring(text)
                html_tree.make_links_absolute(url)

                remove_anchors = lambda url: url.split('#')[0]
                html_tree.rewrite_links(remove_anchors)
                html_trees.append(html_tree)
            return html_trees
Example No. 7
 def _download_backwards(self, d):
     yy = str(d if d >= 10 else '0{}'.format(d))
     logger.info("Running backscraper for year: 20%s" % yy)
     self.running_back_scraper = True
     self.set_url()
     self.url = self.url.replace(self.yy, yy)
     self.html = self._download()
Example No. 8
    def _get_case_html_page(self, html_trees, html_l, request_dict):
        """Gets each of the individual case pages"""
        s = requests.session()
        for case_url in html_l.xpath(self.case_xpath):
            logger.info("  Getting sub-page at: %s" % case_url)
            r = s.get(case_url,
                      headers={'User-Agent': 'Juriscraper'},
                      verify=certifi.where(),
                      timeout=60,
                      **request_dict)

            r.raise_for_status()

            # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
            if r.encoding == 'ISO-8859-1':
                r.encoding = 'cp1252'

            # Grab the content
            text = self._clean_text(r.text)
            html_tree = html.fromstring(text)
            html_tree.make_links_absolute(self.url)

            remove_anchors = lambda url: url.split('#')[0]
            html_tree.rewrite_links(remove_anchors)
            html_trees.append(html_tree)
        return html_trees
Example No. 9
    def _get_case_html_page(self, html_trees, html_l, request_dict):
        """Gets each of the individual case pages"""
        s = requests.session()
        for case_url in html_l.xpath(self.case_xpath):
            logger.info("  Getting sub-page at: %s" % case_url)
            r = s.get(
                case_url,
                headers={'User-Agent': 'Juriscraper'},
                verify=certifi.where(),
                **request_dict
            )

            r.raise_for_status()

            # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
            if r.encoding == 'ISO-8859-1':
                r.encoding = 'cp1252'

            # Grab the content
            text = self._clean_text(r.text)
            html_tree = html.fromstring(text)
            html_tree.make_links_absolute(self.url)

            remove_anchors = lambda url: url.split('#')[0]
            html_tree.rewrite_links(remove_anchors)
            html_trees.append(html_tree)
        return html_trees
Example No. 10
    def _download(self, request_dict={}):
        """This is another of the cursed MS asp.net pages with damned POST
          parameters like __EVENTVALIDATION. These are near impossible to
          scrape without using Selenium.
        """
        if self.method == 'LOCAL':
            return super(Site, self)._download(request_dict=request_dict)
        else:
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
            )
            driver.implicitly_wait(30)
            logger.info("Now downloading case page at: %s" % self.url)
            driver.get(self.url)

            # Select the correct drop downs, then submit.
            path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
                type=self.opinion_type)
            driver.find_element_by_xpath(path_to_opinion_type).click()
            path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
                d=self.release_date)
            driver.find_element_by_xpath(path_to_date).click()
            path_to_submit = "//input[@id='cmdSearch']"
            driver.find_element_by_xpath(path_to_submit).click()

            # Selenium doesn't give us the actual status code; we have to hope.
            self.status = 200

            text = self._clean_text(driver.page_source)
            html_tree = html.fromstring(text)
            html_tree.rewrite_links(self._link_repl)
        return html_tree
Example No. 11
    def parse_and_scrape_site(self, mod, full_crawl):
        court_str = mod.__name__.split(".")[-1].split("_")[0]
        logger.info('Using court_str: "%s"' % court_str)

        for site in site_yielder(mod.Site().back_scrape_iterable, mod):
            site.parse()
            self.scrape_court(site, full_crawl=True)
Example No. 12
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        self.verbosity = int(options.get("verbosity", 1))
        daemon_mode = options.get("daemonmode", False)

        full_crawl = options.get("full_crawl", False)

        try:
            rate = int(options["rate"])
        except (ValueError, AttributeError, TypeError):
            rate = 30

        court_id = options.get("court_id")
        if not court_id:
            raise CommandError("You must specify a court as a package or " "module.")
        else:
            module_strings = build_module_list(court_id)
            if not len(module_strings):
                raise CommandError("Unable to import module or package. " "Aborting.")

            logger.info("Starting up the scraper.")
            num_courts = len(module_strings)
            wait = (rate * 60) / num_courts
            i = 0
            while i < num_courts:
                # this catches SIGTERM, so the code can be killed safely.
                if die_now:
                    logger.info("The scraper has stopped.")
                    sys.exit(1)

                package, module = module_strings[i].rsplit(".", 1)

                mod = __import__("%s.%s" % (package, module), globals(), locals(), [module])
                # noinspection PyBroadException
                try:
                    self.parse_and_scrape_site(mod, full_crawl)
                except Exception, e:
                    # noinspection PyBroadException
                    try:
                        msg = (
                            "********!! CRAWLER DOWN !!***********\n"
                            "*****scrape_court method failed!*****\n"
                            "********!! ACTION NEEDED !!**********\n%s" % traceback.format_exc()
                        )
                        logger.critical(msg)

                        # opinions.united_states.federal.ca9_u --> ca9
                        court_str = mod.Site.__module__.split(".")[-1].split("_")[0]
                        court = Court.objects.get(pk=court_str)
                        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
                    except Exception, e:
                        # This is very important. Without this, an exception
                        # above will crash the caller.
                        pass
                finally:
Example No. 13
 def _get_case_names(self):
     path = '//tr/td[4]/text()'
     names = []
     for s in self.html.xpath(path):
         if s.strip():
             names.append(s)
     logger.info(str(len(names)))
     return names
Example No. 14
 def _get_case_names(self):
     path = '//tr/td[4]/text()'
     names = []
     for s in self.html.xpath(path):
         if s.strip():
             names.append(s)
     logger.info(str(len(names)))
     return names
Example No. 15
 def _download_backwards(self, d):
     logger.info("Running backscraper for year: 20{}".format(d))
     self.url = self.back_scrape_url.format(d if d >= 10 else '0{}'.format(d))
     self.html = self._download()
     if self.html is not None:
         # Setting status is important because it prevents the download
         # function from being run a second time by the parse method.
         self.status = 200
Example No. 16
def follow_redirections(r, s):
    """
    Recursive function that follows meta refresh redirections if they exist.
    """
    redirected, url = test_for_meta_redirections(r)
    if redirected:
        logger.info("Following a meta redirection to: %s" % url.encode("utf-8"))
        r = follow_redirections(s.get(url), s)
    return r
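
A minimal usage sketch for the helper above, assuming a requests session is already open; the URL below is a hypothetical placeholder:

import requests

s = requests.session()
r = s.get("http://example.com/opinions/index.html")  # hypothetical URL
# Keep following <meta http-equiv="refresh"> hops until none remain.
r = follow_redirections(r, s)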
Example No. 17
 def __init__(self, *args, **kwargs):
     logger.warn("Using DeferringList object which cannot be sorted until "
                 "fetched. Note that in usual processing, the fetching "
                 "happens before the sorting, so this is OK.")
     logger.info("DeferringList has %s entries to fetch." %
                 len(kwargs["seed"]))
     self._data = kwargs["seed"]
     self._fetched_items = [False] * len(kwargs["seed"])
     self._fetching_function = kwargs["fetcher"]
Example No. 18
def add_delay(delay=0, deviation=0):
    """Create a semi-random delay.

    Delay is the number of seconds your program will be stopped for, and
    deviation is the number of seconds that the delay can vary.
    """
    duration = random.randrange(delay - deviation, delay + deviation)
    logger.info("Adding a delay of %s seconds. Please wait." % duration)
    time.sleep(duration)
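
A usage sketch for add_delay, assuming the caller passes explicit values (with the defaults of delay=0 and deviation=0, random.randrange(0, 0) would raise ValueError, so real values are expected):

# Pause for roughly 10 seconds, varying by up to 3 seconds either way.
add_delay(delay=10, deviation=3)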
Example No. 19
 def _download_backwards(self, d):
     logger.info("Running backscraper for year: 20{}".format(d))
     self.url = self.back_scrape_url.format(
         d if d >= 10 else '0{}'.format(d))
     self.html = self._download()
     if self.html is not None:
         # Setting status is important because it prevents the download
         # function from being run a second time by the parse method.
         self.status = 200
Example No. 20
 def __init__(self, *args, **kwargs):
     logger.warn("Using DeferringList object which cannot be sorted until "
                 "fetched. Note that in usual processing, the fetching "
                 "happens before the sorting, so this is OK.")
     logger.info("DeferringList has %s entries to fetch." %
                 len(kwargs['seed']))
     self._data = kwargs['seed']
     self._fetched_items = [False] * len(kwargs['seed'])
     self._fetching_function = kwargs['fetcher']
Example No. 21
def follow_redirections(r, s):
    """
    Recursive function that follows meta refresh redirections if they exist.
    """
    redirected, url = test_for_meta_redirections(r)
    if redirected:
        logger.info('Following a meta redirection to: %s' % url)
        r = follow_redirections(s.get(url), s)
    return r
Example No. 22
def add_delay(delay=0, deviation=0):
    """Create a semi-random delay.

    Delay is the number of seconds your program will be stopped for, and
    deviation is the number of seconds that the delay can vary.
    """
    duration = random.randrange(delay - deviation, delay + deviation)
    logger.info("Adding a delay of %s seconds. Please wait." % duration)
    time.sleep(duration)
Example No. 23
 def _get_case_names(self):
     path = "//tr/td[4]/text()"
     names = []
     for s in self.html.xpath(path):
         s = clean_if_py3(s)
         if s.strip():
             names.append(s)
     logger.info(str(len(names)))
     return names
Example No. 24
 def __getitem__(self, item):
     if self._fetched_items[item]:
         return self._data[item]
     else:
         # Go get the item using the fetching function
         logger.info("Getting deferred value from seed: %s" % self._data[item])
         new_val = self._fetching_function(self._data[item])
         self._data[item] = new_val
         self._fetched_items[item] = True
         return new_val
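
A sketch showing how the __init__ and __getitem__ methods above work together; the seed values and fetcher function below are hypothetical:

def fetch_name(seed_url):
    # Hypothetical fetcher: resolve a seed value into a case name.
    return "Case name for %s" % seed_url

deferred_names = DeferringList(
    seed=['http://example.com/a', 'http://example.com/b'],
    fetcher=fetch_name,
)
# Nothing is fetched at construction time; indexing an item triggers the
# fetcher once and caches the result for later reads.
first = deferred_names[0]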
Example No. 25
    def _download_backwards(self, d):
        self.__set_paths(d)

        self.url = self.url_base.format(year=self.year)
        logger.info('Scraping year {}'.format(self.year))
        self.html = self._download()
        if self.html is not None:
            # Setting status is important because it prevents the download
            # function from being run a second time by the parse method.
            self.status = 200
Example No. 26
def follow_redirections(r: Response, s: Session) -> Response:
    """
    Parse and recursively follow meta refresh redirections if they exist until
    there are no more.
    """
    redirected, url = test_for_meta_redirections(r)
    if redirected:
        logger.info(f"Following a meta redirection to: {url.encode()}")
        r = follow_redirections(s.get(url), s)
    return r
Example No. 27
    def _download_backwards(self, d):
        self.__set_paths(d)

        self.url = self.url_base.format(year=self.year)
        logger.info("Scraping year {}".format(self.year))
        self.html = self._download()
        if self.html is not None:
            # Setting status is important because it prevents the download
            # function from being run a second time by the parse method.
            self.status = 200
Example No. 28
    def _download_backwards(self, year):
        self.url = self._get_url(year)
        logger.info("Backscraping for year %d: %s" % (year, self.url))
        self.html = self._download()

        # Setting status is important because it prevents the download
        # function from being run a second time by the parse method.
        if self.html is not None:
            self.status = 200
            self._process_html()
Example No. 29
def follow_redirections(r, s):
    """
    Parse and recursively follow meta refresh redirections if they exist until
    there are no more.
    """
    redirected, url = test_for_meta_redirections(r)
    if redirected:
        logger.info('Following a meta redirection to: %s' % url.encode('utf-8'))
        r = follow_redirections(s.get(url), s)
    return r
Example No. 30
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        #
        # PLEASE NOTE: if you're adding a new example file, ensure that,
        # if any of the opinion links on the page do not link directly to
        # a pdf url, you manually edit your example file and add '.pdf'
        # to the end of all those opinion anchor hrefs. We do this in order
        # to prevent the tests from hitting the network.  HINT: if your new
        # test takes any more than a split second to run, it's probably hitting
        # the network and needs to be fixed as explained above.
        #
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.method != 'LOCAL':
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example No. 31
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        #
        # PLEASE NOTE: if you're adding a new example file, ensure that,
        # if any of the opinion links on the page do not link directly to
        # a pdf url, you manually edit your example file and add '.pdf'
        # to the end of all those opinion anchor hrefs. We do this in order
        # to prevent the tests from hitting the network.  HINT: if your new
        # test takes any more than a split second to run, it's probably hitting
        # the network and needs to be fixed as explained above.
        #
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.test_mode_enabled():
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.test_mode_enabled():
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example No. 32
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit network in _extract_cases_from_sub_page
        # because awful court site doesn't direct link to PDF resources on date
        # listing pages
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            for ahref in html_l.xpath(self.base_path):
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # process all sub-pages
                if self.next_subpage_path is not None and self.method != 'LOCAL':
                    while True:
                        next_subpage_html = self.get_next_page(
                            html_tree, self.next_subpage_path, request_dict,
                            url)
                        if next_subpage_html is None:
                            break
                        self._extract_cases_from_sub_page(
                            next_subpage_html, date_obj)
                        html_trees.append((next_subpage_html, date_obj))
                        html_tree = next_subpage_html

            if self.method != 'LOCAL':
                next_page_html = self.get_next_page(html_l,
                                                    self.next_page_path,
                                                    request_dict, self.url)
                if next_page_html is not None:
                    html_list.append(next_page_html)

        return html_trees
Example No. 33
 def _download(self, request_dict={}):
     if self.method == 'LOCAL':
         return super(Site, self)._download(request_dict=request_dict)
     else:
         html_l = super(Site, self)._download(request_dict)
         html_trees = []
         for url in html_l.xpath("//*[@class='cen']/a/@href"):
             logger.info("Getting sub-url: {url}".format(url=url))
             html_tree = self._get_html_tree_by_url(url, request_dict)
             html_trees.append(html_tree)
         return html_trees
Example No. 34
 def _download_backwards(self, d):
     self.backwards_days = 7
     self.case_date = d
     logger.info("Running backscraper with date range: %s to %s",
                 self.case_date - timedelta(days=self.backwards_days),
                 self.case_date)
     self.html = self._download()
     if self.html is not None:
         # Setting status is important because it prevents the download
         # function from being run a second time by the parse method.
         self.status = 200
Example No. 35
 def __getitem__(self, item):
     if self._fetched_items[item]:
         return self._data[item]
     else:
         # Go get the item using the fetching function
         logger.info("Getting deferred value from seed: %s" %
                     self._data[item])
         new_val = self._fetching_function(self._data[item])
         self._data[item] = new_val
         self._fetched_items[item] = True
         return new_val
Example No. 36
 def _download(self, request_dict={}):
     if self.method == 'LOCAL':
         return super(Site, self)._download(request_dict=request_dict)
     else:
         html_l = super(Site, self)._download(request_dict)
         html_trees = []
         for url in html_l.xpath("//*[@class='cen']/a/@href"):
             logger.info("Getting sub-url: {url}".format(url=url))
             html_tree = self._get_html_tree_by_url(url, request_dict)
             html_trees.append(html_tree)
         return html_trees
Example No. 37
 def _download_backwards(self, _):
     """Walk over all "Archive" links on Archive page,
     extract cases dictionaries, and add to self.cases
     """
     self.archive = True
     self.url = self.url + 'opinions-archive/'
     landing_page_html = self._download()
     path = '//div[@class="main-content-wrapper"]//a[contains(./text(), "Opinions Archive")]/@href'
     for archive_page_url in landing_page_html.xpath(path):
         logger.info("Back scraping archive page: %s" % archive_page_url)
         archive_page_html = self._get_html_tree_by_url(archive_page_url)
         self.extract_archive_cases(archive_page_html)
Example No. 38
 def _download_backwards(self, _):
     """Walk over all "Archive" links on Archive page,
     extract cases dictionaries, and add to self.cases
     """
     self.archive = True
     self.url = self.url + "opinions-archive/"
     landing_page_html = self._download()
     path = '//div[@class="main-content-wrapper"]//a[contains(./text(), "Opinions Archive")]/@href'
     for archive_page_url in landing_page_html.xpath(path):
         logger.info("Back scraping archive page: %s" % archive_page_url)
         archive_page_html = self._get_html_tree_by_url(archive_page_url)
         self.extract_archive_cases(archive_page_html)
Example No. 39
    def handle(self, *args, **options):
        global die_now

        # this line is used for handling SIGTERM (CTRL+4), so things can die
        # safely
        signal.signal(signal.SIGTERM, signal_handler)

        module_strings = build_module_list(options['court_id'])
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (options['rate'] * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)

            mod = __import__(
                "%s.%s" % (package, module),
                globals(),
                locals(),
                [module]
            )
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, options['full_crawl'])
            except Exception, e:
                # noinspection PyBroadException
                try:
                    msg = ('********!! CRAWLER DOWN !!***********\n'
                           '*****scrape_court method failed!*****\n'
                           '********!! ACTION NEEDED !!**********\n%s' %
                           traceback.format_exc())
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(
                        log_level='CRITICAL',
                        court=court,
                        message=msg
                    ).save()
                except Exception, e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
Example No. 40
def extract_recap_documents(docs,
                            skip_ocr=False,
                            order_by=None,
                            queue=None,
                            queue_length=100):
    """Loop over RECAPDocuments and extract their contents. Use OCR if requested.

    :param docs: A queryset containing the RECAPDocuments to be processed.
    :type docs: Django Queryset
    :param skip_ocr: Whether OCR should be completed (False) or whether items
    should simply be updated to have status OCR_NEEDED.
    :type skip_ocr: Bool
    :param order_by: An optimization parameter. You may opt to order the
    processing by 'small-first' or 'big-first'.
    :type order_by: str
    :param queue: The celery queue to send the content to.
    :type queue: str
    :param queue_length: The number of items to send to the queue at a time.
    :type queue_length: int
    """
    docs = docs.exclude(filepath_local='')
    if skip_ocr:
        # Focus on the items that we don't know if they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)

    if order_by is not None:
        if order_by == 'small-first':
            docs = docs.order_by('page_count')
        elif order_by == 'big-first':
            docs = docs.order_by('-page_count')

    tasks = []
    completed = 0
    count = docs.count()
    logger.info("There are %s documents to process." % count)
    for pk in docs.values_list('pk', flat=True):
        # Send the items off for processing.
        last_item = (count == completed + 1)
        tasks.append(
            extract_recap_pdf.s(pk, skip_ocr).set(priority=5, queue=queue))

        # Every queue_length items, send the tasks to Celery.
        if (len(tasks) >= queue_length) or last_item:
            logger.info("Sent %s tasks to celery. We have sent %s "
                        "items so far." % (len(tasks), completed + 1))
            job = group(*tasks)
            job.apply_async().join()
            tasks = []

        completed += 1
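
A hedged invocation sketch for extract_recap_documents, assuming a Django shell context; the queryset, queue name, and batch size below are placeholders:

# OCR every document that needs it, smallest first, sending tasks to a
# hypothetical 'celery' queue in batches of 50.
docs = RECAPDocument.objects.all()
extract_recap_documents(docs, skip_ocr=False, order_by='small-first',
                        queue='celery', queue_length=50)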
    def _download(self, request_dict={}):
        """This is another of the cursed MS asp.net pages with damned POST
          parameters like __EVENTVALIDATION. These are near impossible to
          scrape without using Selenium.
        """
        if self.method == 'LOCAL':
            # This is an arbitrary date that we need to set
            # for our compar.json test to pass
            self.case_date = convert_date_string('2017-08-13')
            return super(Site, self)._download(request_dict=request_dict)
        else:
            driver = webdriver.PhantomJS(
                executable_path='/usr/local/phantomjs/phantomjs',
                service_log_path=os.path.devnull,  # Disable ghostdriver.log
                # Without these args, when you get self.url, you'll still be at
                # about:config because the SSL on this site is so terrible.
                service_args=[
                    '--ignore-ssl-errors=true', '--ssl-protocol=tlsv1'
                ],
            )
            driver.implicitly_wait(30)
            logger.info("Now downloading case page at: %s" % self.url)
            driver.get(self.url)

            # Select the correct drop downs, then submit.
            path_to_opinion_type = "//select[@id='ddlTypes']/option[@value='{type}']".format(
                type=self.opinion_type)
            driver.find_element_by_xpath(path_to_opinion_type).click()
            path_to_date = "//select[@id='ddlMonths']/option[@value='{d}']".format(
                d=self.release_date)

            try:
                driver.find_element_by_xpath(path_to_date).click()
            except NoSuchElementException:
                # This is not uncommon early in the month (or if there are
                # no opinions published in the current month), so failures
                # resulting from this raise can probably be ignored.
                warning = 'Current month (%s) not yet available in portal--common occurrence early in the month.'
                raise InsanityException(warning % self.release_date)

            path_to_submit = "//input[@id='cmdSearch']"
            driver.find_element_by_xpath(path_to_submit).click()

            # Selenium doesn't give us the actual status code; we have to hope.
            self.status = 200

            text = self._clean_text(driver.page_source)
            html_tree = html.fromstring(text)
            html_tree.rewrite_links(fix_links_in_lxml_tree,
                                    base_href=self.request['url'])
        return html_tree
Example No. 42
def get_binary_content(download_url, cookies, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL certificates and empty file errors.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an item during testing
    :return: Two values. The first is a msg indicating any errors encountered. If blank, that indicates success. The
    second value is the response object containing the downloaded file.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            mr = MockRequest(
                url=os.path.join(settings.INSTALL_ROOT, 'alert', download_url))
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url, headers=headers, cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url,
                          verify=False,
                          headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, r

            # test for and follow meta redirects
            r = follow_redirections(r, s)
    except:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        print msg
        return msg, r

    # Success!
    return '', r
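
A usage sketch for get_binary_content, assuming cookies taken from a scraped site object and the module-level logger used elsewhere in these snippets; the URL and site attribute are hypothetical:

msg, r = get_binary_content(
    'http://example.com/opinions/2017/opinion.pdf',  # hypothetical URL
    cookies=site.cookies,  # hypothetical attribute holding the site's cookies
)
if msg:
    # A non-empty msg signals a failure such as an empty or missing file.
    logger.critical(msg)
else:
    content = r.content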
Example No. 43
    def _get_case_names(self):
        """The case names on the main page only show the first half of long
        case names. As a result, we browse to the pages they link to and
        compile those pages using Selenium and PhantomJS. Normally we wouldn't
        do the compilation step, but, alas, these pages put all their data
        into JavaScript functions, which are then executed to create the page.

        A couple other notes:
         1. When developing, if you stop this after driver.get(), you can get
            the content of the page by doing this:
              https://stackoverflow.com/questions/22739514
        """
        def fetcher(html_link):
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                full_url = 'http://2.alalinc.net/library/view/file/?lib=SUPREME&file={seed}'.format(
                    seed=html_link)
                driver = webdriver.PhantomJS(
                    executable_path='/usr/local/phantomjs/phantomjs',
                    service_log_path=os.path.devnull,  # Disable ghostdriver.log
                )

                r = requests.get(
                    full_url,
                    headers={'User-Agent': 'Juriscraper'},
                    cookies=self._cookies,
                )
                r.raise_for_status()

                # Create a fake HTML page from r.text that can be requested by
                # selenium. See: https://stackoverflow.com/questions/24834838/
                driver.get('data:text/html,' + r.text)
                case_name = driver.find_element_by_xpath(
                    "//table[contains(descendant::text(), 'Description')]//tr[2]"
                ).text
                case_name = ' '.join(case_name.split())
                case_name = case_name.split('(')[0]
                case_name = case_name.split('PETITION')[0]
                return case_name

        seed = list(
            self.html.xpath(
                "//value[2]/text()[not(contains(../../value[7]/text(), 'list of decisions'))]"
            ))
        logger.info(
            "Getting {count} pages and rendering them using Selenium browser PhantomJS..."
            .format(count=len(seed)))
        return DeferringList(seed=seed, fetcher=fetcher)
Example No. 44
 def _download_backwards(self, _):
     """Overriding this method from parent iowa class because the
     site link/page structure for archived Court of Appeals opinions
     is different than that of Archived Supreme Court opinions
     """
     self.archive = True
     landing_page_html = self._download()
     path_filter_class = 'contains(./@class, "nav-link")'
     path_filter_text = 'contains(./text(), "Archived Court of Appeals Opinions")'
     path = '//a[%s][%s]/@href' % (path_filter_class, path_filter_text)
     for archive_page_url in landing_page_html.xpath(path):
         logger.info("Back scraping archive page: %s" % archive_page_url)
         archive_page_html = self._get_html_tree_by_url(archive_page_url)
         self.extract_archive_cases(archive_page_html)
Example No. 45
 def _download_backwards(self, _):
     """Overriding this method from parent iowa class because the
     site link/page structure for archived Court of Appeals opinions
     is different than that of Archived Supreme Court opinions
     """
     self.archive = True
     landing_page_html = self._download()
     path_filter_class = 'contains(./@class, "nav-link")'
     path_filter_text = 'contains(./text(), "Archived Court of Appeals Opinions")'
     path = '//a[%s][%s]/@href' % (path_filter_class, path_filter_text)
     for archive_page_url in landing_page_html.xpath(path):
         logger.info("Back scraping archive page: %s" % archive_page_url)
         archive_page_html = self._get_html_tree_by_url(archive_page_url)
         self.extract_archive_cases(archive_page_html)
Example No. 46
def get_binary_content(download_url, cookies, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL certificates and empty file errors.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an item during testing
    :return: Two values. The first is a msg indicating any errors encountered. If blank, that indicates success. The
    second value is the response object containing the downloaded file.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url, traceback.format_exc())
        return msg, None
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            mr = MockRequest(url=os.path.join(settings.INSTALL_ROOT, 'alert', download_url))
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url,
                          headers=headers,
                          cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url,
                          verify=False,
                          headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url, traceback.format_exc())
                return msg, r

            # test for and follow meta redirects
            r = follow_redirections(r, s)
    except:
        msg = 'DownloadingError: %s\n%s' % (download_url, traceback.format_exc())
        print msg
        return msg, r

    # Success!
    return '', r
Example No. 47
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit network in _extract_cases_from_sub_page
        # because awful court site doesn't direct link to PDF resources on date
        # listing pages
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.method != 'LOCAL':
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example No. 48
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit network in _extract_cases_from_sub_page
        # because awful court site doesn't direct link to PDF resources on date
        # listing pages
        # PLEASE ALSO NOTE: coloctapp_example_3.html is supposed to have 0
        # results.  It is a blank page test case covering is_this_a_blank_page().
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            hrefs = html_l.xpath(self.base_path)
            for ahref in hrefs:
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # DEACTIVATED BY arderyp ON 2018.06.07, SEE NOTE ON get_next_page()
                # # process all sub-pages
                # if self.next_subpage_path and self.method != 'LOCAL':
                #     while True:
                #         next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                #         if next_subpage_html is None:
                #             break
                #
                #         self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                #         html_trees.append((next_subpage_html, date_obj))
                #         html_tree = next_subpage_html

        return html_trees
Example No. 49
    def _download(self, request_dict={}):
        self.request_dict = request_dict
        landing_page_html = super(Site, self)._download(request_dict)

        # Test/example files should use html from direct resource page
        # PLEASE NOTE: Tests still hit network in _extract_cases_from_sub_page
        # because awful court site doesn't direct link to PDF resources on date
        # listing pages
        if self.method == 'LOCAL':
            date_string = landing_page_html.xpath('//h3')[0].text_content()
            date_obj = convert_date_string(date_string)
            self._extract_cases_from_sub_page(landing_page_html, date_obj)
            return [landing_page_html]

        html_trees = []
        html_list = [landing_page_html]

        while len(html_list) > 0:
            html_l = html_list[0]
            html_list = html_list[1:]

            # Loop over sub-pages
            for ahref in html_l.xpath(self.base_path):
                date_string = ahref.xpath("./text()")[0]
                url = ahref.xpath("./@href")[0]
                date_obj = convert_date_string(date_string)
                logger.info("Getting sub-url: %s" % url)

                # Fetch sub-page's content
                html_tree = self._get_html_tree_by_url(url, self.request_dict)
                self._extract_cases_from_sub_page(html_tree, date_obj)
                html_trees.append((html_tree, date_obj))

                # process all sub-pages
                if self.next_subpage_path is not None and self.method != 'LOCAL':
                    while True:
                        next_subpage_html = self.get_next_page(html_tree, self.next_subpage_path, request_dict, url)
                        if next_subpage_html is None:
                            break
                        self._extract_cases_from_sub_page(next_subpage_html, date_obj)
                        html_trees.append((next_subpage_html, date_obj))
                        html_tree = next_subpage_html

            if self.method != 'LOCAL':
                next_page_html = self.get_next_page(html_l, self.next_page_path, request_dict, self.url)
                if next_page_html is not None:
                    html_list.append(next_page_html)

        return html_trees
Example No. 50
        def fetcher(elem):
            """This reaches out to a secondary system and scrapes the correct
             info.
             """
            if self.method == 'LOCAL':
                return "No case names fetched during tests."
            else:
                ip_addresses = ['162.114.92.72', '162.114.92.78']
                for ip_address in ip_addresses:
                    last_item = ip_addresses.index(ip_address) == len(ip_addresses) - 1
                    url = 'http://%s/dockets/SearchCaseDetail.asp' % ip_address
                    anchor_text = html.tostring(elem, method='text', encoding='unicode')
                    m = self.docket_number_regex.search(anchor_text)

                    try:
                        r = requests.post(
                            url,
                            headers={'User-Agent': 'Juriscraper'},
                            timeout=5,
                            verify=certifi.where(),
                            data={
                                'txtyear': m.group('year'),
                                'txtcasenumber': m.group('docket_num').strip('0'),
                                'cmdnamesearh': 'Search',
                            },
                        )

                        # Throw an error if a bad status code is returned,
                        # otherwise, break the loop so we don't try more ip
                        # addresses than necessary.
                        r.raise_for_status()
                        break
                    except HTTPError, e:
                        logger.info("404 error connecting to: {ip}".format(
                            ip=ip_address,
                        ))
                        if e.response.status_code == 404 and not last_item:
                            continue
                        else:
                            raise e
                    except (ConnectionError, Timeout), e:
                        logger.info("Timeout/Connection error connecting to: {ip}".format(
                            ip=ip_address,
                        ))
                        if not last_item:
                            continue
                        else:
                            raise e
Example No. 51
    def _download(self, request_dict={}):
        html = super(Site, self)._download(request_dict)
        self.extract_cases(html)
        if self.method == 'LOCAL' or self.archive:
            return html

        # Walk over pagination "Next" page(s), if present
        proceed = True
        while proceed:
            next_page_url = self.extract_next_page_url(html)
            if next_page_url:
                logger.info('Scraping next page: %s' % next_page_url)
                html = self._get_html_tree_by_url(next_page_url)
                self.extract_cases(html)
            else:
                proceed = False
Example No. 52
    def _download(self, request_dict={}):
        if self.method == 'LOCAL':
            # Note that this is returning a list of HTML trees.
            html_trees = [super(Site, self)._download(request_dict=request_dict)]
        else:
            html_l = OpinionSite._download(self)
            html_trees = []
            path = "//td[@width='50%'][{court_index}]/h3[contains(., '{year}')]/following::ul[1]//a/@href".format(
                court_index=self.court_index,
                year=self.date.year,
            )

            # The latest 7 urls on the page.
            for url in html_l.xpath(path)[0:7]:
                logger.info("Downloading Kansas page at: {url}".format(url=url))
                html_tree = self._get_html_tree_by_url(url, request_dict)
                html_trees.append(html_tree)
        return html_trees
Example No. 53
    def _fetch_case_name(self, year, number):
        """Fetch case name for a given docket number + publication year pair.

        Some resources show 'Public Access Restricted' messages and do not
        provide parseable case name information.  These will be skipped by
        our system by returning False below.  The only other approach would
        be to parse the case name from the raw PDF text itself.
        """

        ip_addresses = ['162.114.92.72', '162.114.92.78']
        for ip_address in ip_addresses:
            last_ip = (ip_address == ip_addresses[-1])
            url = 'http://%s/dockets/SearchCaseDetail.asp' % ip_address

            try:
                r = requests.post(
                    url,
                    headers={'User-Agent': 'Juriscraper'},
                    timeout=5,
                    verify=certifi.where(),
                    data={
                        'txtyear': year,
                        'txtcasenumber': number,
                        'cmdnamesearh': 'Search',
                    },
                )

                # Throw an error if a bad status code is returned,
                # otherwise, break the loop so we don't try more ip
                # addresses than necessary.
                r.raise_for_status()
                break
            except HTTPError, e:
                logger.info('404 error connecting to: %s' % ip_address)
                if e.response.status_code == 404 and not last_ip:
                    continue
                else:
                    raise e
            except (ConnectionError, Timeout), e:
                logger.info('Timeout/Connection error connecting to: %s' % ip_address)
                if not last_ip:
                    continue
                else:
                    raise e
Example No. 54
    def _download(self, request_dict={}):
        """Uses Selenium because doing it with requests is a pain."""
        if self.method == 'LOCAL':
            return super(Site, self)._download(request_dict=request_dict)

        driver = webdriver.PhantomJS(
            executable_path=phantomjs_executable_path,
            service_log_path=os.path.devnull,  # Disable ghostdriver.log
        )
        driver.implicitly_wait(30)
        logger.info("Now downloading case page at: %s" % self.url)
        driver.get(self.url)

        # Set the start and end dates
        start_date_id = 'ctl00_Content_dpDateSearch_dateInput'
        start_date_input = driver.find_element_by_id(start_date_id)
        start_date_input.send_keys((self.case_date - timedelta(
            days=self.backwards_days)).strftime('%-m/%-d/%Y'))

        end_date_id = 'ctl00_Content_dpDateSearchTo_dateInput'
        end_date_input = driver.find_element_by_id(end_date_id)
        end_date_input.send_keys(self.case_date.strftime('%-m/%-d/%Y'))
        # driver.save_screenshot('%s.png' % self.case_date)

        # Check ordering by case date (this orders by case date, *ascending*)
        ordering_id = 'Content_rdoCaseName_1'
        driver.find_element_by_id(ordering_id).click()

        # Submit
        driver.find_element_by_id('Content_btnSearch').click()

        # Do not proceed until the results show up.
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located(
            (By.ID, 'Content_ddlResultsPerPage'))
        )
        # driver.save_screenshot('with-results.png')

        text = self._clean_text(driver.page_source)
        driver.quit()
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(fix_links_but_keep_anchors,
                                base_href=self.url)
        return html_tree
Example No. 55
    def abort_by_url_hash(self, url, hash):
        """Checks whether we should abort due to a hash of the site data being
        unchanged since the last time a URL was visited.

        Returns True if we should abort the crawl. Else, returns False. Creates
        the item in the database if it doesn't already exist, assigning it to
        self.url2Hash.
        """
        changed, self.url2Hash = self._court_changed(url, hash)
        if not self.full_crawl:
            if not changed:
                logger.info("Unchanged hash at: %s" % url)
                return True
            else:
                logger.info("Identified changed hash at: %s" % url)
                return False
        else:
            # If it's a full crawl, we don't care about the hash and never abort.
            return False
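
A sketch of how a caller might use abort_by_url_hash, assuming a command object that exposes the method plus a site object and cleaned page text; all of the names below are hypothetical:

import hashlib
import logging

logger = logging.getLogger(__name__)

def scrape_if_changed(command, site, text):
    # Hash the cleaned page content and skip the court when the hash is
    # unchanged since the last visit to this URL.
    content_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()
    if command.abort_by_url_hash(site.url, content_hash):
        logger.info("Hash unchanged at %s; skipping this court." % site.url)
        return False
    return True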