def search_by_date(
    self,
    start_date=None,
    end_date=None,
    case_details=False,
    case_types=None,
    download_dir=None,
    headless=True,
):
    """
    Scrape case metadata and/or details by date ranges.

    Defaults to the current day if no start date is provided. If only a
    start date is provided, the search is limited to that single day.

    Results can be further limited by passing one or more case type codes.

    Args:
        start_date (str): start date in YYYY-MM-DD format (optional)
        end_date (str): end date in YYYY-MM-DD format (optional;
            defaults to start_date)
        case_details (boolean): Whether to scrape detailed case data.
            (optional; defaults to False)
        case_types (list<str>): One or more case type codes (optional)
        download_dir (str): Override Selenium download directory
            (defaults to the value of self.get_download_dir())
        headless (boolean): Run Selenium in headless mode for case detail
            searches (defaults to True)

    Returns:
        List of CaseInfo instances
    """
    if not start_date:
        start_date, end_date = self.current_day, self.current_day
    elif not end_date:
        # A lone start date previously left end_date as None, which was
        # then forwarded to the date-range helpers. Treat it as a
        # single-day search instead.
        end_date = start_date

    # Avoid sharing one default list object across calls
    if case_types is None:
        case_types = []

    county = self.place_id[3:]  # Clip the state prefix from place_id

    if case_details:
        return self.search(
            start_date=start_date,
            end_date=end_date,
            case_types=case_types,
            download_dir=download_dir or self.get_download_dir(),
            headless=headless,
        )

    # Case metadata can be gathered using just Requests
    date_format = "%m-%d-%Y"
    dates = dates_for_range(start_date, end_date, output_format=date_format)
    # The case type filter is identical for every day, so build it once
    case_type_filter = ",".join(case_types) if case_types else None

    results = []
    for date_str in dates:
        api = SearchApi(county)
        # Fresh dict per request so the API client never sees shared state
        extra_params = {}
        if case_type_filter is not None:
            extra_params["caseType"] = case_type_filter
        # Query a single filing date per request
        cases = api.search_by_filing_date(date_str, date_str, extra_params)
        results.extend(cases)
    return results
def search(self, start_date, end_date, extra_params=None, case_details=False):
    """
    Search for cases one filing date at a time across a date range.

    Args:
        start_date (str): first filing date, passed to dates_for_range
        end_date (str): last filing date, passed to dates_for_range
        extra_params (dict): additional query parameters merged over the
            filing-date parameters (optional)
        case_details (boolean): when True, return the output of
            self._scrape_case_details for each day; otherwise return the
            basic case records with a "filing_date" value added
            (defaults to False)

    Returns:
        List of case records accumulated across all days in the range
    """
    date_format = "%m/%d/%Y"
    dates = dates_for_range(start_date, end_date, output_format=date_format)
    search_results = []
    for date_str in dates:
        # Convert date_str to standard YYYY-MM-DD for upstream usage
        date_key = self._standardize_date(date_str, date_format, "%Y-%m-%d")
        # Always limit query to a single filing date, to minimize
        # chances of truncated results
        search_params = {
            "FiledDateL": date_str,  # start filing date - MM/DD/YYYY
            "FiledDateH": date_str,  # end filing date - MM/DD/YYYY
        }
        # Merge any additional query parameters
        if extra_params:
            search_params.update(extra_params)
        html, basic_case_data = self._run_search(search_params)
        # Skip if there were no results for date
        if not basic_case_data:
            continue
        # Warn if results were truncated
        if "results are limited to 500" in html:
            msg = (
                "WARNING: Results were truncated for your search."
                " Try using a more targeted query, e.g. with a case type, "
                " to avoid losing records."
            )
            logger.warning(msg)
        if case_details:
            results = self._scrape_case_details(date_key, basic_case_data)
            search_results.extend(results)
        else:
            # Add the filing date to each record if it's only a metadata
            # search since it's not listed on results page
            for case in basic_case_data:
                case.update({"filing_date": date_key})
                search_results.append(case)
    # Hand the accumulated records back to the caller; without this the
    # method built the list and then implicitly returned None.
    return search_results
def search_by_date(self, county, start_date, end_date, case_types=None):
    """
    Run a browser-driven search for each day in a date range.

    Args:
        county: county value forwarded to the date search, case detail
            and results page helpers
        start_date (str): first day, passed to dates_for_range
        end_date (str): last day, passed to dates_for_range
        case_types (list): case types forwarded to
            self._execute_date_search (optional)

    Returns:
        List of results accumulated across all days that had results
    """
    # Avoid sharing one default list object across calls
    if case_types is None:
        case_types = []
    date_format = "%m-%d-%Y"
    dates = dates_for_range(start_date, end_date, output_format=date_format)
    payload = []
    # Tracked separately from the solution value so a falsy solution
    # does not trigger another solve.
    captcha_solution = None
    captcha_solved = False
    for day in dates:
        self.go_to()  # advanced search page
        self._execute_date_search(county, day, day, case_types)
        if not self.search_has_results(self.driver.current_url):
            continue
        # Solve the captcha on the first search that has results,
        # save the solution for re-use, and apply the solution
        # on the first case of that search's results
        # (using it on subsequent case detail API calls causes errors).
        # Keying this off the first *day* instead would leave the solution
        # unbound whenever the first day returned no results.
        use_captcha_solution = not captcha_solved
        if use_captcha_solution:
            captcha_solution = self.solve_captcha()
            captcha_solved = True
        # Searches that yield a single result redirect automatically
        # to case detail page rather than search results listing page.
        # For these cases, immediately execute the case detail query
        if "caseDetail" in self.driver.current_url:
            case_info = self._get_case_details(
                county,
                self.driver.current_url,
                captcha_solution,
                use_captcha_solution,
            )
            results = [case_info]
        else:
            results_page = SearchResultsPage(
                self.driver, county, self.captcha_api_key, captcha_solution
            )
            results = results_page.results.get(
                use_captcha_solution=use_captcha_solution
            )
            # TODO: if results_page.results_found():
            #     results_page.display_max_results()
        payload.extend(results)
    return payload
def search(self, start_date, end_date, case_details=False):
    """
    Search for cases one day at a time across a date range.

    Args:
        start_date (str): first day, passed to dates_for_range
        end_date (str): last day, passed to dates_for_range
        case_details (boolean): when True, return the output of
            self._scrape_case_details for each day; otherwise return the
            basic case records with a 'filing_date' value added
            (defaults to False)

    Returns:
        List of case records accumulated across all days in the range
    """
    # Dates are sent to the day search with a two-digit year
    date_format = "%m-%d-%y"
    dates = dates_for_range(start_date, end_date, output_format=date_format)
    search_results = []
    for date_str in dates:
        # Convert date_str to standard YYYY-MM-DD for upstream usage
        date_key = self._standardize_date(date_str, date_format, "%Y-%m-%d")
        basic_case_data = self._run_search_for_day(date_str)
        # Skip if there were no results for date
        if not basic_case_data:
            continue
        if case_details:
            results = self._scrape_case_details(date_key, basic_case_data)
            search_results.extend(results)
        else:
            # Add the filing date to each record if it's only a metadata
            # search since it's not listed on results page
            for case in basic_case_data:
                case.update({'filing_date': date_key})
                search_results.append(case)
    # Hand the accumulated records back to the caller; without this the
    # method built the list and then implicitly returned None.
    return search_results
def test_dates_for_range(start, end, kwargs, expected):
    """Check that dates_for_range(start, end, **kwargs) equals expected."""
    assert dates_for_range(start, end, **kwargs) == expected