def scrape_regions(session: requests.Session) -> List[Region]:
    session = authenticated_session()
    regions: List[Region] = []
    supported_states = State.objects.values_list("code", flat=True)

    next_url = f"{API_ENDPOINT}/regions?limit=100"
    while next_url:
        with statsd.timed("turnout.official.usvfcall.regions", sample_rate=0.2):
            result = acquire_data(session, next_url)
        for usvf_region in result["objects"]:
            # Skip regions in unsupported states (e.g. US territories)
            if usvf_region.get("state_abbr") not in supported_states:
                continue
            regions.append(
                Region(
                    external_id=usvf_region["id"],
                    name=usvf_region.get("region_name"),
                    municipality=usvf_region.get("municipality_name"),
                    municipality_type=usvf_region.get("municipality_type"),
                    county=usvf_region.get("county_name"),
                    state_id=usvf_region.get("state_abbr"),
                )
            )
        next_url = result["meta"].get("next")

    statsd.gauge("turnout.official.scraper.regions", len(regions))
    logger.info("Found %(number)s Regions", {"number": len(regions)})
    Region.objects.bulk_create(regions, ignore_conflicts=True)
    return regions
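
# The helpers used above (`authenticated_session`, `acquire_data`) are defined
# elsewhere in the codebase. A minimal sketch of the shape they likely have,
# assuming the USVF API accepts a bearer token; the header name and the
# USVF_API_KEY setting below are assumptions for illustration, not the
# confirmed implementation:
def authenticated_session() -> requests.Session:
    session = requests.Session()
    session.headers["Authorization"] = f"Bearer {settings.USVF_API_KEY}"  # assumed setting name
    return session


def acquire_data(session: requests.Session, url: str) -> dict:
    # Fetch one page of results; the scrapers follow result["meta"]["next"]
    # until it is absent.
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return response.json()
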
def geocode(**kwargs):
    RETRIES = 2
    TIMEOUT = 6.0  # seems to be enough to handle slow apartment building queries

    args = {}
    for k in ["street", "city", "state", "q", "fields"]:
        if k in kwargs:
            if k == "street" and kwargs[k]:
                args[k] = strip_address_number_alpha_suffix(kwargs[k])
            else:
                args[k] = kwargs[k]
    if "zipcode" in kwargs:
        args["postal_code"] = kwargs["zipcode"]

    url = f"{API_ENDPOINT}?{urlencode({**args, 'api_key': settings.GEOCODIO_KEY})}"

    with statsd.timed("turnout.common.geocode.geocode", sample_rate=0.2):
        retries = Retry(
            total=RETRIES,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        http = requests.Session()
        http.mount("https://", HTTPAdapter(max_retries=retries))
        try:
            with tracer.trace("geocode", service="geocodioclient"):
                r = http.get(url, timeout=TIMEOUT)
        except Exception as e:
            extra = {
                "url": API_ENDPOINT,
                "api_args": str(args),
                "exception": str(e),
            }
            logger.warning(
                "Error querying geocodio args %(api_args)s, exception %(exception)s",
                extra,
                extra=extra,
            )
            sentry_sdk.capture_exception(
                GeocodioAPIError(f"Error querying {API_ENDPOINT}, exception {str(e)}")
            )
            return None

    if r.status_code != 200:
        extra = {
            "url": API_ENDPOINT,
            "api_args": str(args),
            "status_code": r.status_code,
        }
        logger.warning(
            "Error querying geocodio args %(api_args)s, status code %(status_code)s",
            extra,
            extra=extra,
        )
        if r.status_code != 422:  # we get 422s from bogus addresses
            sentry_sdk.capture_exception(
                GeocodioAPIError(
                    f"Error querying {API_ENDPOINT}, status code {r.status_code}"
                )
            )
        return None

    return r.json().get("results", None)
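
# Example usage (a sketch; the address is illustrative). Geocodio returns a
# list of candidate matches, each carrying a "location" dict with "lat" and
# "lng" keys — the shape that scrape_offices() relies on below.
results = geocode(
    street="1600 Pennsylvania Ave NW",
    city="Washington",
    state="DC",
    zipcode="20500",
)
if results:
    point = Point(results[0]["location"]["lng"], results[0]["location"]["lat"])
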
def scrape_regions(session: requests.Session) -> List[Region]:
    session = authenticated_session()
    regions: List[Region] = []
    supported_states = State.states.values_list("code", flat=True)

    # The USVF API is buggy and does not paginate reliably. Make multiple
    # passes with different page sizes to ensure we capture all records. In
    # practice, the first two passes (limits 100 and 73) are sufficient, but
    # the additional pass acts as an insurance policy.
    saw_id = set()
    for limit in [100, 73, 67]:
        next_url = f"{API_ENDPOINT}/regions?limit={limit}"
        while next_url:
            with statsd.timed("turnout.official.usvfcall.regions", sample_rate=0.2):
                result = acquire_data(session, next_url)
            for usvf_region in result["objects"]:
                if usvf_region.get("state_abbr") not in supported_states:
                    continue
                id_ = usvf_region.get("id")
                if id_ in saw_id:
                    continue
                saw_id.add(id_)
                regions.append(
                    Region(
                        external_id=usvf_region["id"],
                        name=usvf_region.get("region_name"),
                        municipality=usvf_region.get("municipality_name"),
                        municipality_type=usvf_region.get("municipality_type"),
                        county=usvf_region.get("county_name"),
                        state_id=usvf_region.get("state_abbr"),
                    )
                )
            next_url = result["meta"].get("next")
        logger.info(
            "Found %(number)s Regions after this pass", {"number": len(regions)}
        )

    statsd.gauge("turnout.official.scraper.regions", len(regions))
    logger.info("Found %(number)s Regions", {"number": len(regions)})
    Region.objects.bulk_create(regions, ignore_conflicts=True)
    # Remove regions that are no longer present upstream
    Region.objects.exclude(external_id__in=[r.external_id for r in regions]).delete()
    return regions
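
# Why multiple passes with different page sizes help (a toy model of the
# suspected failure mode, not USVF's documented behavior): if the API can drop
# a record at a page boundary, re-walking the collection with a different page
# size moves the boundaries, so a record missed on one pass is very likely
# captured on another. The saw_id set keeps the passes from double-counting.
def flaky_paginate(items, limit):
    # Toy paginator that loses the last record of every full page.
    for i in range(0, len(items), limit):
        page = items[i : i + limit]
        yield page[:-1] if len(page) == limit else page


records = list(range(1000))
single_pass = {r for page in flaky_paginate(records, 100) for r in page}
multi_pass = set()
for limit in [100, 73, 67]:
    for page in flaky_paginate(records, limit):
        multi_pass.update(page)
assert len(multi_pass) > len(single_pass)  # the extra passes recover dropped records
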
def scrape_offices(session: requests.Session, regions: Sequence[Region]) -> None:
    existing_region_ids = [region.external_id for region in regions]
    existing_offices = Office.objects.values_list("external_id", flat=True)
    offices_dict: Dict[int, Tuple[Action, Office]] = {}
    existing_addresses = Address.objects.values_list("external_id", flat=True)
    addresses_dict: Dict[int, Tuple[Action, Address]] = {}

    next_url = f"{API_ENDPOINT}/offices?limit=100"
    while next_url:
        with statsd.timed("turnout.official.usvfcall.offices", sample_rate=0.2):
            result = acquire_data(session, next_url)
        for office in result["objects"]:
            # Check that the region is valid (we don't support US territories)
            region_id = int(office["region"].rsplit("/", 1)[1])
            if region_id not in existing_region_ids:
                continue

            # Process each office in the response
            if office["id"] in existing_offices:
                office_action = Action.UPDATE
            else:
                office_action = Action.INSERT
            offices_dict[office["id"]] = (
                office_action,
                Office(
                    external_id=office["id"],
                    region_id=int(office["region"].split("/")[-1]),
                    hours=office.get("hours"),
                    notes=office.get("notes"),
                ),
            )

            # Process each address in the office
            for address in office.get("addresses", []):
                if address["id"] in existing_addresses:
                    address_action = Action.UPDATE
                else:
                    address_action = Action.INSERT
                addresses_dict[address["id"]] = (
                    address_action,
                    Address(
                        external_id=address["id"],
                        office_id=office["id"],
                        address=address.get("address_to"),
                        address2=address.get("street1"),
                        address3=address.get("street2"),
                        city=address.get("city"),
                        state_id=address.get("state"),
                        zipcode=address.get("zip"),
                        website=address.get("website"),
                        email=address.get("main_email"),
                        phone=address.get("main_phone_number"),
                        fax=address.get("main_fax_number"),
                        is_physical=address.get("is_physical"),
                        is_regular_mail=address.get("is_regular_mail"),
                        process_domestic_registrations="DOM_VR" in address["functions"],
                        process_absentee_requests="DOM_REQ" in address["functions"],
                        process_absentee_ballots="DOM_RET" in address["functions"],
                        process_overseas_requests="OVS_REQ" in address["functions"],
                        process_overseas_ballots="OVS_RET" in address["functions"],
                    ),
                )
        next_url = result["meta"].get("next")

    statsd.gauge("turnout.official.scraper.offices", len(offices_dict))
    logger.info("Found %(number)s Offices", {"number": len(offices_dict)})
    statsd.gauge("turnout.official.scraper.addresses", len(addresses_dict))
    logger.info("Found %(number)s Addresses", {"number": len(addresses_dict)})

    # Remove any records in our database but not in the result
    Office.objects.exclude(external_id__in=offices_dict.keys()).delete()
    Address.objects.exclude(external_id__in=addresses_dict.keys()).delete()

    # Create any records that are not already in our database
    Office.objects.bulk_create(
        [x[1] for x in offices_dict.values() if x[0] == Action.INSERT]
    )
    Address.objects.bulk_create(
        [x[1] for x in addresses_dict.values() if x[0] == Action.INSERT]
    )

    # Update any records that are already in our database
    Office.objects.bulk_update(
        [x[1] for x in offices_dict.values() if x[0] == Action.UPDATE],
        ["hours", "notes"],
    )
    Address.objects.bulk_update(
        [x[1] for x in addresses_dict.values() if x[0] == Action.UPDATE],
        [
            "address",
            "address2",
            "address3",
            "city",
            "state",
            "zipcode",
            "website",
            "email",
            "phone",
            "fax",
            "is_physical",
            "is_regular_mail",
            "process_domestic_registrations",
            "process_absentee_requests",
            "process_absentee_ballots",
            "process_overseas_requests",
            "process_overseas_ballots",
        ],
    )
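
# `Action` is referenced throughout but not defined in this excerpt; it is
# presumably a simple enum distinguishing records to insert from records to
# update. A minimal sketch of what it likely looks like:
import enum


class Action(enum.Enum):
    INSERT = "insert"
    UPDATE = "update"
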
def scrape_offices(session: requests.Session, regions: Sequence[Region]) -> None: existing_region_ids = [region.external_id for region in regions] existing_offices = Office.objects.values_list("external_id", flat=True) offices_dict: Dict[(int, Tuple[Action, Office])] = {} existing_addresses = {a.external_id: a for a in Address.objects.all()} addresses_dict: Dict[(int, Tuple[Action, Address])] = {} # The USVF API pagination is buggy; make multiple passes with # different page sizes. for limit in [100, 73, 67]: next_url = f"{API_ENDPOINT}/offices?limit={limit}" while next_url: with statsd.timed("turnout.official.usvfcall.offices", sample_rate=0.2): result = acquire_data(session, next_url) for office in result["objects"]: # Check that the region is valid (we don't support US territories) region_id = int(office["region"].rsplit("/", 1)[1]) if region_id not in existing_region_ids: continue office_id = office["id"] if office_id in offices_dict: continue # Process each office in the response if office_id in existing_offices: office_action = Action.UPDATE else: office_action = Action.INSERT offices_dict[office_id] = ( office_action, Office( external_id=office_id, region_id=int(office["region"].split("/")[-1]), hours=office.get("hours"), notes=office.get("notes"), ), ) for address in office.get("addresses", []): # Process each address in the office existing = existing_addresses.get(address["id"], None) if existing: address_action = Action.UPDATE location = existing.location else: address_action = Action.INSERT location = None if not location and settings.USVF_GEOCODE: addrs = geocode( street=address.get("street1"), city=address.get("city"), state=address.get("state"), zipcode=address.get("zip"), ) if addrs: location = Point( addrs[0]["location"]["lng"], addrs[0]["location"]["lat"] ) addresses_dict[address["id"]] = ( address_action, Address( external_id=address["id"], office_id=office["id"], address=address.get("address_to"), address2=address.get("street1"), address3=address.get("street2"), city=address.get("city"), state_id=address.get("state"), zipcode=address.get("zip"), website=address.get("website"), email=address.get("main_email"), phone=address.get("main_phone_number"), fax=address.get("main_fax_number"), location=location, is_physical=address.get("is_physical"), is_regular_mail=address.get("is_regular_mail"), process_domestic_registrations="DOM_VR" in address["functions"], process_absentee_requests="DOM_REQ" in address["functions"], process_absentee_ballots="DOM_RET" in address["functions"], process_overseas_requests="OVS_REQ" in address["functions"], process_overseas_ballots="OVS_RET" in address["functions"], ), ) next_url = result["meta"].get("next") logger.info( "Found %(number)s offices after this pass", {"number": len(offices_dict)} ) statsd.gauge("turnout.official.scraper.offices", len(offices_dict)) logger.info("Found %(number)s Offices", {"number": len(offices_dict)}) statsd.gauge("turnout.official.scraper.addresses", len(addresses_dict)) logger.info("Found %(number)s Addresses", {"number": len(addresses_dict)}) # Remove any records in our database but not in the result Office.objects.exclude(external_id__in=offices_dict.keys()).delete() Address.objects.exclude(external_id__in=addresses_dict.keys()).delete() # Create any records that are not already in our database Office.objects.bulk_create( [x[1] for x in offices_dict.values() if x[0] == Action.INSERT] ) Address.objects.bulk_create( [x[1] for x in addresses_dict.values() if x[0] == Action.INSERT] ) # Update any records that are 
    # Update any records that are already in our database
    def slow_bulk_update(cls, pk, records, keys):
        # This is slower than Django's bulk_update(), but it (1) respects
        # modified_at and (2) works on Aurora.
        for r in records:
            obj = cls.objects.get(**{pk: getattr(r, pk)})
            changed = False
            for k in keys:
                if getattr(obj, k) != getattr(r, k):
                    setattr(obj, k, getattr(r, k))
                    changed = True
            if changed:
                logger.info(f"Updated {obj}")
                obj.save()

    slow_bulk_update(
        Office,
        "external_id",
        [x[1] for x in offices_dict.values() if x[0] == Action.UPDATE],
        ["hours", "notes"],
    )
    slow_bulk_update(
        Address,
        "external_id",
        [x[1] for x in addresses_dict.values() if x[0] == Action.UPDATE],
        [
            "address",
            "address2",
            "address3",
            "city",
            "state",
            "zipcode",
            "website",
            "email",
            "phone",
            "fax",
            "is_physical",
            "is_regular_mail",
            "location",
            "process_domestic_registrations",
            "process_absentee_requests",
            "process_absentee_ballots",
            "process_overseas_requests",
            "process_overseas_ballots",
        ],
    )
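
# Design note: Django's bulk_update() issues batched UPDATEs without calling
# Model.save(), so auto_now timestamps are not refreshed and save signals do
# not fire; slow_bulk_update() above pays one query per changed row to get
# those behaviors back. A minimal sketch of the kind of field it protects
# (the real model definitions are not shown in this excerpt, so the fields
# below are assumptions for illustration):
from django.db import models


class OfficeSketch(models.Model):
    external_id = models.IntegerField(primary_key=True)
    hours = models.TextField(null=True)
    notes = models.TextField(null=True)
    modified_at = models.DateTimeField(auto_now=True)  # refreshed only via save()
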