示例#1
0
def scrape_regions(session: requests.Session) -> List[Region]:
    """Fetch every region from the USVF API and bulk-insert the ones in
    supported states.

    Returns the list of ``Region`` instances that were scraped (existing
    rows are left untouched via ``ignore_conflicts``).
    """
    # NOTE(review): the passed-in session is discarded and replaced with a
    # freshly authenticated one; the parameter is kept only for interface
    # compatibility — confirm callers do not rely on their own session.
    session = authenticated_session()
    regions: List[Region] = []
    # Materialize the queryset into a set once: testing membership against
    # a lazy QuerySet inside the loop would be O(n) (or issue a query) per
    # region checked.
    supported_states = set(State.objects.values_list("code", flat=True))

    next_url = f"{API_ENDPOINT}/regions?limit=100"
    while next_url:
        with statsd.timed("turnout.official.usvfcall.regions",
                          sample_rate=0.2):
            result = acquire_data(session, next_url)

        for usvf_region in result["objects"]:
            # Skip regions in states we do not support (e.g. territories)
            if usvf_region.get("state_abbr") not in supported_states:
                continue
            regions.append(
                Region(
                    external_id=usvf_region["id"],
                    name=usvf_region.get("region_name"),
                    municipality=usvf_region.get("municipality_name"),
                    municipality_type=usvf_region.get("municipality_type"),
                    county=usvf_region.get("county_name"),
                    state_id=usvf_region.get("state_abbr"),
                ))

        # Follow the API's pagination until there is no "next" page.
        next_url = result["meta"].get("next")

    statsd.gauge("turnout.official.scraper.regions", len(regions))
    logger.info("Found %(number)s Regions", {"number": len(regions)})
    Region.objects.bulk_create(regions, ignore_conflicts=True)

    return regions
示例#2
0
def geocode(**kwargs):
    """Geocode an address via the Geocodio API.

    Recognized kwargs: ``street``, ``city``, ``state``, ``q``, ``fields``
    and ``zipcode`` (sent to the API as ``postal_code``).

    Returns the API response's ``results`` list, or ``None`` on any error.
    Failures are logged; all except HTTP 422 (bogus addresses) are also
    reported to Sentry.
    """
    RETRIES = 2
    TIMEOUT = 6.0  # seems to be enough to handle slow apartment build queries

    args = {}
    for k in ["street", "city", "state", "q", "fields"]:
        if k in kwargs:
            if k == "street" and kwargs[k]:
                # Normalize the street number before sending it to the API.
                args[k] = strip_address_number_alpha_suffix(kwargs[k])
            else:
                args[k] = kwargs[k]
    if "zipcode" in kwargs:
        args["postal_code"] = kwargs["zipcode"]
    url = f"{API_ENDPOINT}?{urlencode({**args, 'api_key': settings.GEOCODIO_KEY})}"
    with statsd.timed("turnout.common.geocode.geocode", sample_rate=0.2):
        # Retry transient failures (rate limiting and 5xx) with backoff.
        retries = Retry(total=RETRIES,
                        backoff_factor=1,
                        status_forcelist=[429, 500, 502, 503, 504])
        # Use the session as a context manager so the underlying connection
        # pool is always released (the previous version leaked it).
        with requests.Session() as http:
            http.mount("https://", HTTPAdapter(max_retries=retries))
            try:
                with tracer.trace("geocode", service="geocodioclient"):
                    r = http.get(url, timeout=TIMEOUT)
            except Exception as e:
                # Log args only (never the full URL, which embeds the API key).
                extra = {
                    "url": API_ENDPOINT,
                    "api_args": str(args),
                    "exception": str(e)
                }
                logger.warning(
                    "Error querying geocodio args %(api_args)s, exception %(exception)s",
                    extra,
                    extra=extra,
                )
                sentry_sdk.capture_exception(
                    GeocodioAPIError(
                        f"Error querying {API_ENDPOINT}, exception {str(e)}"))
                return None
    if r.status_code != 200:
        extra = {
            "url": API_ENDPOINT,
            "api_args": str(args),
            "status_code": r.status_code,
        }
        logger.warning(
            "Error querying geocodio args %(api_args)s, status code %(status_code)s",
            extra,
            extra=extra,
        )
        if r.status_code != 422:  # we get this from bogus addresses
            sentry_sdk.capture_exception(
                GeocodioAPIError(
                    f"Error querying {API_ENDPOINT}, status code {r.status_code}"
                ))
        return None
    return r.json().get("results", None)
示例#3
0
def scrape_regions(session: requests.Session) -> List[Region]:
    """Fetch every region from the USVF API (multiple pagination passes),
    bulk-insert the ones in supported states, and delete stale rows.

    Returns the list of ``Region`` instances scraped this run.
    """
    # NOTE(review): the passed-in session is discarded and replaced with a
    # freshly authenticated one; the parameter is kept only for interface
    # compatibility — confirm callers do not rely on their own session.
    session = authenticated_session()
    regions: List[Region] = []
    # Materialize the queryset into a set once: testing membership against
    # a lazy QuerySet inside the loop would be O(n) (or issue a query) per
    # region checked.
    supported_states = set(State.states.values_list("code", flat=True))

    # The USVF API is buggy and does not paginate reliably.  Make
    # multiple passes with different page sizes to ensure we capture
    # all records.  In practice, the [100,73] is sufficient but
    # additional passes act as an insurance policy.
    saw_id = set()
    for limit in [100, 73, 67]:
        next_url = f"{API_ENDPOINT}/regions?limit={limit}"
        while next_url:
            with statsd.timed("turnout.official.usvfcall.regions", sample_rate=0.2):
                result = acquire_data(session, next_url)

            for usvf_region in result["objects"]:
                # Skip regions in states we do not support (e.g. territories)
                if usvf_region.get("state_abbr") not in supported_states:
                    continue

                # Deduplicate across passes, since each pass re-reads the
                # same collection at a different page size.
                id_ = usvf_region.get("id")
                if id_ in saw_id:
                    continue
                saw_id.add(id_)
                regions.append(
                    Region(
                        external_id=usvf_region["id"],
                        name=usvf_region.get("region_name"),
                        municipality=usvf_region.get("municipality_name"),
                        municipality_type=usvf_region.get("municipality_type"),
                        county=usvf_region.get("county_name"),
                        state_id=usvf_region.get("state_abbr"),
                    )
                )

            next_url = result["meta"].get("next")
        logger.info(
            "Found %(number)s Regions after this pass", {"number": len(regions)}
        )

    statsd.gauge("turnout.official.scraper.regions", len(regions))
    logger.info("Found %(number)s Regions", {"number": len(regions)})
    # Insert new rows, then drop any rows no longer present upstream.
    Region.objects.bulk_create(regions, ignore_conflicts=True)
    Region.objects.exclude(external_id__in=[r.external_id for r in regions]).delete()

    return regions
示例#4
0
def scrape_offices(session: requests.Session,
                   regions: Sequence[Region]) -> None:
    """Fetch every office (and its addresses) from the USVF API and sync
    them into the database: stale rows are deleted, new rows inserted,
    existing rows bulk-updated.
    """
    # Use sets for all membership tests below: a list would be O(n) per
    # check, and a lazy QuerySet could issue a query per check.
    existing_region_ids = {region.external_id for region in regions}

    existing_offices = set(Office.objects.values_list("external_id", flat=True))
    offices_dict: Dict[int, Tuple[Action, Office]] = {}

    existing_addresses = set(Address.objects.values_list("external_id", flat=True))
    addresses_dict: Dict[int, Tuple[Action, Address]] = {}

    next_url = f"{API_ENDPOINT}/offices?limit=100"
    while next_url:
        with statsd.timed("turnout.official.usvfcall.offices",
                          sample_rate=0.2):
            result = acquire_data(session, next_url)

        for office in result["objects"]:
            # Check that the region is valid (we don't support US territories)
            region_id = int(office["region"].rsplit("/", 1)[1])
            if region_id not in existing_region_ids:
                continue

            # Process each office in the response
            if office["id"] in existing_offices:
                office_action = Action.UPDATE
            else:
                office_action = Action.INSERT
            offices_dict[office["id"]] = (
                office_action,
                Office(
                    external_id=office["id"],
                    region_id=region_id,  # parsed from the region URL above
                    hours=office.get("hours"),
                    notes=office.get("notes"),
                ),
            )

            for address in office.get("addresses", []):
                # Process each address in the office
                if address["id"] in existing_addresses:
                    address_action = Action.UPDATE
                else:
                    address_action = Action.INSERT
                addresses_dict[address["id"]] = (
                    address_action,
                    Address(
                        external_id=address["id"],
                        office_id=office["id"],
                        address=address.get("address_to"),
                        address2=address.get("street1"),
                        address3=address.get("street2"),
                        city=address.get("city"),
                        state_id=address.get("state"),
                        zipcode=address.get("zip"),
                        website=address.get("website"),
                        email=address.get("main_email"),
                        phone=address.get("main_phone_number"),
                        fax=address.get("main_fax_number"),
                        is_physical=address.get("is_physical"),
                        is_regular_mail=address.get("is_regular_mail"),
                        # Flags derived from the address's declared functions
                        process_domestic_registrations="DOM_VR" in address["functions"],
                        process_absentee_requests="DOM_REQ" in address["functions"],
                        process_absentee_ballots="DOM_RET" in address["functions"],
                        process_overseas_requests="OVS_REQ" in address["functions"],
                        process_overseas_ballots="OVS_RET" in address["functions"],
                    ),
                )

        next_url = result["meta"].get("next")

    statsd.gauge("turnout.official.scraper.offices", len(offices_dict))
    logger.info("Found %(number)s Offices", {"number": len(offices_dict)})
    statsd.gauge("turnout.official.scraper.addresses", len(addresses_dict))
    logger.info("Found %(number)s Addresses", {"number": len(addresses_dict)})

    # Remove any records in our database but not in the result
    Office.objects.exclude(external_id__in=offices_dict.keys()).delete()
    Address.objects.exclude(external_id__in=addresses_dict.keys()).delete()

    # Create any records that are not already in our database
    Office.objects.bulk_create(
        [x[1] for x in offices_dict.values() if x[0] == Action.INSERT])
    Address.objects.bulk_create(
        [x[1] for x in addresses_dict.values() if x[0] == Action.INSERT])

    # Update any records that are already in our database
    Office.objects.bulk_update(
        [x[1] for x in offices_dict.values() if x[0] == Action.UPDATE],
        ["hours", "notes"],
    )
    Address.objects.bulk_update(
        [x[1] for x in addresses_dict.values() if x[0] == Action.UPDATE],
        [
            "address",
            "address2",
            "address3",
            "city",
            "state",
            "zipcode",
            "website",
            "email",
            "phone",
            "fax",
            "is_physical",
            "is_regular_mail",
            "process_domestic_registrations",
            "process_absentee_requests",
            "process_absentee_ballots",
            "process_overseas_requests",
            "process_overseas_ballots",
        ],
    )
示例#5
0
def scrape_offices(session: requests.Session, regions: Sequence[Region]) -> None:
    """Fetch every office (and its addresses) from the USVF API, optionally
    geocode new addresses, and sync everything into the database: stale rows
    are deleted, new rows inserted, existing rows updated one-by-one.
    """
    # Use sets for all membership tests below: a list would be O(n) per
    # check, and a lazy QuerySet could issue a query per check.
    existing_region_ids = {region.external_id for region in regions}

    existing_offices = set(Office.objects.values_list("external_id", flat=True))
    offices_dict: Dict[int, Tuple[Action, Office]] = {}

    # Keep full Address rows so an existing geocoded location can be reused
    # instead of re-querying the geocoder.
    existing_addresses = {a.external_id: a for a in Address.objects.all()}
    addresses_dict: Dict[int, Tuple[Action, Address]] = {}

    # The USVF API pagination is buggy; make multiple passes with
    # different page sizes.
    for limit in [100, 73, 67]:
        next_url = f"{API_ENDPOINT}/offices?limit={limit}"
        while next_url:
            with statsd.timed("turnout.official.usvfcall.offices", sample_rate=0.2):
                result = acquire_data(session, next_url)

            for office in result["objects"]:
                # Check that the region is valid (we don't support US territories)
                region_id = int(office["region"].rsplit("/", 1)[1])
                if region_id not in existing_region_ids:
                    continue

                # Deduplicate across passes (each pass re-reads the
                # collection at a different page size).
                office_id = office["id"]
                if office_id in offices_dict:
                    continue

                # Process each office in the response
                if office_id in existing_offices:
                    office_action = Action.UPDATE
                else:
                    office_action = Action.INSERT
                offices_dict[office_id] = (
                    office_action,
                    Office(
                        external_id=office_id,
                        region_id=region_id,  # parsed from the region URL above
                        hours=office.get("hours"),
                        notes=office.get("notes"),
                    ),
                )

                for address in office.get("addresses", []):
                    # Process each address in the office
                    existing = existing_addresses.get(address["id"], None)
                    if existing:
                        address_action = Action.UPDATE
                        location = existing.location
                    else:
                        address_action = Action.INSERT
                        location = None
                    # Only geocode addresses we have no location for yet,
                    # and only when geocoding is enabled.
                    if not location and settings.USVF_GEOCODE:
                        addrs = geocode(
                            street=address.get("street1"),
                            city=address.get("city"),
                            state=address.get("state"),
                            zipcode=address.get("zip"),
                        )
                        if addrs:
                            # Point() takes (x, y) == (longitude, latitude)
                            location = Point(
                                addrs[0]["location"]["lng"], addrs[0]["location"]["lat"]
                            )
                    addresses_dict[address["id"]] = (
                        address_action,
                        Address(
                            external_id=address["id"],
                            office_id=office["id"],
                            address=address.get("address_to"),
                            address2=address.get("street1"),
                            address3=address.get("street2"),
                            city=address.get("city"),
                            state_id=address.get("state"),
                            zipcode=address.get("zip"),
                            website=address.get("website"),
                            email=address.get("main_email"),
                            phone=address.get("main_phone_number"),
                            fax=address.get("main_fax_number"),
                            location=location,
                            is_physical=address.get("is_physical"),
                            is_regular_mail=address.get("is_regular_mail"),
                            # Flags derived from the address's declared functions
                            process_domestic_registrations="DOM_VR"
                            in address["functions"],
                            process_absentee_requests="DOM_REQ" in address["functions"],
                            process_absentee_ballots="DOM_RET" in address["functions"],
                            process_overseas_requests="OVS_REQ" in address["functions"],
                            process_overseas_ballots="OVS_RET" in address["functions"],
                        ),
                    )

            next_url = result["meta"].get("next")
        logger.info(
            "Found %(number)s offices after this pass", {"number": len(offices_dict)}
        )

    statsd.gauge("turnout.official.scraper.offices", len(offices_dict))
    logger.info("Found %(number)s Offices", {"number": len(offices_dict)})
    statsd.gauge("turnout.official.scraper.addresses", len(addresses_dict))
    logger.info("Found %(number)s Addresses", {"number": len(addresses_dict)})

    # Remove any records in our database but not in the result
    Office.objects.exclude(external_id__in=offices_dict.keys()).delete()
    Address.objects.exclude(external_id__in=addresses_dict.keys()).delete()

    # Create any records that are not already in our database
    Office.objects.bulk_create(
        [x[1] for x in offices_dict.values() if x[0] == Action.INSERT]
    )
    Address.objects.bulk_create(
        [x[1] for x in addresses_dict.values() if x[0] == Action.INSERT]
    )

    # Update any records that are already in our database
    def slow_bulk_update(cls, pk, records, keys):
        # this is slower than django's bulk_update(), but it (1)
        # respects modified_at and (2) works on aurora
        for r in records:
            # Look the row up by the named key field: `r` is a freshly
            # constructed (unsaved) instance, so `r.pk` is only set if the
            # key field happens to be the primary key.  This honors the
            # `pk` argument, which the previous version accepted but ignored.
            obj = cls.objects.get(**{pk: getattr(r, pk)})
            changed = False
            for k in keys:
                if getattr(obj, k) != getattr(r, k):
                    setattr(obj, k, getattr(r, k))
                    changed = True
            if changed:
                logger.info(f"Updated {obj}")
                obj.save()

    slow_bulk_update(
        Office,
        "external_id",
        [x[1] for x in offices_dict.values() if x[0] == Action.UPDATE],
        ["hours", "notes"],
    )
    slow_bulk_update(
        Address,
        "external_id",
        [x[1] for x in addresses_dict.values() if x[0] == Action.UPDATE],
        [
            "address",
            "address2",
            "address3",
            "city",
            "state",
            "zipcode",
            "website",
            "email",
            "phone",
            "fax",
            "is_physical",
            "is_regular_mail",
            "location",
            "process_domestic_registrations",
            "process_absentee_requests",
            "process_absentee_ballots",
            "process_overseas_requests",
            "process_overseas_ballots",
        ],
    )