def _add_website_and_provider(site: schema.NormalizedLocation, entry: dict) -> None:
    """Attach the website contact from `entry` to `site` and set its provider.

    `entry["link"]` is handed to `_make_website_contact`. When that yields a
    contact, it is appended to `site.contact` (an unset or empty contact list
    is replaced by a new list first) and `site.parent_organization` is set to
    the result of `_lookup_provider` for that contact. When no contact is
    produced, `site` is left unchanged.
    """
    # A new contact object is built for every site, even though many sites
    # may point at the same website.
    website_contact = _make_website_contact(entry["link"])
    if website_contact is None:
        return

    site.contact = site.contact or []
    site.contact.append(website_contact)

    # Well-known providers can sometimes be recognised from the URL.
    site.parent_organization = _lookup_provider(website_contact)
def _add_id(site: schema.NormalizedLocation) -> None:
    """Generate source and site IDs for `site` and store them on it in place.

    The source ID (`site.source.id`) is produced by `_md5_hash` from the
    site's name and address fields (street1, city, state, zip), skipping any
    that are missing or empty. When none of those are present, the website or
    phone of each contact is used instead. The site ID (`site.id`) is then
    derived from `site.source` via `_make_site_id`.
    """
    # We don't have a stable site ID or name from the source document,
    # so generate one ID by hashing whatever name and address info we do have.
    # These are likely to be more stable than the phone or website info.
    # Avoid using the `page` and `provider` numbers from `entry`,
    # because those are sensitive to layout changes in the source document.
    address = site.address  # may be None; getattr() below tolerates that
    candidate_data: List[Optional[Text]] = [
        value
        for value in (
            site.name,
            getattr(address, "street1", None),
            getattr(address, "city", None),
            getattr(address, "state", None),
            getattr(address, "zip", None),
        )
        if value
    ]

    # Fall back to website or phone info if we don't have concrete name or
    # location info. `site.contact` may be unset (None), so treat that the
    # same as an empty contact list instead of raising a TypeError.
    if not candidate_data:
        candidate_data.extend(c.website or c.phone for c in site.contact or [])

    if not candidate_data:
        # Nothing identifying is available; the resulting ID will not be
        # unique to this site, so make that visible in the logs.
        logger.warning(
            "No name, address, or contact info available to generate a stable site ID"
        )

    site.source.id = _md5_hash(candidate_data)

    site_id = _make_site_id(site.source)
    logger.debug("Site ID: %s", site_id)
    site.id = site_id
def _process_location(
    normalized_location: location.NormalizedLocation,
) -> Optional[location.NormalizedLocation]:
    """Enrich a copy of `normalized_location`, or return None if it is unusable.

    The enrichment steps run in order on a copy, so the argument itself is
    not reassigned. The result is dropped (None is returned, with a warning
    logged) when `_valid_address` rejects it or when it has no `location`
    value.
    """
    # NOTE(review): another `_process_location` appears to be defined later in
    # this file; if both live at module level the later one shadows this one.
    candidate = normalized_location.copy()

    enrichment_steps = (
        _add_provider_from_name,
        _add_source_link,
        _add_provider_tag,
        _normalize_phone_format,
    )
    for enrich in enrichment_steps:
        enrich(candidate)

    if not _valid_address(candidate):
        logger.warning(
            "Skipping source location %s because its address could not be validated: %s",
            normalized_location.id,
            normalized_location.address,
        )
        return None

    if candidate.location:
        return candidate

    logger.warning(
        "Skipping source location %s because it does not have a location (lat/lng)",
        normalized_location.id,
    )
    return None
def _add_provider_from_name(loc: location.NormalizedLocation) -> None:
    """Derive a provider from `loc.name` and record it on `loc` where missing.

    Does nothing when the location has no name or when
    `normalize.provider_id_from_name` returns nothing for it. Otherwise:

    * a link for the provider authority is appended unless the link map
      already has an entry for that authority, and
    * `parent_organization` is set to the provider if it is not already set.
    """
    match = normalize.provider_id_from_name(loc.name) if loc.name else None
    if not match:
        return

    authority, identifier = match

    link_map = _generate_link_map(loc)
    if str(authority) not in link_map:
        # Assign a fresh list rather than mutating the existing one in place.
        updated_links = list(loc.links or [])
        updated_links.append(location.Link(authority=authority, id=identifier))
        loc.links = updated_links

    if not loc.parent_organization:
        loc.parent_organization = location.Organization(id=authority)
def _process_location(
    normalized_location: location.NormalizedLocation,
) -> Optional[location.NormalizedLocation]:
    """Enrich a copy of `normalized_location`, or return None if it is unusable.

    Provider and source links are added to a copy of the argument. The copy
    is returned only when `_valid_address` accepts it and it has a `location`
    value; otherwise None is returned.
    """
    # NOTE(review): an earlier `_process_location` with extra steps and
    # logging appears in this file; if both live at module level this later
    # definition is the one that takes effect. Confirm which is intended.
    candidate = normalized_location.copy()

    _add_provider_from_name(candidate)
    _add_source_link(candidate)

    # Address validation runs first; the location check is only reached when
    # the address is valid.
    if _valid_address(candidate) and candidate.location:
        return candidate
    return None
def _add_source_link(loc: location.NormalizedLocation) -> None:
    """Append a link built from `loc.source` unless one is already present.

    Nothing happens when the location has no source, when the source lacks
    either its `source` name or its `id`, or when the link map already has
    an entry for that source name.
    """
    origin = loc.source
    if not origin or not origin.source or not origin.id:
        return

    if str(origin.source) in _generate_link_map(loc):
        return

    # Assign a fresh list rather than mutating the existing one in place.
    updated_links = list(loc.links or [])
    updated_links.append(location.Link(authority=origin.source, id=origin.id))
    loc.links = updated_links
def _add_provider_tag(loc: location.NormalizedLocation) -> None:
    """Append a `PROVIDER_TAG` link carrying the parent organization's id.

    Nothing happens when the location has no parent organization, when that
    organization has no id, or when the link map already has a
    `PROVIDER_TAG` entry.
    """
    organization = loc.parent_organization
    if not organization or not organization.id:
        return

    if PROVIDER_TAG in _generate_link_map(loc):
        return

    tag_value = str(organization.id)

    # Assign a fresh list rather than mutating the existing one in place.
    updated_links = list(loc.links or [])
    updated_links.append(location.Link(authority=PROVIDER_TAG, id=tag_value))
    loc.links = updated_links
def calculate_content_hash(loc: location.NormalizedLocation) -> str:
    """Return the hex MD5 digest of a location's content, ignoring `source`.

    The location is converted to a dict without None-valued fields and
    without the `source` field, then serialized to JSON with sorted keys
    before hashing.
    """
    content = loc.dict(exclude_none=True, exclude={"source"})
    serialized = orjson.dumps(content, option=orjson.OPT_SORT_KEYS)
    digest = hashlib.md5(serialized)
    return digest.hexdigest()