def get_parsed_values(parsed_addr, orig_val, val_label, orig_addr_str):
    # type: (Mapping[str, str], str, str, str) -> Union[str, None]
    """Get valid values from parsed_addr corresponding to val_label.

    Retrieves the value stored in parsed_addr under val_label and reconciles
    it with orig_val, after both have been run through post_clean_addr_str:

    * If both values are empty, None is returned.
    * If only one of the two contains a non-empty value, that value is
      returned.
    * If both are non-empty and equal, the shared value is returned.
    * If both are non-empty and differ, AmbiguousAddressError is raised.

    This provides a check against misidentified address components when
    known values are available. (For example when a city is supplied from
    the address dict or record being normalized, but usaddress identifies
    extra information stored in address_line_1 as a PlaceName.)

    :param parsed_addr: address parsed into ordereddict per usaddress.
    :type parsed_addr: Mapping
    :param orig_val: related value passed in from incoming data source.
    :type orig_val: str
    :param val_label: label to locate in parsed_addr
    :type val_label: str
    :param orig_addr_str: address string to pass to error, if applicable.
    :type orig_addr_str: str
    :return: the reconciled, cleaned value, or None if neither source
        supplied one.
    :rtype: str | None
    :raises AmbiguousAddressError: if the supplied and parsed values are
        both non-empty and not equal.
    """
    candidates = (
        post_clean_addr_str(orig_val),
        post_clean_addr_str(parsed_addr.get(val_label)),
    )
    # Discard every empty value (None *and* ''), so a blank string from the
    # data source is not mistaken for a value that conflicts with the parse.
    distinct_vals = {val for val in candidates if val}
    if len(distinct_vals) > 1:
        raise AmbiguousAddressError(None, None, orig_addr_str)
    return distinct_vals.pop() if distinct_vals else None
def test_post_clean_addr_str(self):
    """Check post_clean_addr_str on a typical string and on empty input."""
    # parentheses are dropped and the text comes back upper-cased
    raw_addr = '(100-104) SW NO WHERE st'
    self.assertEqual('100-104 SW NO WHERE ST', post_clean_addr_str(raw_addr))

    # empty inputs are handed back unchanged
    self.assertIsNone(post_clean_addr_str(None))
    self.assertEqual('', post_clean_addr_str(''))
def validate_us_postal_code_format(postal_code, address):
    # type: (str, Union[str, Mapping]) -> str
    """Validate postal code conforms to US five-digit Zip or Zip+4 standards.

    The code is cleaned with post_clean_addr_str and must then consist of
    either five ASCII digits, or five digits, a hyphen, and four digits.

    :param postal_code: string containing US postal code data.
    :type postal_code: str
    :param address: dict or string containing original address; it is only
        passed along to the raised error.
    :type address: dict | str
    :return: cleaned postal code if no error is raised
    :rtype: str
    :raises AddressValidationError: if the code is empty or is not in
        five-digit Zip or Zip+4 format.
    """
    msg = (
        'US Postal Codes must conform to five-digit Zip or Zip+4 standards.')
    postal_code = post_clean_addr_str(postal_code)
    # An empty or None code yields no segments, so it fails the shape check
    # below rather than raising TypeError on a membership test.
    segments = postal_code.split('-') if postal_code else []
    has_valid_shape = [len(seg) for seg in segments] in ([5], [5, 4])
    # Length alone is not enough: 'ABCDE' has five characters but is not a
    # Zip code, so every character must be an ASCII digit.
    has_only_digits = all(
        char in '0123456789' for seg in segments for char in seg)
    if not (has_valid_shape and has_only_digits):
        raise AddressValidationError(msg, None, address)
    return postal_code
def get_normalized_line_segment(parsed_addr, line_labels):
    # type: (Mapping[str, str], Sequence[str]) -> str
    """Build one address line from the parsed components in line_labels.

    :param parsed_addr: address parsed into ordereddict per usaddress.
    :param line_labels: tuple of str labels of all the potential keys
        related to the desired address segment (ie address_line_1 or
        address_line_2).
    :return: the values from parsed_addr whose keys appear in line_labels,
        joined with single spaces in parsed_addr order and passed through
        post_clean_addr_str.
    """
    matched = []
    for label, value in parsed_addr.items():
        if label in line_labels:
            matched.append(value)

    if matched:
        segment = ' '.join(matched)
    else:
        # no matching component: the cleaner is still called, with None
        segment = None
    return post_clean_addr_str(segment)
def normalize_addr_str(
        addr_str,          # type: str
        line2=None,        # type: Optional[str]
        city=None,         # type: Optional[str]
        state=None,        # type: Optional[str]
        zipcode=None,      # type: Optional[str]
        addtl_funcs=None   # type: Sequence[Callable[str, (str, str)]] # noqa
):  # noqa
    # type: (...) -> Mapping[str, str]
    """Normalize a complete or partial address string.

    :param addr_str: str containing address data.
    :type addr_str: str
    :param line2: optional str containing occupancy or sub-address data
        (eg: Unit, Apt, Lot).
    :type line2: str
    :param city: optional str city name that does not need to be parsed
        from addr_str.
    :type city: str
    :param state: optional str state name that does not need to be parsed
        from addr_str.
    :type state: str
    :param zipcode: optional str postal code that does not need to be
        parsed from addr_str.
    :type zipcode: str
    :param addtl_funcs: optional sequence of funcs that take string for
        further processing and return line1 and line2 strings. They are
        only tried when parsing addr_str fails and no line2 was supplied;
        a func signals that it cannot handle the string by raising
        ValueError.
    :type addtl_funcs: Sequence[Callable[str, (str, str)]]
    :return: address dict with uppercase parsed and normalized address
        values.
    :rtype: Mapping[str, str]
    :raises UnParseableAddressError: if addr_str cannot be parsed and no
        additional processing function produces a parsable address.
    :raises AmbiguousAddressError: propagated from get_parsed_values when
        a supplied city, state or zipcode conflicts with the parsed value.
    """
    # get address parsed into usaddress components.
    error = None
    parsed_addr = None
    addr_str = pre_clean_addr_str(addr_str, normalize_state(state))
    try:
        parsed_addr = parse_address_string(addr_str)
    except (usaddress.RepeatedLabelError, AmbiguousAddressError) as err:
        error = err
        if not line2 and addtl_funcs:
            for func in addtl_funcs:
                try:
                    # Local names keep a failed attempt from leaking its
                    # line2 into the record reported with the error.
                    alt_line1, alt_line2 = func(addr_str)
                    # send refactored line_1 and line_2 back through
                    # processing. addtl_funcs is not forwarded, so the
                    # retry cannot recurse a second time. The error flag
                    # is left set: a successful retry returns right here,
                    # and a failed one must still be reported below.
                    return normalize_addr_str(
                        alt_line1, line2=alt_line2, city=city,
                        state=state, zipcode=zipcode)
                except ValueError:
                    # try a different additional processing function
                    pass

    if parsed_addr:
        parsed_addr = normalize_address_components(parsed_addr)
        zipcode = get_parsed_values(parsed_addr, zipcode, 'ZipCode', addr_str)
        city = get_parsed_values(parsed_addr, city, 'PlaceName', addr_str)
        state = get_parsed_values(parsed_addr, state, 'StateName', addr_str)
        state = normalize_state(state)

        # assumes if line2 is passed in that it need not be parsed from
        # addr_str. Primarily used to allow advanced processing of otherwise
        # unparsable addresses.
        line2 = line2 if line2 else get_normalized_line_segment(
            parsed_addr, LINE2_USADDRESS_LABELS)
        line2 = post_clean_addr_str(line2)
        # line 1 is fully post cleaned in get_normalized_line_segment.
        line1 = get_normalized_line_segment(
            parsed_addr, LINE1_USADDRESS_LABELS)
        validate_parens_groups_parsed(line1)
    else:
        # line1 is set to addr_str so complete dict can be passed to error.
        line1 = addr_str

    addr_rec = dict(address_line_1=line1, address_line_2=line2, city=city,
                    state=state, postal_code=zipcode)
    if error:
        raise UnParseableAddressError(None, None, addr_rec)
    return addr_rec