def test_countries(self):
    """Check language disambiguation against each country-level test case.

    Every entry of ``country_test_cases`` is a ``(string, country, expected)``
    triple. The country must have a non-empty language mapping, and
    disambiguating the string against that mapping's items must give the
    expected language.
    """
    for text, country_code, want in country_test_cases:
        country_langs = get_country_languages(country_code)
        # A country without any configured languages is itself a failure.
        self.assertTrue(bool(country_langs))

        got = disambiguate_language(text, country_langs.items())
        failure_msg = '{} != {} for {}, langs={}'.format(
            got, want, text, country_langs.items())
        self.assertEqual(got, want, failure_msg)
def test_countries(self):
    """Check language disambiguation against each country-level test case.

    Every entry of ``country_test_cases`` is a ``(string, country, expected)``
    triple. The country must have a non-empty language mapping, and
    disambiguating the string against that mapping's items must give the
    expected language.
    """
    for case in country_test_cases:
        text, country_code, want = case

        country_langs = get_country_languages(country_code)
        # A country without any configured languages is itself a failure.
        self.assertTrue(bool(country_langs))

        got = disambiguate_language(text, country_langs.items())
        failure_msg = '{} != {} for {}, langs={}'.format(
            got, want, text, country_langs.items())
        self.assertEqual(got, want, failure_msg)
def test_regional(self):
    """Check language disambiguation against each regional test case.

    Every entry of ``regional_test_cases`` is a
    ``(string, country, key, value, expected)`` tuple. Both the country-level
    and the regional language mappings must be non-empty; the regional
    mapping is then updated with the country-level one and the string is
    disambiguated against the combined items.
    """
    for text, country_code, tag_key, tag_value, want in regional_test_cases:
        country_langs = get_country_languages(country_code)
        self.assertTrue(bool(country_langs))

        combined = get_regional_languages(country_code, tag_key, tag_value)
        self.assertTrue(bool(combined))
        # Fold the country-level languages into the regional mapping.
        combined.update(country_langs)

        got = disambiguate_language(text, combined.items())
        failure_msg = '{} != {} for {}, langs={}'.format(
            got, want, text, combined.items())
        self.assertEqual(got, want, failure_msg)
def test_regional(self):
    """Check language disambiguation against each regional test case.

    Every entry of ``regional_test_cases`` is a
    ``(string, country, key, value, expected)`` tuple. Both the country-level
    and the regional language mappings must be non-empty; the regional
    mapping is then updated with the country-level one and the string is
    disambiguated against the combined items.
    """
    for case in regional_test_cases:
        text, country_code, tag_key, tag_value, want = case

        country_langs = get_country_languages(country_code)
        self.assertTrue(bool(country_langs))

        combined = get_regional_languages(country_code, tag_key, tag_value)
        self.assertTrue(bool(combined))
        # Fold the country-level languages into the regional mapping.
        combined.update(country_langs)

        got = disambiguate_language(text, combined.items())
        failure_msg = '{} != {} for {}, langs={}'.format(
            got, want, text, combined.items())
        self.assertEqual(got, want, failure_msg)
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
    """Yield formatted addresses built from the rows of one CSV file.

    All tuning knobs (probabilities, flags, field mappings, ...) are looked
    up with ``self.get_property(name, *configs)``. If no ``fields`` mapping
    is configured, or the file has no header row, nothing is yielded.

    Args:
        country_dir: Country identifier associated with the file. Used to
            select country validators and as the fallback country when the
            reverse-geocoded country is missing, or differs and
            ``override_country_dir`` is not set.
        path: Path of the CSV file. Its header row must contain ``LAT`` and
            ``LON`` columns.
        configs: Sequence of config objects forwarded to
            ``self.get_property`` and ``self.cldr_country_name``.
        tag_components: Forwarded to ``self.formatter.format_address``.

    Yields:
        ``(language, country, formatted)`` tuples. A row can produce up to
        three: the full address, an address-only variant (randomly, when a
        street is present) and a variant with the address dropped
        (randomly, when street and house number are present).

    Raises:
        ValueError: If the header row has no ``LAT`` or ``LON`` column.
    """
    abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
    separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
    abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
    separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
    abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))

    add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
    add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
    osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
    non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
    house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
    numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
    postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)

    address_only_probability = float(self.get_property('address_only_probability', *configs))
    place_only_probability = float(self.get_property('place_only_probability', *configs))
    place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))

    city_replacements = self.get_property('city_replacements', *configs)

    override_country_dir = self.get_property('override_country_dir', *configs)

    postcode_length = int(self.get_property('postcode_length', *configs) or 0)

    drop_address_probability = place_only_probability + place_and_postcode_probability

    ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])

    # One alternation regex per field; a value matching it is discarded.
    ignore_fields_containing = {
        field: re.compile(
            six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]),
            re.I | re.UNICODE)
        for field, value in six.iteritems(
            dict(self.get_property('ignore_fields_containing', *configs) or {}))
    }

    # Per field, (compiled pattern, alias config) pairs used to re-label a value.
    alias_fields_containing = {
        field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
        for field, value in six.iteritems(
            dict(self.get_property('alias_fields_containing', *configs) or {}))
    }

    config_language = self.get_property('language', *configs)

    add_components = self.get_property('add', *configs)

    fields = self.get_property('fields', *configs)
    if not fields:
        return

    field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
    mapped_values = {
        f['component']: f['value_map']
        for f in six.itervalues(fields)
        if hasattr(f.get('value_map'), 'get')
    }

    # The context manager closes the file when the generator is exhausted or
    # closed; a bare open() left the handle dangling.
    with open(path) as csv_file:
        reader = unicode_csv_reader(csv_file)
        # next(reader, None) works on Python 2 and 3 and avoids leaking a
        # StopIteration out of the generator body when the file is empty.
        headers = next(reader, None)
        if headers is None:
            return

        header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
        latitude_index = headers.index('LAT')
        longitude_index = headers.index('LON')

        # Clear cached polygons
        self.components.osm_admin_rtree.clear_cache()
        self.components.neighborhoods_rtree.clear_cache()

        for row in reader:
            try:
                latitude = float(row[latitude_index])
                longitude = float(row[longitude_index])
            except (ValueError, TypeError):
                # Rows without usable coordinates are skipped.
                continue

            language = config_language

            components = {}

            skip_record = False

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value and key in ignore_rows_missing_fields:
                    skip_record = True
                    break
                elif not value:
                    continue

                if key in mapped_values:
                    value = mapped_values[key].get(value, value)

                if key == AddressFormatter.ROAD and language == SPANISH:
                    value = self.components.spanish_street_name(value)

                if key == AddressFormatter.POSTCODE:
                    value = self.cleanup_number(value)

                    if postcode_strip_non_digit_chars:
                        value = six.u('').join((c for c in value if c.isdigit()))

                    if value and not is_numeric(value) and numeric_postcodes_only:
                        continue
                    else:
                        if postcode_length:
                            # Left-pad with zeros, then truncate to the configured length.
                            value = value.zfill(postcode_length)[:postcode_length]

                if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                    if add_osm_boundaries:
                        # Boundary names come from OSM instead of the file.
                        continue
                    value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
                    if value and ((len(value) < 2 and get_string_script(value)[0].lower() not in ideographic_scripts) or is_numeric(value)):
                        continue

                if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                    continue

                for exp, sub_val in self.field_regex_replacements.get(key, []):
                    value = exp.sub(sub_val, value)

                # Replacements registered under None apply to every field.
                for exp, sub_val in self.field_regex_replacements.get(None, []):
                    value = exp.sub(sub_val, value)

                value = value.strip(', -')

                # Country validator first, then language, then the generic one.
                validator = self.country_validators.get(country_dir, {}).get(
                    key,
                    self.language_validators.get(language, {}).get(
                        key, self.component_validators.get(key, None)))

                if validator is not None and not validator(value):
                    continue

                if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
                    continue

                for (pattern, alias) in alias_fields_containing.get(key, []):
                    if pattern.search(value):
                        if 'component' in alias:
                            key = alias['component']

                if value:
                    components[key] = value

            if skip_record:
                continue

            if components:
                country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
                if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
                    country = country_dir
                    candidate_languages = get_country_languages(country)
                    if not candidate_languages:
                        continue
                    candidate_languages = candidate_languages.items()

                components = self.fix_component_encodings(components)

                if language is None:
                    language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)

                    if language == UNKNOWN_LANGUAGE:
                        strip_unit_language = candidate_languages[0][0] if candidate_languages else None
                    else:
                        strip_unit_language = language

                    street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)

                    street = abbreviate(street_types_gazetteer, street, language,
                                        abbreviate_prob=abbreviate_street_prob,
                                        separate_prob=separate_street_prob)
                    components[AddressFormatter.ROAD] = street

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)

                    if language == CHINESE:
                        house_number = self.format_chinese_house_number(house_number)

                    if country_dir == Countries.COLOMBIA:
                        house_number = self.format_colombian_house_number(house_number)

                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                unit = components.get(AddressFormatter.UNIT, None)

                street_required = (country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and
                                   country not in Countries.FORMER_SOVIET_UNION_COUNTRIES)

                postcode = components.get(AddressFormatter.POSTCODE, None)

                if postcode:
                    components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
                    if not postcode:
                        continue
                    components = self.components.drop_address(components)

                # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                unit = components.get(AddressFormatter.UNIT, None)

                if unit is not None:
                    if is_numeric_strict(unit):
                        unit = Unit.phrase(unit, language, country=country)
                    elif non_numeric_units:
                        unit = abbreviate(unit_types_gazetteer, unit, language,
                                          abbreviate_prob=abbreviate_unit_prob,
                                          separate_prob=separate_unit_prob)
                    else:
                        unit = None

                    if unit is not None:
                        components[AddressFormatter.UNIT] = unit
                    else:
                        components.pop(AddressFormatter.UNIT)
                        unit = None

                # CLDR country name
                country_name = self.cldr_country_name(country, language, configs)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language,
                                               abbreviate_prob=abbreviate_toponym_prob)
                        component = self.components.name_hyphens(component)
                        components[component_key] = component

                # Any components specified to be added by the config (usually state)
                if add_components:
                    for k, v in six.iteritems(add_components):
                        if k not in components:
                            components[k] = v

                # Get named states occasionally, added component is usually a state code
                address_state = self.components.state_name(components, country, language)
                if address_state:
                    components[AddressFormatter.STATE] = address_state

                state = components.get(AddressFormatter.STATE)
                if state:
                    state = self.components.abbreviated_state(state, country, language)
                    if state:
                        components[AddressFormatter.STATE] = state

                # This is expensive, so only turn on for files that don't supply their own city names
                # or for which those names are flawed
                osm_components = []

                # Using population=0 instead of None means if there's no known population or
                # we don't need to add OSM components, we assume the population of the town is
                # very small and the place name shouldn't be used unqualified (i.e. needs information
                # like state name to disambiguate it)
                population = 0
                unambiguous_city = False
                if add_osm_boundaries or AddressFormatter.CITY not in components:
                    osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
                    self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
                    categorized = self.components.categorized_osm_components(country, osm_components)
                    for component, label in categorized:
                        if label == AddressFormatter.CITY:
                            unambiguous_city = self.components.unambiguous_wikipedia(component, language)
                            if 'population' in component:
                                population = component['population']
                            # Only the first city-labelled component is considered.
                            break

                if AddressFormatter.CITY not in components and city_replacements:
                    components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})

                # The neighborhood index is cheaper so can turn on for whole countries
                neighborhood_components = []
                if add_osm_neighborhoods:
                    neighborhood_components = self.components.neighborhood_components(latitude, longitude)
                    self.components.add_neighborhoods(components, neighborhood_components, country, language,
                                                      replace_city=osm_neighborhood_overrides_city)

                self.components.cleanup_boundary_names(components)
                self.components.country_specific_cleanup(components, country)

                self.components.replace_name_affixes(components, language, country=country)

                self.components.replace_names(components)

                self.components.prune_duplicate_names(components)

                self.components.remove_numeric_boundary_names(components)
                self.components.add_house_number_phrase(components, language, country=country)
                self.components.add_postcode_phrase(components, language, country=country)

                # Component dropout
                all_osm_components = osm_components + neighborhood_components
                components = place_config.dropout_components(components, all_osm_components, country=country,
                                                              population=population,
                                                              unambiguous_city=unambiguous_city)

                self.components.add_genitives(components, language)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < address_only_probability and street:
                    address_only_components = self.components.drop_places(components)
                    address_only_components = self.components.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw decides both "drop address" and "also drop postcode".
                rand_val = random.random()

                if street and house_number and rand_val < drop_address_probability:
                    components = self.components.drop_address(components)

                    if rand_val < place_and_postcode_probability:
                        components = self.components.drop_postcode(components)

                    if components and (len(components) > 1 or add_osm_boundaries):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
    """Yield formatted addresses built from the rows of one CSV file.

    All tuning knobs (probabilities, flags, field mappings, ...) are looked
    up with ``self.get_property(name, *configs)``. If no ``fields`` mapping
    is configured, or the file has no header row, nothing is yielded.

    Args:
        country_dir: Country identifier associated with the file. Used to
            select country validators and as the fallback country when the
            reverse-geocoded country is missing, or differs and
            ``override_country_dir`` is not set.
        path: Path of the CSV file. Its header row must contain ``LAT`` and
            ``LON`` columns.
        configs: Sequence of config objects forwarded to
            ``self.get_property`` and ``self.cldr_country_name``.
        tag_components: Forwarded to ``self.formatter.format_address``.

    Yields:
        ``(language, country, formatted)`` tuples. A row can produce up to
        three: the full address, an address-only variant (randomly, when a
        street is present) and a variant with the address dropped
        (randomly, when street and house number are present).

    Raises:
        ValueError: If the header row has no ``LAT`` or ``LON`` column.
    """
    abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
    separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
    abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
    separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
    abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))

    add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
    add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
    osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
    non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
    house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
    numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
    postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)

    address_only_probability = float(self.get_property('address_only_probability', *configs))
    place_only_probability = float(self.get_property('place_only_probability', *configs))
    place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))

    city_replacements = self.get_property('city_replacements', *configs)

    override_country_dir = self.get_property('override_country_dir', *configs)

    postcode_length = int(self.get_property('postcode_length', *configs) or 0)

    drop_address_probability = place_only_probability + place_and_postcode_probability

    ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])

    # One alternation regex per field; a value matching it is discarded.
    ignore_fields_containing = {
        field: re.compile(
            six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]),
            re.I | re.UNICODE)
        for field, value in six.iteritems(
            dict(self.get_property('ignore_fields_containing', *configs) or {}))
    }

    # Per field, (compiled pattern, alias config) pairs used to re-label a value.
    alias_fields_containing = {
        field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
        for field, value in six.iteritems(
            dict(self.get_property('alias_fields_containing', *configs) or {}))
    }

    config_language = self.get_property('language', *configs)

    add_components = self.get_property('add', *configs)

    fields = self.get_property('fields', *configs)
    if not fields:
        return

    field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
    mapped_values = {
        f['component']: f['value_map']
        for f in six.itervalues(fields)
        if hasattr(f.get('value_map'), 'get')
    }

    # The context manager closes the file when the generator is exhausted or
    # closed; a bare open() left the handle dangling.
    with open(path) as csv_file:
        reader = unicode_csv_reader(csv_file)
        # next(reader, None) works on Python 2 and 3 and avoids leaking a
        # StopIteration out of the generator body when the file is empty.
        headers = next(reader, None)
        if headers is None:
            return

        header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
        latitude_index = headers.index('LAT')
        longitude_index = headers.index('LON')

        # Clear cached polygons
        self.components.osm_admin_rtree.clear_cache()
        self.components.neighborhoods_rtree.clear_cache()

        for row in reader:
            try:
                latitude = float(row[latitude_index])
                longitude = float(row[longitude_index])
            except (ValueError, TypeError):
                # Rows without usable coordinates are skipped.
                continue

            language = config_language

            components = {}

            skip_record = False

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value and key in ignore_rows_missing_fields:
                    skip_record = True
                    break
                elif not value:
                    continue

                if key in mapped_values:
                    value = mapped_values[key].get(value, value)

                if key == AddressFormatter.ROAD and language == SPANISH:
                    value = self.components.spanish_street_name(value)

                if key == AddressFormatter.POSTCODE:
                    value = self.cleanup_number(value)

                    if postcode_strip_non_digit_chars:
                        value = six.u('').join((c for c in value if c.isdigit()))

                    if value and not is_numeric(value) and numeric_postcodes_only:
                        continue
                    else:
                        if postcode_length:
                            # Left-pad with zeros, then truncate to the configured length.
                            value = value.zfill(postcode_length)[:postcode_length]

                if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                    if add_osm_boundaries:
                        # Boundary names come from OSM instead of the file.
                        continue
                    value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
                    if value and ((len(value) < 2 and get_string_script(value)[0].lower() not in ideographic_scripts) or is_numeric(value)):
                        continue

                if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                    continue

                for exp, sub_val in self.field_regex_replacements.get(key, []):
                    value = exp.sub(sub_val, value)

                # Replacements registered under None apply to every field.
                for exp, sub_val in self.field_regex_replacements.get(None, []):
                    value = exp.sub(sub_val, value)

                value = value.strip(', -')

                # Country validator first, then language, then the generic one.
                validator = self.country_validators.get(country_dir, {}).get(
                    key,
                    self.language_validators.get(language, {}).get(
                        key, self.component_validators.get(key, None)))

                if validator is not None and not validator(value):
                    continue

                if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
                    continue

                for (pattern, alias) in alias_fields_containing.get(key, []):
                    if pattern.search(value):
                        if 'component' in alias:
                            key = alias['component']

                if value:
                    components[key] = value

            if skip_record:
                continue

            if components:
                country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
                if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
                    country = country_dir
                    candidate_languages = get_country_languages(country)
                    if not candidate_languages:
                        continue
                    candidate_languages = candidate_languages.items()

                components = self.fix_component_encodings(components)

                if language is None:
                    language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)

                    if language == UNKNOWN_LANGUAGE:
                        strip_unit_language = candidate_languages[0][0] if candidate_languages else None
                    else:
                        strip_unit_language = language

                    street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)

                    street = abbreviate(street_types_gazetteer, street, language,
                                        abbreviate_prob=abbreviate_street_prob,
                                        separate_prob=separate_street_prob)
                    components[AddressFormatter.ROAD] = street

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)

                    if language == CHINESE:
                        house_number = self.format_chinese_house_number(house_number)

                    if country_dir == Countries.COLOMBIA:
                        house_number = self.format_colombian_house_number(house_number)

                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                unit = components.get(AddressFormatter.UNIT, None)

                street_required = (country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and
                                   country not in Countries.FORMER_SOVIET_UNION_COUNTRIES)

                postcode = components.get(AddressFormatter.POSTCODE, None)

                if postcode:
                    components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
                    if not postcode:
                        continue
                    components = self.components.drop_address(components)

                # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                unit = components.get(AddressFormatter.UNIT, None)

                if unit is not None:
                    if is_numeric_strict(unit):
                        unit = Unit.phrase(unit, language, country=country)
                    elif non_numeric_units:
                        unit = abbreviate(unit_types_gazetteer, unit, language,
                                          abbreviate_prob=abbreviate_unit_prob,
                                          separate_prob=separate_unit_prob)
                    else:
                        unit = None

                    if unit is not None:
                        components[AddressFormatter.UNIT] = unit
                    else:
                        components.pop(AddressFormatter.UNIT)
                        unit = None

                # CLDR country name
                country_name = self.cldr_country_name(country, language, configs)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language,
                                               abbreviate_prob=abbreviate_toponym_prob)
                        component = self.components.name_hyphens(component)
                        components[component_key] = component

                # Any components specified to be added by the config (usually state)
                if add_components:
                    for k, v in six.iteritems(add_components):
                        if k not in components:
                            components[k] = v

                # Get named states occasionally, added component is usually a state code
                address_state = self.components.state_name(components, country, language)
                if address_state:
                    components[AddressFormatter.STATE] = address_state

                state = components.get(AddressFormatter.STATE)
                if state:
                    state = self.components.abbreviated_state(state, country, language)
                    if state:
                        components[AddressFormatter.STATE] = state

                # This is expensive, so only turn on for files that don't supply their own city names
                # or for which those names are flawed
                osm_components = []

                # Using population=0 instead of None means if there's no known population or
                # we don't need to add OSM components, we assume the population of the town is
                # very small and the place name shouldn't be used unqualified (i.e. needs information
                # like state name to disambiguate it)
                population = 0
                unambiguous_city = False
                if add_osm_boundaries or AddressFormatter.CITY not in components:
                    osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
                    self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
                    categorized = self.components.categorized_osm_components(country, osm_components)
                    for component, label in categorized:
                        if label == AddressFormatter.CITY:
                            unambiguous_city = self.components.unambiguous_wikipedia(component, language)
                            if 'population' in component:
                                population = component['population']
                            # Only the first city-labelled component is considered.
                            break

                if AddressFormatter.CITY not in components and city_replacements:
                    components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})

                # The neighborhood index is cheaper so can turn on for whole countries
                neighborhood_components = []
                if add_osm_neighborhoods:
                    neighborhood_components = self.components.neighborhood_components(latitude, longitude)
                    self.components.add_neighborhoods(components, neighborhood_components, country, language,
                                                      replace_city=osm_neighborhood_overrides_city)

                self.components.cleanup_boundary_names(components)
                self.components.country_specific_cleanup(components, country)

                self.components.replace_name_affixes(components, language, country=country)

                self.components.replace_names(components)

                self.components.prune_duplicate_names(components)

                self.components.remove_numeric_boundary_names(components)
                self.components.add_house_number_phrase(components, language, country=country)
                self.components.add_postcode_phrase(components, language, country=country)

                # Component dropout
                all_osm_components = osm_components + neighborhood_components
                components = place_config.dropout_components(components, all_osm_components, country=country,
                                                              population=population,
                                                              unambiguous_city=unambiguous_city)

                self.components.add_genitives(components, language)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < address_only_probability and street:
                    address_only_components = self.components.drop_places(components)
                    address_only_components = self.components.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw decides both "drop address" and "also drop postcode".
                rand_val = random.random()

                if street and house_number and rand_val < drop_address_probability:
                    components = self.components.drop_address(components)

                    if rand_val < place_and_postcode_probability:
                        components = self.components.drop_postcode(components)

                    if components and (len(components) > 1 or add_osm_boundaries):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)
def formatted_addresses(self, path, tag_components=True):
    """Yield formatted United Kingdom addresses from the rows of a CSV file.

    Header columns are mapped to address components through
    ``self.field_map``; columns not in the map are ignored. If the file has
    no header row, nothing is yielded.

    Args:
        path: Path of the CSV file to read.
        tag_components: Forwarded to ``self.formatter.format_address``.

    Yields:
        ``(language, country, formatted)`` tuples. A row can produce up to
        three: the full address, an address-only variant (randomly, when a
        street is present) and a variant with the address dropped
        (randomly, when street and house number are present).
    """
    country = Countries.UNITED_KINGDOM
    candidate_languages = get_country_languages(country).items()

    # The context manager closes the file when the generator is exhausted or
    # closed; a bare open() left the handle dangling.
    with open(path) as csv_file:
        reader = unicode_csv_reader(csv_file)
        # next(reader, None) works on Python 2 and 3 and avoids leaking a
        # StopIteration out of the generator body when the file is empty.
        headers = next(reader, None)
        if headers is None:
            return

        header_indices = {i: self.field_map[k] for i, k in enumerate(headers) if k in self.field_map}

        for row in reader:
            components = {}

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value:
                    continue

                if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                    continue

                value = value.strip(', -')

                validator = self.component_validators.get(key, None)

                if validator is not None and not validator(value):
                    continue

                if value:
                    components[key] = value

            if components:
                components = self.fix_component_encodings(components)

                language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street, language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        # Invalid street names are removed from the record.
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=True)

                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)

                AddressComponents.prune_duplicate_names(components)

                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components, language, country=country)

                # Component dropout
                components = place_config.dropout_components(components, country=country)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(components)
                    address_only_components = AddressComponents.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw decides both "drop address" and "also drop postcode".
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)
def formatted_addresses(self, path, tag_components=True):
    """Yield formatted United Kingdom addresses from the rows of a CSV file.

    Header columns are mapped to address components through
    ``self.field_map``; columns not in the map are ignored. If the file has
    no header row, nothing is yielded.

    Args:
        path: Path of the CSV file to read.
        tag_components: Forwarded to ``self.formatter.format_address``.

    Yields:
        ``(language, country, formatted)`` tuples. A row can produce up to
        three: the full address, an address-only variant (randomly, when a
        street is present) and a variant with the address dropped
        (randomly, when street and house number are present).
    """
    country = Countries.UNITED_KINGDOM
    candidate_languages = get_country_languages(country).items()

    # The context manager closes the file when the generator is exhausted or
    # closed; a bare open() left the handle dangling.
    with open(path) as csv_file:
        reader = unicode_csv_reader(csv_file)
        # next(reader, None) works on Python 2 and 3 and avoids leaking a
        # StopIteration out of the generator body when the file is empty.
        headers = next(reader, None)
        if headers is None:
            return

        header_indices = {i: self.field_map[k] for i, k in enumerate(headers) if k in self.field_map}

        for row in reader:
            components = {}

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value:
                    continue

                if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                    continue

                value = value.strip(', -')

                validator = self.component_validators.get(key, None)

                if validator is not None and not validator(value):
                    continue

                if value:
                    components[key] = value

            if components:
                components = self.fix_component_encodings(components)

                language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street, language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        # Invalid street names are removed from the record.
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=True)

                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)

                AddressComponents.prune_duplicate_names(components)

                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components, language, country=country)

                # Component dropout
                components = place_config.dropout_components(components, country=country)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(components)
                    address_only_components = AddressComponents.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw decides both "drop address" and "also drop postcode".
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)