def insertion_distribution(self, insertions):
    """Turn an insertions config mapping into a sampling distribution.

    The ``'conditional'`` key and entries whose value is falsy are skipped.
    Every other entry must contain one of ``before``, ``after``, ``last`` or
    ``first`` (checked in that order) plus a ``probability``.

    Returns a ``(values, cdf)`` pair where ``values`` holds
    ``(order, anchor)`` tuples and ``cdf`` is ``cdf(probs)``.

    Raises:
        ValueError: if an entry contains none of the four position keys.
    """
    def placement(spec):
        # The first matching key wins, in this fixed priority order.
        if 'before' in spec:
            return (self.BEFORE, spec['before'])
        if 'after' in spec:
            return (self.AFTER, spec['after'])
        if 'last' in spec:
            return (self.LAST, None)
        if 'first' in spec:
            return (self.FIRST, None)
        raise ValueError('Insertions must contain one of {{first, before, after, last}}. Value was: {}'.format(spec))

    values = []
    probs = []
    for name, spec in six.iteritems(insertions):
        if name != 'conditional' and spec:
            position = placement(spec)
            weight = spec['probability']
            values.append(position)
            probs.append(weight)

    # If the probabilities don't sum to 1, add a "do nothing" action
    # NOTE(review): this entry is a 3-tuple while all others are 2-tuples;
    # confirm callers unpack it correctly.
    total = sum(probs)
    if not isclose(total, 1.0):
        probs.append(1.0 - total)
        values.append((None, None, False))

    return values, cdf(probs)
def insertion_distribution(self, insertions):
    """Build ``(values, cdf)`` describing where a component may be inserted.

    Entries under the ``'conditional'`` key and entries with a falsy value
    are ignored. Each remaining entry contributes one ``(order, anchor)``
    tuple and its ``probability``.

    Raises:
        ValueError: if an entry has none of first/before/after/last.
    """
    def to_position(entry):
        # Priority order: before, after, last, first.
        if 'before' in entry:
            return (self.BEFORE, entry['before'])
        if 'after' in entry:
            return (self.AFTER, entry['after'])
        if 'last' in entry:
            return (self.LAST, None)
        if 'first' in entry:
            return (self.FIRST, None)
        raise ValueError(
            'Insertions must contain one of {{first, before, after, last}}. Value was: {}'
            .format(entry))

    values = []
    probs = []
    for label, entry in six.iteritems(insertions):
        if label == 'conditional' or not entry:
            continue
        position = to_position(entry)
        weight = entry['probability']
        values.append(position)
        probs.append(weight)

    # If the probabilities don't sum to 1, add a "do nothing" action
    # NOTE(review): the filler entry has three elements, unlike the
    # two-element positions above; verify against the caller.
    mass = sum(probs)
    if not isclose(mass, 1.0):
        probs.append(1.0 - mass)
        values.append((None, None, False))

    return values, cdf(probs)
def choose_alphanumeric_type(cls, key, language, country=None):
    """Randomly pick one of the configured alphanumeric number types.

    Looks up the property ``key`` in ``address_config`` and samples among the
    number types that have a ``<type>_probability`` entry there.

    Returns ``(num_type, num_type_props)``, or ``(None, None)`` when the
    property is missing or no type has a probability configured.
    """
    alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
    if alphanumeric_props is None:
        return None, None

    candidates = (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC,
                  cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER,
                  cls.ROMAN_NUMERAL)

    values = []
    probs = []
    for candidate in candidates:
        weight = alphanumeric_props.get('{}_probability'.format(candidate))
        if weight is None:
            continue
        values.append(candidate)
        probs.append(weight)

    if not values:
        return None, None

    chosen = weighted_choice(values, cdf(probs))
    return chosen, alphanumeric_props.get(chosen, {})
def cldr_country_name(self, country_code, language, configs):
    """Randomly choose a CLDR-derived representation of a country.

    With probability ``cldr_country_probability`` one of four forms is
    sampled: localized name, ISO 3166 name, alpha-2 code or alpha-3 code.
    Otherwise ``None`` is returned.
    """
    cldr_country_prob = float(
        self.get_property('cldr_country_probability', *configs))
    if not random.random() < cldr_country_prob:
        return None

    kind_localized, kind_iso_3166, kind_alpha2, kind_alpha3 = kinds = range(4)
    weights = []
    for prop_name in ('localized_name_probability',
                      'iso_3166_name_probability',
                      'iso_alpha_2_code_probability',
                      'iso_alpha_3_code_probability'):
        weights.append(float(self.get_property(prop_name, *configs)))

    kind = weighted_choice(kinds, cdf(weights))

    # The upper-cased code is both the alpha-2 result and the fallback.
    upper_code = country_code.upper()
    if kind == kind_localized:
        return (country_names.localized_name(country_code, language)
                or country_names.localized_name(country_code)
                or upper_code)
    if kind == kind_iso_3166:
        return country_names.iso3166_name(country_code)
    if kind == kind_alpha3:
        return country_names.alpha3_code(country_code) or upper_code
    return upper_code
def pick_phrase_and_type(cls, number, language, country=None):
    """Sample a phrase for ``cls.key`` and a numeric type to render it with.

    Returns ``(num_type, phrase, props)``. When no phrases are configured the
    result is ``(None, decoded_number_or_None, None)``.
    """
    values, probs = address_config.alternative_probabilities(
        cls.key, language, dictionaries=cls.dictionaries, country=country)
    if not values:
        decoded = None if number is None else safe_decode(number)
        return None, decoded, None

    phrase, phrase_props = weighted_choice(values, probs)

    options = []
    weights = []
    for candidate in (cls.NUMERIC, cls.NUMERIC_AFFIX):
        weight = phrase_props.get('{}_probability'.format(candidate), None)
        if weight is None:
            continue
        options.append(candidate)
        weights.append(weight)

    # Without any configured probabilities, fall back to the plain numeric type.
    if weights:
        num_type = weighted_choice(options, cdf(weights))
    else:
        num_type = cls.NUMERIC

    return num_type, phrase, phrase_props[num_type]
def add_country_code(cls, postal_code, country):
    """Randomly prefix a postal code with a country-code phrase.

    The stripped postal code is returned unchanged unless the country's
    ``add_country_code`` property is truthy, the random draw falls below the
    configured probability, and the code is non-empty and starts with a digit.
    """
    postal_code = postal_code.strip()
    enabled = postal_codes_config.get_property('add_country_code', country=country)
    if not enabled:
        return postal_code

    # NOTE(review): the key is spelled 'probablity'; confirm it matches the
    # config file before changing it.
    cc_probability = postal_codes_config.get_property('country_code_probablity', country=country, default=0.0)
    if random.random() >= cc_probability or not postal_code or not postal_code[0].isdigit():
        return postal_code

    country_code_phrases = postal_codes_config.get_property('country_code_phrase', country=country, default=None)
    if country_code_phrases is not None:
        alternates, probs = alternative_probabilities(country_code_phrases)
        country_code_phrase = weighted_choice(alternates, cdf(probs))
    else:
        country_code_phrase = country.upper()

    cc_hyphen_probability = postal_codes_config.get_property('country_code_hyphen_probability', country=country, default=0.0)
    separator = u'-' if random.random() < cc_hyphen_probability else u''

    return u'{}{}{}'.format(country_code_phrase, separator, postal_code)
def cldr_country_name(self, country_code, language, configs):
    """Pick a CLDR-based rendering of ``country_code``, or ``None``.

    A random draw against ``cldr_country_probability`` decides whether any
    name is produced. If so, the form (localized name, ISO 3166 name,
    alpha-2 code, alpha-3 code) is sampled from the four configured
    probabilities.
    """
    cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
    if not random.random() < cldr_country_prob:
        return None

    form_localized, form_iso_3166, form_alpha2, form_alpha3 = forms = range(4)
    prob_keys = (
        'localized_name_probability',
        'iso_3166_name_probability',
        'iso_alpha_2_code_probability',
        'iso_alpha_3_code_probability',
    )
    weights = []
    for prob_key in prob_keys:
        weights.append(float(self.get_property(prob_key, *configs)))

    form = weighted_choice(forms, cdf(weights))

    # Alpha-2 is simply the upper-cased input, which also serves as fallback.
    default_name = country_code.upper()
    if form == form_localized:
        localized = country_names.localized_name(country_code, language)
        if not localized:
            localized = country_names.localized_name(country_code)
        return localized or default_name
    if form == form_iso_3166:
        return country_names.iso3166_name(country_code)
    if form == form_alpha3:
        return country_names.alpha3_code(country_code) or default_name
    return default_name
class Entrance(NumberedComponent):
    """Random entrance identifiers (numbers, letters or mixes) and phrases."""

    max_entrances = 10

    # Entrance numbers 1..max_entrances, weighted by a Zipfian distribution.
    entrance_range = range(1, max_entrances + 1)
    entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
    entrance_range_cdf = cdf(entrance_range_probs)

    @classmethod
    def random(cls, language, country=None):
        """Return a random entrance identifier, or ``None`` if unconfigured.

        ``None`` is also returned implicitly when the sampled type is none of
        the types handled below.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.entrance_range, cls.entrance_range_cdf))

        if num_type == cls.HYPHENATED_NUMBER:
            first = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
            second = first + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
            return u'{}-{}'.format(first, second)

        # Remaining types all involve a letter.
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))

        # One draw decides between a space, a hyphen, or no separator.
        draw = random.random()
        if draw < whitespace_probability:
            joiner = u' '
        elif draw < (whitespace_probability + hyphen_probability):
            joiner = u'-'
        else:
            joiner = u''

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, joiner, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, joiner, letter)

    @classmethod
    def phrase(cls, entrance, language, country=None):
        """Return the phrase for ``entrance``, or ``None`` if it is ``None``."""
        if entrance is not None:
            return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
                                      dictionaries=['entrances'], country=country)
        return None
def random(cls, language, country=None):
    """Sample a prepositional phrase type for category queries.

    Reads the ``categories`` property and samples among the phrase types
    that have a ``<type>_probability`` entry.

    Returns the chosen phrase type, or ``None`` when the property is missing
    or no phrase type has a probability configured.
    """
    category_props = address_config.get_property('categories', language, country=country)
    if category_props is None:
        return None

    values = []
    probs = []

    for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
        k = '{}_probability'.format(prep_phrase_type)
        prob = category_props.get(k, None)
        if prob is not None:
            values.append(prep_phrase_type)
            probs.append(prob)

    # Nothing to sample from: avoid calling weighted_choice on empty lists.
    if not values:
        return None

    return weighted_choice(values, cdf(probs))
def random(cls, language, country=None):
    """Generate a random PO box identifier.

    Returns ``None`` when no alphanumeric type is configured, and also
    implicitly when the sampled type is not handled below.
    """
    num_type, num_type_props = cls.choose_alphanumeric_type(
        'po_boxes.alphanumeric', language, country=country)
    if num_type is None:
        return None

    # A letter-only box needs no digits at all.
    if num_type == cls.ALPHA:
        return cls.random_letter(language, country=country)

    digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
    lengths = []
    weights = []
    for entry in digit_config:
        lengths.append(entry['length'])
        weights.append(entry['probability'])

    num_digits = weighted_choice(lengths, cdf(weights))
    digits = cls.random_digits(num_digits)
    number = Digits.rewrite(digits, language, num_type_props)

    if num_type == cls.NUMERIC:
        return safe_decode(number)

    letter = cls.random_letter(language, country=country)

    whitespace_probability = float(
        num_type_props.get('whitespace_probability', 0.0))
    if whitespace_probability and random.random() < whitespace_probability:
        gap = six.u(' ')
    else:
        gap = six.u('')

    if num_type == cls.ALPHA_PLUS_NUMERIC:
        return six.u('{}{}{}').format(letter, gap, number)
    if num_type == cls.NUMERIC_PLUS_ALPHA:
        return six.u('{}{}{}').format(number, gap, letter)
def rewrite(cls, d, lang, props, num_type=CARDINAL):
    """Randomly re-render the digits ``d`` in another written form.

    ``props`` may hold ``<type>_probability`` entries for the spellout,
    full-width and Roman-numeral forms. Any probability mass left over is
    given to plain ASCII, which returns the decoded input.

    With falsy ``props`` the input is returned as-is, without decoding.
    """
    if not props:
        return d

    d = safe_decode(d)

    values = []
    probs = []
    for candidate in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
        prob_key = '{}_probability'.format(candidate)
        if prob_key not in props:
            continue
        values.append(candidate)
        probs.append(props[prob_key])

    total = sum(probs)
    if not isclose(total, 1.0):
        values.append(cls.ASCII)
        probs.append(1.0 - total)

    digit_type = weighted_choice(values, cdf(probs))

    if digit_type == cls.ASCII:
        return d
    if digit_type == cls.SPELLOUT:
        return cls.rewrite_spellout(d, lang, num_type, props)
    if digit_type == cls.ROMAN_NUMERAL:
        roman_numeral = cls.rewrite_roman_numeral(d)
        # Optionally attach an ordinal suffix to the numeral.
        if random.random() < props.get('ordinal_suffix_probability', 0.0):
            ordinal_suffix = ordinal_expressions.get_suffix(
                d, lang, gender=props.get('gender', None))
            if ordinal_suffix:
                roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
        return roman_numeral
    if digit_type == cls.UNICODE_FULL_WIDTH:
        return cls.rewrite_full_width(d)
    return d
def rewrite(cls, d, lang, props, num_type=CARDINAL):
    """Sample a digit representation for ``d`` and rewrite it accordingly.

    The candidates are spellout, Unicode full-width and Roman numeral, each
    considered only if ``props`` has its ``<type>_probability`` key. If
    those probabilities do not sum to 1, the remainder goes to ASCII.

    Returns ``d`` untouched when ``props`` is falsy.
    """
    if not props:
        return d

    d = safe_decode(d)

    candidates = (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL)
    values = [c for c in candidates if '{}_probability'.format(c) in props]
    probs = [props['{}_probability'.format(c)] for c in values]

    mass = sum(probs)
    if not isclose(mass, 1.0):
        values.append(cls.ASCII)
        probs.append(1.0 - mass)

    digit_type = weighted_choice(values, cdf(probs))

    if digit_type == cls.ASCII:
        return d
    if digit_type == cls.SPELLOUT:
        return cls.rewrite_spellout(d, lang, num_type, props)
    if digit_type == cls.ROMAN_NUMERAL:
        result = cls.rewrite_roman_numeral(d)
        wants_suffix = random.random() < props.get('ordinal_suffix_probability', 0.0)
        if wants_suffix:
            suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
            if suffix:
                result = six.u('{}{}').format(result, suffix)
        return result
    if digit_type == cls.UNICODE_FULL_WIDTH:
        return cls.rewrite_full_width(d)
    return d
def pick_phrase_and_type(cls, number, language, country=None):
    """Choose a phrase and a numeric type for rendering ``number``.

    Returns ``(num_type, phrase, phrase_props[num_type])``. When no phrase
    alternatives exist, returns ``(None, number, None)`` with ``number``
    decoded unless it is ``None``.
    """
    values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
    if not values:
        if number is not None:
            return None, safe_decode(number), None
        return None, None, None

    phrase, phrase_props = weighted_choice(values, probs)

    type_names = []
    type_weights = []
    for type_name in (cls.NUMERIC, cls.NUMERIC_AFFIX):
        type_weight = phrase_props.get('{}_probability'.format(type_name), None)
        if type_weight is not None:
            type_names.append(type_name)
            type_weights.append(type_weight)

    # Default to the numeric type when the phrase configures no probabilities.
    num_type = cls.NUMERIC
    if type_weights:
        num_type = weighted_choice(type_names, cdf(type_weights))

    return num_type, phrase, phrase_props[num_type]
def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items).

    If we look at something like unit numbers, "Unit A" or "Unit B" are much
    more likely than "Unit X" or "Unit Z" simply because most dwellings only
    have a few units. Sampling letters from a Zipfian distribution rather than
    uniformly means that instead of every letter having the same likelihood
    (1/26), letters toward the beginning of the alphabet are much more likely
    to be selected. Letters toward the end can still be selected sometimes,
    but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in
    order of frequency.

    :param alphabet: iterable of hashable items, most frequent first
    :param b: exponent passed to zipfian_distribution
    :return: one item of the alphabet
    '''
    alphabet = tuple(alphabet)

    # The CDF depends on both the alphabet and the exponent, so both must be
    # part of the cache key. Keying on the alphabet alone would reuse the
    # distribution computed for whichever value of b was seen first.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        alphabets[cache_key] = cdf(zipfian_distribution(len(alphabet), b))

    return weighted_choice(alphabet, alphabets[cache_key])
def random(cls, language, country=None):
    """Return a random PO box identifier for the language/country.

    Depending on the sampled alphanumeric type the result is a letter, a
    digit string, or a letter/number combination. ``None`` is returned when
    no type is configured or the sampled type is not handled here.
    """
    num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
    if num_type is None:
        return None

    if num_type == cls.ALPHA:
        return cls.random_letter(language, country=country)

    # Every other type needs a digit string of a sampled length.
    digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
    values = []
    probs = []
    for option in digit_config:
        values.append(option['length'])
        probs.append(option['probability'])

    num_digits = weighted_choice(values, cdf(probs))
    number = Digits.rewrite(cls.random_digits(num_digits), language, num_type_props)

    if num_type == cls.NUMERIC:
        return safe_decode(number)

    letter = cls.random_letter(language, country=country)

    whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
    use_space = whitespace_probability and random.random() < whitespace_probability
    whitespace_phrase = six.u(' ') if use_space else six.u('')

    if num_type == cls.ALPHA_PLUS_NUMERIC:
        return six.u('{}{}{}').format(letter, whitespace_phrase, number)
    if num_type == cls.NUMERIC_PLUS_ALPHA:
        return six.u('{}{}{}').format(number, whitespace_phrase, letter)
class Floor(NumberedComponent):
    """Random floor/level numbers and the phrases used to express them."""

    # When we don't know the number of floors, use a Zipfian distribution
    # to choose randomly between 1 and max_floors with 1 being much more
    # likely than 2, etc.
    max_floors = 10
    max_basements = 2
    # list(...) is required: on Python 3 range objects cannot be concatenated.
    numbered_floors = list(range(max_floors + 1)) + list(range(-1, -max_basements - 1, -1))
    floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
    floor_probs_cdf = cdf(floor_probs)

    # For use with letters e.g. A0 is probably not as common
    floors_letters = list(range(1, max_floors + 1)) + [0]
    floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
    floors_letters_cdf = cdf(floors_letters_probs)

    @classmethod
    def sample_floors(cls, num_floors, num_basements=0):
        """Uniformly sample a floor in ``[-num_basements, num_floors - 1]``.

        The upper bound is 0 when ``num_floors`` is not positive.
        """
        num_floors = int(num_floors)
        return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)

    @classmethod
    def sample_floors_range(cls, min_floor, max_floor):
        """Uniformly sample a floor in ``[min_floor, max_floor - 1]``.

        Returns ``min_floor`` when ``max_floor`` does not exceed it.
        """
        return random.randint(
            min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)

    @classmethod
    def random_int(cls, language, country=None, num_floors=None, num_basements=None):
        """Pick a random integer floor.

        Without a usable ``num_floors`` the floor is drawn from the Zipfian
        distribution over ``numbered_floors``. Otherwise it is drawn
        uniformly, from the whole building when it has at most ``max_floors``
        floors and from the floors above ``max_floors`` when it is taller.
        """
        number = None
        if num_floors is not None:
            try:
                num_floors = int(num_floors)
            except (ValueError, TypeError):
                return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

            if num_floors <= cls.max_floors:
                number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
            else:
                number = cls.sample_floors_range(cls.max_floors + 1, num_floors)
        else:
            number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

        return number

    @classmethod
    def random_from_int(cls, number, language, country=None):
        """Render the integer floor ``number`` in a randomly chosen style.

        Returns ``None`` when no alphanumeric type is configured or the
        sampled type is not handled.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'levels.alphanumeric', language, country=country)
        if num_type is None:
            return None

        numbering_starts_at = int(
            address_config.get_property('levels.numbering_starts_at', language,
                                        country=country, default=0))

        # Only non-negative floors are shifted by the numbering offset.
        if number >= 0:
            number += numbering_starts_at

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.ROMAN_NUMERAL:
            roman_numeral = numeric_expressions.roman_numeral(number)
            if roman_numeral is not None:
                return roman_numeral
            else:
                return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            # sample_floors_range is a classmethod, so it must go through cls.
            number2 = number + cls.sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, number2)
        else:
            alphabet = address_config.get_property('alphabet', language,
                                                   country=country,
                                                   default=latin_alphabet)
            alphabet_probability = address_config.get_property(
                'alphabet_probability', language, country=country, default=None)
            if alphabet_probability is not None and random.random() >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return letter
            else:
                number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}').format(letter, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}').format(number, letter)

        return None

    @classmethod
    def random(cls, language, country=None, num_floors=None, num_basements=None):
        """Sample an integer floor and render it in a random style."""
        number = cls.random_int(language, country=country,
                                num_floors=num_floors,
                                num_basements=num_basements)
        return cls.random_from_int(number, language, country=country)

    @classmethod
    def phrase(cls, floor, language, country=None, num_floors=None):
        """Return a phrase for ``floor``, using configured aliases if any apply.

        Non-numeric floors go straight to the generic alphanumeric phrase.
        Returns ``None`` when ``floor`` is ``None``.
        """
        if floor is None:
            return None

        integer_floor = False
        floor = safe_decode(floor)
        try:
            floor = int(floor)
            integer_floor = True
        except (ValueError, TypeError):
            try:
                floor = float(floor)
                integer_floor = int(floor) == floor
            except (ValueError, TypeError):
                return cls.numeric_phrase(
                    'levels.alphanumeric', floor, language,
                    dictionaries=['level_types_numbered'], country=country)

        numbering_starts_at = int(
            address_config.get_property('levels.numbering_starts_at', language,
                                        country=country, default=0))
        try:
            num_floors = int(num_floors)
            top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
            is_top = num_floors and floor == top_floor
        except (ValueError, TypeError):
            # num_floors is None or not numeric: the top floor is unknown.
            is_top = False

        alias_prefix = 'levels.aliases'
        aliases = address_config.get_property(alias_prefix, language, country=country)
        if aliases:
            alias = None

            if not integer_floor and floor >= 0 and 'half_floors' in aliases:
                floor = int(floor)
                alias = 'half_floors'
            elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
                floor = int(floor)
                alias = 'half_floors_negative'
            elif floor < -1 and '<-1' in aliases:
                alias = '<-1'
            elif is_top and 'top' in aliases:
                alias = 'top'
            elif safe_decode(floor) in aliases:
                alias = safe_decode(floor)

            floor = safe_decode(floor)

            if alias:
                alias_props = aliases.get(alias)

                # Aliases upon aliases, e.g. for something like "Upper Mezzanine"
                # where it's an alias for "1" under the half_floors key
                if safe_decode(floor) in alias_props.get('aliases', {}):
                    alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
                    alias = safe_decode(floor)

            if alias:
                return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
                                          dictionaries=[
                                              'level_types_basement',
                                              'level_types_mezzanine',
                                              'level_types_numbered',
                                              'level_types_standalone',
                                              'level_types_sub_basement'
                                          ],
                                          country=country)

        return cls.numeric_phrase('levels.alphanumeric', floor, language,
                                  dictionaries=['level_types_numbered'],
                                  country=country)
def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
    """Return a copy of ``components`` with admin components dropped or added.

    Existing admin components that ``include_component`` rejects are
    removed, and configured default values are added for admin components
    the input lacks. ``drop_invalid_components`` then runs on the result.
    The input mapping itself is not modified.
    """
    # (type, id) pairs of the boundaries that contain this address.
    boundary_keys = ((b.get('type'), safe_encode(b.get('id', ''))) for b in boundaries)
    containing_ids = set((obj_type, obj_id) for obj_type, obj_id in boundary_keys if obj_type and obj_id)

    original_bitset = ComponentDependencies.component_bitset(components)

    # Group the admin components present in the input by their value.
    admin_components = [c for c in components if c in self.ADMIN_COMPONENTS]
    names = defaultdict(list)
    for admin in admin_components:
        names[components[admin]].append(admin)

    same_name = set()
    for shared in six.itervalues(names):
        if len(shared) > 1:
            same_name.update(shared)

    new_components = components.copy()

    if AddressFormatter.CITY not in components:
        city_replacements = self.city_replacements(country)
    else:
        city_replacements = set()

    for component in admin_components:
        include = self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city)
        if include or component in city_replacements:
            continue

        # Note: this check is for cities that have the same name as their admin
        # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
        # city, we don't want to include country on its own. This should help the parser
        # default to the city in ambiguous cases where only one component is specified.
        if component == AddressFormatter.CITY and component in same_name:
            for twin in names[components[component]]:
                new_components.pop(twin, None)
        else:
            new_components.pop(component, None)

    for component in self.ADMIN_COMPONENTS:
        value = self.get_property(('components', component, 'value'), country=country, default=None)

        if not value:
            # No fixed value: sample from the configured alternatives,
            # caching the computed CDF per (country, component).
            cache_key = (country, component)
            values, probs = self.cdf_cache.get(cache_key, (None, None))
            if values is None:
                values = self.get_property(('components', component, 'values'), country=country, default=None)
                if values is not None:
                    values, probs = zip(*[(v['value'], float(v['probability'])) for v in values])
                    probs = cdf(probs)
                    self.cdf_cache[cache_key] = (values, probs)

            if values is not None:
                value = weighted_choice(values, probs)

        if value is None or component in components:
            continue
        if self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city):
            new_components[component] = value

    self.drop_invalid_components(new_components, country, original_bitset=original_bitset)

    return new_components
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
    """Load the boundary-names config and precompute sampling tables.

    Populates ``name_keys``/``name_key_probs``, ``component_name_keys``,
    ``country_regex_replacements``, prefix/suffix tables with their
    compiled regexes, and per-object ``exceptions``.

    Raises:
        AssertionError: if a prefix/suffix entry lacks its 'prefix' or
            'suffix' key.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects; fine for a trusted config file, but consider
    # yaml.safe_load if the file can come from elsewhere.
    with open(config_file) as f:
        config = yaml.load(f)

    default_names = nested_get(config, ('names', 'keys'))
    name_keys, probs = alternative_probabilities(default_names)

    self.name_keys = name_keys
    self.name_key_probs = cdf(probs)

    self.component_name_keys = {}

    for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
        component_names = component_config.get('keys')
        component_name_keys, component_probs = alternative_probabilities(component_names)
        self.component_name_keys[component] = (component_name_keys, cdf(component_probs))

    self.country_regex_replacements = defaultdict(list)
    for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
        country = props.get('country')
        re_flags = re.I | re.UNICODE
        if not props.get('case_insensitive', True):
            # Clear the case-insensitive bit on the local flags value.
            re_flags ^= re.I

        pattern = re.compile(props['pattern'], re_flags)
        replace_group = props['replace_with_group']
        replace_probability = props['replace_probability']
        self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))

    self.country_regex_replacements = dict(self.country_regex_replacements)

    self.prefixes = {}
    self.prefix_regexes = {}
    self.suffixes = {}
    self.suffix_regexes = {}

    for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={})):
        for component, affixes in six.iteritems(components):
            affix_values, probs = alternative_probabilities(affixes)

            for val in affix_values:
                if 'prefix' not in val:
                    raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))

            prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
            # Group the alternation so "^" anchors every alternative,
            # not only the first one.
            self.prefix_regexes[(language, component)] = re.compile(six.u('^(?:{})').format(prefix_regex), re.I | re.U)

            # Leftover probability mass means "no prefix".
            if not isclose(sum(probs), 1.0):
                affix_values.append(None)
                probs.append(1.0 - sum(probs))
            affix_probs_cdf = cdf(probs)
            self.prefixes[(language, component)] = affix_values, affix_probs_cdf

    for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={})):
        for component, affixes in six.iteritems(components):
            affix_values, probs = alternative_probabilities(affixes)

            for val in affix_values:
                if 'suffix' not in val:
                    raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))

            suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
            # Group the alternation so "$" anchors every alternative,
            # not only the last one.
            self.suffix_regexes[(language, component)] = re.compile(six.u('(?:{})$').format(suffix_regex), re.I | re.U)

            # Leftover probability mass means "no suffix".
            if not isclose(sum(probs), 1.0):
                affix_values.append(None)
                probs.append(1.0 - sum(probs))
            affix_probs_cdf = cdf(probs)
            self.suffixes[(language, component)] = affix_values, affix_probs_cdf

    self.exceptions = {}

    for props in nested_get(config, ('names', 'exceptions'), default=[]):
        object_type = props['type']
        object_id = safe_encode(props['id'])
        keys = [props['default']]
        probs = [props['probability']]
        for alt in props.get('alternatives', []):
            keys.append(alt['alternative'])
            probs.append(alt['probability'])

        probs = cdf(probs)
        self.exceptions[(object_type, object_id)] = (keys, probs)
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
    """Load the boundary-names config and precompute sampling tables.

    Populates ``name_keys``/``name_key_probs``, ``component_name_keys``,
    ``country_regex_replacements``, prefix/suffix tables with their
    compiled regexes, and per-object ``exceptions``.

    Raises:
        AssertionError: if a prefix/suffix entry lacks its 'prefix' or
            'suffix' key.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects; fine for a trusted config file, but consider
    # yaml.safe_load if the file can come from elsewhere.
    with open(config_file) as f:
        config = yaml.load(f)

    default_names = nested_get(config, ('names', 'keys'))
    name_keys, probs = alternative_probabilities(default_names)

    self.name_keys = name_keys
    self.name_key_probs = cdf(probs)

    self.component_name_keys = {}

    for component, component_config in six.iteritems(
            nested_get(config, ('names', 'components'), default={})):
        component_names = component_config.get('keys')
        component_name_keys, component_probs = alternative_probabilities(
            component_names)
        self.component_name_keys[component] = (component_name_keys,
                                               cdf(component_probs))

    self.country_regex_replacements = defaultdict(list)
    for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
        country = props.get('country')
        re_flags = re.I | re.UNICODE
        if not props.get('case_insensitive', True):
            # Clear the case-insensitive bit on the local flags value.
            re_flags ^= re.I

        pattern = re.compile(props['pattern'], re_flags)
        replace_group = props['replace_with_group']
        replace_probability = props['replace_probability']
        self.country_regex_replacements[country].append(
            (pattern, replace_group, replace_probability))

    self.country_regex_replacements = dict(self.country_regex_replacements)

    self.prefixes = {}
    self.prefix_regexes = {}
    self.suffixes = {}
    self.suffix_regexes = {}

    for language, components in six.iteritems(
            nested_get(config, ('names', 'prefixes', 'language'), default={})):
        for component, affixes in six.iteritems(components):
            affix_values, probs = alternative_probabilities(affixes)

            for val in affix_values:
                if 'prefix' not in val:
                    raise AssertionError(
                        six.u('Invalid prefix value for (language={}, component={}): {} '
                              ).format(language, component, val))

            prefix_regex = six.u('|').join([
                six.u('(?:{} )').format(self._string_as_regex(v['prefix']))
                if v.get('whitespace') else self._string_as_regex(v['prefix'])
                for v in affix_values
            ])
            # Group the alternation so "^" anchors every alternative,
            # not only the first one.
            self.prefix_regexes[(language, component)] = re.compile(
                six.u('^(?:{})').format(prefix_regex), re.I | re.U)

            # Leftover probability mass means "no prefix".
            if not isclose(sum(probs), 1.0):
                affix_values.append(None)
                probs.append(1.0 - sum(probs))
            affix_probs_cdf = cdf(probs)
            self.prefixes[(language, component)] = affix_values, affix_probs_cdf

    for language, components in six.iteritems(
            nested_get(config, ('names', 'suffixes', 'language'), default={})):
        for component, affixes in six.iteritems(components):
            affix_values, probs = alternative_probabilities(affixes)

            for val in affix_values:
                if 'suffix' not in val:
                    raise AssertionError(
                        six.u('Invalid suffix value for (language={}, component={}): {} '
                              ).format(language, component, val))

            suffix_regex = six.u('|').join([
                six.u('(?: {})').format(self._string_as_regex(v['suffix']))
                if v.get('whitespace') else self._string_as_regex(v['suffix'])
                for v in affix_values
            ])
            # Group the alternation so "$" anchors every alternative,
            # not only the last one.
            self.suffix_regexes[(language, component)] = re.compile(
                six.u('(?:{})$').format(suffix_regex), re.I | re.U)

            # Leftover probability mass means "no suffix".
            if not isclose(sum(probs), 1.0):
                affix_values.append(None)
                probs.append(1.0 - sum(probs))
            affix_probs_cdf = cdf(probs)
            self.suffixes[(language, component)] = affix_values, affix_probs_cdf

    self.exceptions = {}

    for props in nested_get(config, ('names', 'exceptions'), default=[]):
        object_type = props['type']
        object_id = safe_encode(props['id'])
        keys = [props['default']]
        probs = [props['probability']]
        for alt in props.get('alternatives', []):
            keys.append(alt['alternative'])
            probs.append(alt['probability'])

        probs = cdf(probs)
        self.exceptions[(object_type, object_id)] = (keys, probs)
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
    # Build a phrase such as "Floor 2", "2/F" or "2nd Floor" for ``num``.
    #
    # key:            config key whose phrase alternatives are sampled
    # num:            the number (or alphanumeric identifier); may be None
    # dictionaries:   passed through to alternative_probabilities
    # strict_numeric: if true, an identifier containing letters is returned
    #                 decoded with no phrase attached
    # is_alpha:       if true, alternatives under '<key>.alpha' are tried first
    #
    # Returns the phrase, the bare number, or None depending on the sampled
    # configuration (see the individual return statements below).
    has_alpha = False
    has_numeric = True
    is_integer = False
    is_none = False
    if num is not None:
        try:
            num_int = int(num)
            is_integer = True
        except ValueError:
            try:
                num_float = float(num)
            except ValueError:
                # Neither int nor float: tokenize to see whether the value
                # contains numeric and/or alphabetic parts.
                tokens = tokenize(safe_decode(num))
                has_numeric = False
                for t, c in tokens:
                    if c == token_types.NUMERIC:
                        has_numeric = True
                    if any((ch.isalpha() for ch in t)):
                        has_alpha = True

                if strict_numeric and has_alpha:
                    return safe_decode(num)

    else:
        is_none = True

    values, probs = None, None

    if is_alpha:
        values, probs = address_config.alternative_probabilities(
            '{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

    # Pick a phrase given the probability distribution from the config
    if values is None:
        values, probs = address_config.alternative_probabilities(
            key, language, dictionaries=dictionaries, country=country)

    if not values:
        return safe_decode(num) if not is_none else None

    phrase, phrase_props = weighted_choice(values, probs)

    values = []
    probs = []

    # Dictionaries are lowercased, so title case here
    if phrase_props.get('title_case', True):
        phrase = phrase.title()

    '''
    There are a few ways we can express the number itself

    1. Alias it as some standalone word like basement (for floor "-1")
    2. Use the number itself, so "Floor 2"
    3. Append/prepend an affix e.g. 2/F for second floor
    4. As an ordinal expression e.g. "2nd Floor"
    '''
    have_standalone = False
    have_null = False
    for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
        # Note: from here on ``key`` no longer holds the caller's config key.
        key = '{}_probability'.format(num_type)
        prob = phrase_props.get(key)
        if prob is not None:
            if num_type == 'standalone':
                have_standalone = True
            elif num_type == 'null':
                have_null = True
            values.append(num_type)
            probs.append(prob)
        elif num_type in phrase_props:
            # A type configured without a probability gets weight 1.0 and
            # ends the scan.
            values.append(num_type)
            probs.append(1.0)
            break

    if not probs or is_none:
        return phrase

    # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
    if has_alpha:
        # NOTE(review): if no entry survives this filter, zip(*[]) yields
        # nothing and the unpacking raises ValueError -- confirm whether
        # configs can hit this.
        values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')])
        total = float(sum(probs))
        if isclose(total, 0.0):
            return None

        # Renormalize the remaining probabilities.
        probs = [p / total for p in probs]

    probs = cdf(probs)

    if len(values) < 2:
        if have_standalone:
            num_type = 'standalone'
        elif have_null:
            num_type = 'null'
        else:
            num_type = 'numeric'
    else:
        num_type = weighted_choice(values, probs)

    if num_type == 'standalone':
        return phrase
    elif num_type == 'null':
        return safe_decode(num)

    props = phrase_props[num_type]

    if is_integer:
        num_int = int(num)
        if phrase_props.get('number_abs_value', False):
            num_int = abs(num_int)
            num = num_int

        if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
            return None

        if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
            return None

        if phrase_props.get('number_subtract_abs_value'):
            num_int -= phrase_props['number_subtract_abs_value']
            num = num_int

    num = safe_decode(num)
    digits_props = props.get('digits')
    if digits_props:
        # Inherit the gender and category e.g. for ordinals
        # (this writes into the dict obtained from props).
        for k in ('gender', 'category'):
            if k in props:
                digits_props[k] = props[k]
        num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

    # Do we add the numeric phrase e.g. Floor No 1
    add_number_phrase = props.get('add_number_phrase', False)
    if add_number_phrase and random.random() < props['add_number_phrase_probability']:
        num = Number.phrase(num, language, country=country)

    whitespace_default = True

    if num_type == 'numeric_affix':
        # The affix replaces the sampled phrase.
        phrase = props['affix']
        if props.get('upper_case', True):
            phrase = phrase.upper()
        if 'zero_pad' in props and num.isdigit():
            num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
        whitespace_default = False
    elif num_type == 'ordinal' and safe_decode(num).isdigit():
        ordinal_expression = ordinal_expressions.suffixed_number(
            num, language, gender=props.get('gender', None))

        if ordinal_expression is not None:
            num = ordinal_expression

    # Optionally return the number on its own, with no phrase.
    if 'null_phrase_probability' in props and (
            num_type == 'ordinal' or
            (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
        if random.random() < props['null_phrase_probability']:
            return num

    direction = props['direction']
    whitespace = props.get('whitespace', whitespace_default)

    whitespace_probability = props.get('whitespace_probability')
    if whitespace_probability is not None:
        whitespace = random.random() < whitespace_probability

    # Occasionally switch up if direction_probability is specified
    if random.random() > props.get('direction_probability', 1.0):
        if direction == 'left':
            direction = 'right'
        elif direction == 'right':
            direction = 'left'

    whitespace_phrase = six.u(' ') if whitespace else six.u('')
    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(num)
def dropout_components(self, components, boundaries=(), country=None,
                       population=None, unambiguous_city=False):
    """Return a copy of ``components`` with admin components dropped or added.

    Each admin component present in ``components`` is kept or removed
    according to ``self.include_component``; afterwards, configured default
    values may be added for admin components that were absent from the input.
    ``components`` itself is not modified.

    :param components: mapping of component name to value.
    :param boundaries: iterable of dict-like objects read via
        ``.get('type')`` and ``.get('id')``; entries missing either one are
        ignored.
    :param country: forwarded to the config lookups and include_component.
    :param population: forwarded to include_component.
    :param unambiguous_city: forwarded to include_component.
    :return: the new component mapping, after ``drop_invalid_components``
        has been applied to it.
    """
    # (type, id) pairs of the boundaries handed to include_component
    containing_ids = set()
    for boundary in boundaries:
        boundary_type = boundary.get('type')
        boundary_id = safe_encode(boundary.get('id', ''))
        if boundary_type and boundary_id:
            containing_ids.add((boundary_type, boundary_id))

    original_bitset = ComponentDependencies.component_bitset(components)

    admin_components = [c for c in components if c in self.ADMIN_COMPONENTS]

    # Group the admin components by their value to find shared names
    names = defaultdict(list)
    for admin_component in admin_components:
        names[components[admin_component]].append(admin_component)

    same_name = set()
    for sharing in six.itervalues(names):
        if len(sharing) > 1:
            same_name.update(sharing)

    new_components = components.copy()

    if AddressFormatter.CITY in components:
        city_replacements = set()
    else:
        city_replacements = self.city_replacements(country)

    include_kwargs = dict(country=country, population=population,
                          unambiguous_city=unambiguous_city)

    for component in admin_components:
        if self.include_component(component, containing_ids, **include_kwargs):
            continue
        if component in city_replacements:
            continue

        # Note: this check is for cities that have the same name as their admin
        # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
        # city, we don't want to include country on its own. This should help the parser
        # default to the city in ambiguous cases where only one component is specified.
        if component == AddressFormatter.CITY and component in same_name:
            for namesake in names[components[component]]:
                new_components.pop(namesake, None)
        else:
            new_components.pop(component, None)

    for component in self.ADMIN_COMPONENTS:
        value = self.get_property(('components', component, 'value'),
                                  country=country, default=None)

        if not value:
            # No fixed value configured: draw one from the configured
            # alternatives, memoizing (values, cdf) per (country, component).
            cache_key = (country, component)
            values, probs = self.cdf_cache.get(cache_key, (None, None))
            if values is None:
                values = self.get_property(('components', component, 'values'),
                                           country=country, default=None)
                if values is not None:
                    values, probs = zip(*[(v['value'], float(v['probability']))
                                          for v in values])
                    probs = cdf(probs)
                    self.cdf_cache[cache_key] = (values, probs)

            if values is not None:
                value = weighted_choice(values, probs)

        if value is None or component in components:
            continue
        if self.include_component(component, containing_ids, **include_kwargs):
            new_components[component] = value

    self.drop_invalid_components(new_components, country,
                                 original_bitset=original_bitset)

    return new_components
class Unit(NumberedComponent):
    """Random unit identifiers and the phrases built around them.

    The class-level tables below are candidate unit numbers paired with
    Zipfian CDFs for use with ``weighted_choice``. They are built with
    ``list(range(...))`` and the built-in ``zip`` so the class body evaluates
    on both Python 2 and Python 3.
    """

    # When we don't know the number of units, use a Zipfian distribution
    # to choose randomly between 1 and max_units with 1 being much more
    # likely than 2, etc.
    max_units = 99
    max_basements = 2

    hundreds_numbered_units_tens = [
        list(range(101, 110)) + [100],
        list(range(201, 210)) + [200],
        list(range(301, 310)) + [300],
        list(range(401, 410)) + [400],
        list(range(501, 510)) + [500],
    ]
    hundreds_numbered_units = [
        list(range(110, 200)),
        list(range(210, 300)),
        list(range(310, 400)),
        list(range(410, 500)),
        list(range(510, 600)),
    ]

    thousands_numbered_units = [
        list(range(1001, 1030)) + [1000],
        list(range(2001, 2030)) + [2000],
        list(range(3001, 3030)) + [3000],
        list(range(4001, 4030)) + [4000],
        list(range(5001, 5030)) + [5000],
    ]

    # zip(*groups) interleaves the groups (101, 201, ..., 102, 202, ...)
    # before chain() flattens them into one ordering.
    numbered_units = list(range(1, 10))
    numbered_units.extend(itertools.chain(*zip(*hundreds_numbered_units_tens)))
    numbered_units.extend(range(10, 100))
    numbered_units.extend(itertools.chain(*zip(*hundreds_numbered_units)))
    numbered_units.extend(itertools.chain(*zip(*thousands_numbered_units)))
    numbered_units.extend(list(range(10001, 10100)) + [10000])
    numbered_units.append(0)
    numbered_units.extend(range(0, -max_basements - 1, -1))

    unit_probs = zipfian_distribution(len(numbered_units), 0.7)
    unit_probs_cdf = cdf(unit_probs)

    num_digits = [2, 3, 4]
    num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
    num_digits_cdf = cdf(num_digits_probs)

    # For use with floors e.g. #301 more common than #389
    positive_units_floors = (list(range(1, 10)) + [0] +
                             list(range(10, max_units + 1)))
    positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
    positive_units_floors_cdf = cdf(positive_units_floors_probs)

    # For basic positive units
    positive_units = list(range(1, max_units + 1))
    positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
    positive_units_cdf = cdf(positive_units_probs)

    # For use with letters e.g. A0 less common
    positive_units_letters = list(range(1, max_units + 1)) + [0]
    positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
    positive_units_letters_cdf = cdf(positive_units_letters_probs)

    RESIDENTIAL = 'residential'
    COMMERCIAL = 'commercial'
    INDUSTRIAL = 'industrial'
    UNIVERSITY = 'university'

    @classmethod
    def sample_num_digits(cls):
        """Draw a digit count from ``num_digits`` using ``num_digits_cdf``."""
        return weighted_choice(cls.num_digits, cls.num_digits_cdf)

    @classmethod
    def for_floor(cls, floor_number, num_digits=None):
        """Return text made of ``floor_number`` followed by a zero-padded unit.

        :param floor_number: value formatted in front of the unit.
        :param num_digits: width the unit is zero-filled to; sampled with
            ``sample_num_digits`` when None.
        """
        if num_digits is None:
            num_digits = cls.sample_num_digits()
        unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
        return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))

    @classmethod
    def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
        """Generate a random unit identifier for ``language`` / ``country``.

        The alphanumeric type comes from ``choose_alphanumeric_type`` with the
        'units.alphanumeric' config key.

        :param num_floors: forwarded to ``Floor.random_int`` when a floor has
            to be drawn; when both this and ``floor`` are None the floor-based
            numbering branch is never taken.
        :param num_basements: forwarded to ``Floor.random_int``.
        :param floor: optional floor; replaced by a random one in the
            floor-based branch when it is None or not all digits.
        :return: the identifier as text, or None when no alphanumeric type is
            configured or the chosen type is not handled below.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'units.alphanumeric', language, country=country)
        if num_type is None:
            return None

        use_floor_prob = address_config.get_property(
            'units.alphanumeric.use_floor_probability', language,
            country=country, default=0.0)

        use_positive_numbers_prob = address_config.get_property(
            'units.alphanumeric.use_positive_numbers_probability', language,
            country=country, default=0.0)

        if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
            if random.random() >= use_positive_numbers_prob:
                number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
            else:
                number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
        else:
            if floor is None or not floor.isdigit():
                floor = Floor.random_int(language, country=country,
                                         num_floors=num_floors,
                                         num_basements=num_basements)

            floor_numbering_starts_at = address_config.get_property(
                'levels.numbering_starts_at', language, country=country, default=0)
            ground_floor_starts_at = address_config.get_property(
                'units.alphanumeric.use_floor_ground_starts_at', language,
                country=country, default=None)

            if ground_floor_starts_at is not None:
                # Re-base the floor onto the configured ground-floor number;
                # a floor that is not an integer is left untouched.
                try:
                    floor = int(floor)
                    if floor >= floor_numbering_starts_at:
                        floor -= floor_numbering_starts_at
                    floor += ground_floor_starts_at
                    floor = safe_decode(floor)
                except (TypeError, ValueError):
                    pass

            use_floor_affix_prob = address_config.get_property(
                'units.alphanumeric.use_floor_numeric_affix_probability',
                language, country=country, default=0.0)
            if use_floor_affix_prob and random.random() < use_floor_affix_prob:
                floor_phrase = Floor.phrase(floor, language, country=country)
                # Only works if the floor phrase is strictly numeric e.g. "1" or "H1"
                if is_numeric_strict(floor_phrase):
                    unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)

                    unit_num_digits = address_config.get_property(
                        'units.alphanumeric.use_floor_unit_num_digits',
                        language, country=country, default=None)
                    if unit_num_digits is not None:
                        unit = safe_decode(unit).zfill(unit_num_digits)

                    return six.u('{}{}').format(floor_phrase, unit)

            floor_num_digits = address_config.get_property(
                'units.alphanumeric.use_floor_floor_num_digits', language,
                country=country, default=None)
            # NOTE(review): assumes floor is text at this point - confirm what
            # Floor.random_int returns when ground_floor_starts_at is unset.
            if floor_num_digits is not None and floor.isdigit():
                floor = floor.zfill(floor_num_digits)

            number = cls.for_floor(floor)

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
            range_prob = float(address_config.get_property(
                'units.alphanumeric.hyphenated_number.range_probability',
                language, country=country, default=0.5))
            direction = address_config.get_property(
                'units.alphanumeric.hyphenated_number.direction',
                language, country=country, default='right')
            direction_prob = float(address_config.get_property(
                'units.alphanumeric.hyphenated_number.direction_probability',
                language, country=country, default=0.0))

            if random.random() < direction_prob:
                direction = 'left' if direction == 'right' else 'right'

            direction_right = direction == 'right'

            if random.random() < range_prob:
                # `number` is text when it came from for_floor(), so do the
                # range arithmetic on its integer value; mixing int and text
                # here used to raise TypeError. Non-integer text skips the
                # adjustment.
                try:
                    base = int(number)
                except (TypeError, ValueError):
                    base = None
                if base is not None:
                    if direction_right:
                        number2 += base
                    else:
                        number2 = max(0, base - number2)

            if direction == 'right':
                return u'{}-{}'.format(number, number2)
            else:
                return u'{}-{}'.format(number2, number)
        else:
            alphabet = address_config.get_property(
                'alphabet', language, country=country, default=latin_alphabet)
            alphabet_probability = address_config.get_property(
                'alphabet_probability', language, country=country, default=None)
            if alphabet_probability is not None and random.random() >= alphabet_probability:
                alphabet = latin_alphabet

            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return safe_decode(letter)
            else:
                if num_floors is None:
                    number = weighted_choice(cls.positive_units_letters,
                                             cls.positive_units_letters_cdf)

                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
                hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
                whitespace_phrase = u''
                # A single draw picks between space, hyphen and no separator
                r = random.random()
                if r < whitespace_probability:
                    whitespace_phrase = u' '
                elif r < (whitespace_probability + hyphen_probability):
                    whitespace_phrase = u'-'

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)

    @classmethod
    def add_direction(cls, key, unit, language, country=None):
        """Possibly turn ``unit`` into a ``RelativeDirection`` phrase.

        :param key: config key prefix for the ``add_direction_*`` settings.
        :return: ``unit`` unchanged when the probability draw fails; a
            direction phrase when the matching numeric/standalone setting is
            enabled; otherwise None (implicitly).
        """
        add_direction_probability = address_config.get_property(
            '{}.add_direction_probability'.format(key), language,
            country=country, default=0.0)
        if not random.random() < add_direction_probability:
            return unit
        add_direction_numeric = address_config.get_property(
            '{}.add_direction_numeric'.format(key), language, country=country)
        try:
            unit = int(unit)
            integer_unit = True
        except (ValueError, TypeError):
            integer_unit = False

        if add_direction_numeric and integer_unit:
            return RelativeDirection.phrase(unit, language, country=country)
        elif not integer_unit:
            add_direction_standalone = address_config.get_property(
                '{}.add_direction_standalone'.format(key), language, country=country)
            if add_direction_standalone:
                return RelativeDirection.phrase(None, language, country=country)

    @classmethod
    def add_quadrant(cls, key, unit, language, country=None):
        """Possibly combine ``unit`` with a lateral and an anteroposterior phrase.

        :param key: config key prefix for the ``add_quadrant_*`` settings.
        :return: ``unit`` unchanged when the probability draw fails or no
            known first direction is configured; None when ``unit`` is not an
            integer and standalone quadrants are disabled; otherwise the
            result of applying both direction classes in the configured order.
        """
        add_quadrant_probability = address_config.get_property(
            '{}.add_quadrant_probability'.format(key), language,
            country=country, default=0.0)
        if not random.random() < add_quadrant_probability:
            return unit
        # NOTE(review): looked up but not consulted anywhere below.
        add_quadrant_numeric = address_config.get_property(
            '{}.add_quadrant_numeric'.format(key), language, country=country)
        try:
            unit = int(unit)
            integer_unit = True
        except (ValueError, TypeError):
            integer_unit = False

        first_direction = address_config.get_property(
            '{}.add_quadrant_first_direction'.format(key), language, country=country)

        if first_direction == 'lateral':
            ordering = (LateralDirection, AnteroposteriorDirection)
        elif first_direction == 'anteroposterior':
            ordering = (AnteroposteriorDirection, LateralDirection)
        else:
            return unit

        if not integer_unit:
            add_quadrant_standalone = address_config.get_property(
                '{}.add_quadrant_standalone'.format(key), language, country=country)
            if add_quadrant_standalone:
                unit = None
            else:
                return None

        last_num_type = None
        for c in ordering:
            num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
            whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
            unit = c.combine_with_number(unit, phrase, num_type, props,
                                         whitespace_default=whitespace_default)
            last_num_type = num_type

        return unit

    @classmethod
    def phrase(cls, unit, language, country=None, zone=None):
        """Build the phrase for ``unit``, or a standalone phrase when it is None.

        :param zone: when given, 'units.zones.<zone>' is used as the config
            key instead of 'units.alphanumeric'.
        :return: the phrase text, or None when the relevant configuration is
            missing.
        """
        if unit is not None:
            key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)

            if not address_config.get_property(key, language, country=country):
                return None

            is_alpha = safe_decode(unit).isalpha()

            direction_unit = None
            add_direction = address_config.get_property(
                '{}.add_direction'.format(key), language, country=country)
            if add_direction:
                direction_unit = cls.add_direction(key, unit, language, country=country)

            if direction_unit and direction_unit != unit:
                unit = direction_unit
                is_alpha = False
            else:
                # Quadrants are only tried when no direction was applied
                add_quadrant = address_config.get_property(
                    '{}.add_quadrant'.format(key), language, country=country)
                if add_quadrant:
                    unit = cls.add_quadrant(key, unit, language, country=country)
                    is_alpha = False

            return cls.numeric_phrase(key, safe_decode(unit), language,
                                      dictionaries=['unit_types_numbered'],
                                      country=country, is_alpha=is_alpha)
        else:
            key = 'units.standalone'
            values, probs = address_config.alternative_probabilities(
                key, language, dictionaries=['unit_types_standalone'], country=country)
            if values is None:
                return None
            phrase, phrase_props = weighted_choice(values, probs)
            return phrase.title()
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
    """Combine ``num`` with a phrase chosen from the config under ``key``.

    :param key: config key passed to ``address_config.alternative_probabilities``.
    :param num: the number or identifier to render; may be None.
    :param language: language used for the config lookups and number rewriting.
    :param country: optional country forwarded to the config lookups.
    :param dictionaries: forwarded to ``alternative_probabilities``.
    :param strict_numeric: when true, input containing alphabetic characters
        is returned as plain text without a phrase.
    :param is_alpha: when true, the '<key>.alpha' alternatives are tried
        before falling back to ``key``.
    :return: one of
        - the decoded ``num`` alone (strict_numeric with alphabetic input, no
          phrase alternatives configured, 'null' type chosen, or no usable
          direction),
        - the phrase alone (``num`` is None, no number types configured, or
          'standalone' chosen),
        - ``num`` after rewriting, without the phrase (null_phrase_probability
          draw),
        - the phrase and number joined according to the configured direction,
        - None (``num`` is None with nothing configured, the alphabetic filter
          leaves no usable type, or an integer is outside the configured
          min/max bounds).
    """
    has_alpha = False
    has_numeric = True
    is_integer = False
    is_none = False
    if num is not None:
        try:
            int(num)
            is_integer = True
        except ValueError:
            try:
                float(num)
            except ValueError:
                # Neither an int nor a float: inspect the tokens
                tokens = tokenize(safe_decode(num))
                has_numeric = False
                for t, c in tokens:
                    if c == token_types.NUMERIC:
                        has_numeric = True
                    if any((ch.isalpha() for ch in t)):
                        has_alpha = True

                if strict_numeric and has_alpha:
                    return safe_decode(num)
    else:
        is_none = True

    values, probs = None, None

    if is_alpha:
        values, probs = address_config.alternative_probabilities(
            '{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

    # Pick a phrase given the probability distribution from the config
    if values is None:
        values, probs = address_config.alternative_probabilities(
            key, language, dictionaries=dictionaries, country=country)

    if not values:
        return safe_decode(num) if not is_none else None

    phrase, phrase_props = weighted_choice(values, probs)

    values = []
    probs = []

    # Dictionaries are lowercased, so title case here
    if phrase_props.get('title_case', True):
        phrase = phrase.title()

    # There are a few ways we can express the number itself
    #
    # 1. Alias it as some standalone word like basement (for floor "-1")
    # 2. Use the number itself, so "Floor 2"
    # 3. Append/prepend an affix e.g. 2/F for second floor
    # 4. As an ordinal expression e.g. "2nd Floor"
    have_standalone = False
    have_null = False
    for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
        prob_key = '{}_probability'.format(num_type)
        prob = phrase_props.get(prob_key)
        if prob is not None:
            if num_type == 'standalone':
                have_standalone = True
            elif num_type == 'null':
                have_null = True
            values.append(num_type)
            probs.append(prob)
        elif num_type in phrase_props:
            # A type configured without a probability takes weight 1.0 and
            # ends the scan.
            values.append(num_type)
            probs.append(1.0)
            break

    if not probs or is_none:
        return phrase

    # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
    if has_alpha:
        allowed = [(v, p) for v, p in zip(values, probs)
                   if v in ('numeric', 'null', 'standalone')]
        if not allowed:
            # Unpacking zip(*[]) would raise ValueError; treat an empty
            # candidate list like the zero-total case below.
            return None
        values, probs = zip(*allowed)
        total = float(sum(probs))
        if isclose(total, 0.0):
            return None

        probs = [p / total for p in probs]

    probs = cdf(probs)

    if len(values) < 2:
        if have_standalone:
            num_type = 'standalone'
        elif have_null:
            num_type = 'null'
        else:
            num_type = 'numeric'
    else:
        num_type = weighted_choice(values, probs)

    if num_type == 'standalone':
        return phrase
    elif num_type == 'null':
        return safe_decode(num)

    props = phrase_props[num_type]

    if is_integer:
        num_int = int(num)
        if phrase_props.get('number_abs_value', False):
            num_int = abs(num_int)
            num = num_int

        if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
            return None

        if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
            return None

        if phrase_props.get('number_subtract_abs_value'):
            num_int -= phrase_props['number_subtract_abs_value']
            num = num_int

    num = safe_decode(num)
    digits_props = props.get('digits')
    if digits_props:
        # Inherit the gender and category e.g. for ordinals
        for k in ('gender', 'category'):
            if k in props:
                digits_props[k] = props[k]
        num = Digits.rewrite(num, language, digits_props,
                             num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

    # Do we add the numeric phrase e.g. Floor No 1
    add_number_phrase = props.get('add_number_phrase', False)
    if add_number_phrase and random.random() < props['add_number_phrase_probability']:
        num = Number.phrase(num, language, country=country)

    whitespace_default = True

    if num_type == 'numeric_affix':
        phrase = props['affix']
        if props.get('upper_case', True):
            phrase = phrase.upper()
        if 'zero_pad' in props and num.isdigit():
            num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
        whitespace_default = False
    elif num_type == 'ordinal' and safe_decode(num).isdigit():
        ordinal_expression = ordinal_expressions.suffixed_number(
            num, language, gender=props.get('gender', None))
        if ordinal_expression is not None:
            num = ordinal_expression

    if 'null_phrase_probability' in props and (
            num_type == 'ordinal' or
            (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
        if random.random() < props['null_phrase_probability']:
            return num

    # A missing direction falls through to the naked-number branch below
    # instead of raising KeyError.
    direction = props.get('direction')
    whitespace = props.get('whitespace', whitespace_default)

    whitespace_probability = props.get('whitespace_probability')
    if whitespace_probability is not None:
        whitespace = random.random() < whitespace_probability

    # Occasionally switch up if direction_probability is specified
    if random.random() > props.get('direction_probability', 1.0):
        if direction == 'left':
            direction = 'right'
        elif direction == 'right':
            direction = 'left'

    whitespace_phrase = six.u(' ') if whitespace else six.u('')
    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(num)
class Block(NumberedComponent):
    """Random block identifiers and the phrases built around them."""

    max_blocks = 10

    # Candidate block numbers 1..max_blocks with a Zipfian CDF over them
    block_range = range(1, max_blocks + 1)
    block_range_probs = zipfian_distribution(len(block_range), 2.0)
    block_range_cdf = cdf(block_range_probs)

    @classmethod
    def random(cls, language, country=None):
        """Generate a random block identifier.

        The alphanumeric type comes from ``choose_alphanumeric_type`` with the
        'blocks.alphanumeric' config key.

        :return: the identifier as text, or None when no type is configured
            or the chosen type is not one of those handled here.
        """
        kind, kind_props = cls.choose_alphanumeric_type(
            'blocks.alphanumeric', language, country=country)
        if kind is None:
            return None

        if kind == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))

        # Every remaining type needs a letter
        letters = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        letters_prob = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        if letters_prob is not None and random.random() >= letters_prob:
            letters = latin_alphabet

        letter = sample_alphabet(letters, 2.0)
        if kind == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.block_range, cls.block_range_cdf)

        space_prob = float(kind_props.get('whitespace_probability', 0.0))
        if space_prob and random.random() < space_prob:
            separator = six.u(' ')
        else:
            separator = six.u('')

        if kind == cls.ALPHA_PLUS_NUMERIC:
            pieces = (letter, separator, number)
        elif kind == cls.NUMERIC_PLUS_ALPHA:
            pieces = (number, separator, letter)
        else:
            return None
        return six.u('{}{}{}').format(*pieces)

    @classmethod
    def phrase(cls, block, language, country=None):
        """Return a phrase for ``block``, or None.

        None is returned when ``block`` is None or when the draw against
        'blocks.alphanumeric_phrase_probability' fails.
        """
        if block is None:
            return None

        threshold = address_config.get_property(
            'blocks.alphanumeric_phrase_probability', language,
            country=country, default=0.0)
        if not random.random() < threshold:
            return None

        return cls.numeric_phrase('blocks.alphanumeric', block, language,
                                  dictionaries=['qualifiers'], country=country)