예제 #1
0
    def pick_phrase_and_type(cls, number, language, country=None):
        """Sample a phrase and a numbering type for ``number``.

        The alternatives configured under ``cls.key`` are looked up and one
        ``(phrase, props)`` pair is sampled. A numeric type is then sampled
        from whichever ``<type>_probability`` entries the phrase's props
        define; when none are defined, ``cls.NUMERIC`` is used.

        Returns:
            ``(None, decoded_number_or_None, None)`` when no alternatives are
            configured, otherwise ``(num_type, phrase, phrase_props[num_type])``.
        """
        alternatives, alternative_probs = address_config.alternative_probabilities(
            cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not alternatives:
            decoded = None if number is None else safe_decode(number)
            return None, decoded, None

        phrase, phrase_props = weighted_choice(alternatives, alternative_probs)

        # Keep only the numeric types that carry an explicit probability.
        type_names = []
        type_weights = []
        for candidate in (cls.NUMERIC, cls.NUMERIC_AFFIX):
            weight = phrase_props.get('{}_probability'.format(candidate), None)
            if weight is None:
                continue
            type_names.append(candidate)
            type_weights.append(weight)

        if type_weights:
            num_type = weighted_choice(type_names, cdf(type_weights))
        else:
            num_type = cls.NUMERIC

        return num_type, phrase, phrase_props[num_type]
예제 #2
0
파일: blocks.py 프로젝트: BERENZ/libpostal
    def random(cls, language, country=None):
        """Generate a random block identifier for ``language``/``country``.

        The identifier style is picked via ``cls.choose_alphanumeric_type``
        using the ``blocks.alphanumeric`` config key. Depending on the style
        the result is a number, a letter, or a letter/number combination
        optionally separated by a single space.

        Returns ``None`` when no style can be chosen or when the chosen style
        is not one of the styles handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))

        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        # When a probability is configured, fall back to the Latin alphabet
        # whenever the random draw lands at or above it.
        use_latin = alphabet_probability is not None and random.random() >= alphabet_probability
        if use_latin:
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.block_range, cls.block_range_cdf)

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        # The random draw only happens when the probability is non-zero.
        if whitespace_probability and random.random() < whitespace_probability:
            separator = six.u(' ')
        else:
            separator = six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
예제 #3
0
    def random(cls, language, country=None):
        """Generate a random staircase identifier for ``language``/``country``.

        The style is picked via ``cls.choose_alphanumeric_type`` using the
        ``staircases.alphanumeric`` config key. Supported results are a single
        number, a hyphenated pair of numbers, a letter, or a letter/number
        combination joined by nothing, a space, or a hyphen.

        Returns ``None`` when no style can be chosen or the chosen style is
        not handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'staircases.alphanumeric', language, country=country)
        if num_type is None:
            return None

        def draw_number():
            # Single sample from the class-level staircase distribution.
            return weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

        if num_type == cls.NUMERIC:
            return safe_decode(draw_number())

        if num_type == cls.HYPHENATED_NUMBER:
            first = draw_number()
            # The second value is the first plus another sampled value.
            second = first + draw_number()
            return u'{}-{}'.format(first, second)

        alphabet = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        if alphabet_probability is not None:
            if random.random() >= alphabet_probability:
                alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = draw_number()

        whitespace_probability = float(
            num_type_props.get('whitespace_probability', 0.0))
        hyphen_probability = float(
            num_type_props.get('hyphen_probability', 0.0))

        # One draw decides between space, hyphen and no separator.
        draw = random.random()
        if draw < whitespace_probability:
            separator = u' '
        elif draw < (whitespace_probability + hyphen_probability):
            separator = u'-'
        else:
            separator = u''

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
예제 #4
0
    def join(cls, phrases, language, country=None):
        """Join ``phrases`` into one string using a sampled joining phrase.

        A ``(phrase, props)`` alternative is sampled from the config under
        ``cls.key``. At most ``max_phrase_join`` trailing phrases (default 2)
        are joined with the sampled phrase; any earlier phrases are joined
        with the ``default_join`` string and placed in front.

        Raises:
            ValueError: if ``phrases`` has no ``__iter__`` attribute.
        """
        if not hasattr(phrases, '__iter__'):
            raise ValueError('Param phrases must be iterable')

        alternatives, alternative_probs = address_config.alternative_probabilities(cls.key, language, country=country)
        connector, props = weighted_choice(alternatives, alternative_probs)

        whitespace = props.get('whitespace', True)
        decoded = [safe_decode(p) for p in phrases]
        max_phrase_join = props.get('max_phrase_join', 2)

        prefix = six.u('')
        if len(decoded) > max_phrase_join:
            fallback_join = cls.DEFAULT_WHITESPACE_JOIN if whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN
            default_join = safe_decode(props.get('default_join', fallback_join))
            # The trailing empty string makes the prefix end with the
            # default join string, ready to be followed by the tail.
            prefix = default_join.join(decoded[:-max_phrase_join] + [six.u('')])

        if whitespace:
            padding = six.u(' ')
            connector = six.u('{}{}{}').format(padding, connector, padding)

        tail = connector.join(decoded[-max_phrase_join:])
        return six.u('').join([prefix, tail])
예제 #5
0
    def cldr_country_name(self, country_code, language, configs):
        """Randomly pick a rendering of the country for ``country_code``.

        With probability ``cldr_country_probability`` (read through
        ``self.get_property``) one of four forms is sampled: localized name,
        ISO 3166 name, alpha-2 code or alpha-3 code. The upper-cased
        ``country_code`` is the alpha-2 result and the fallback for the
        localized and alpha-3 forms.

        Returns ``None`` when the first random draw does not select a name.
        The ISO 3166 branch returns whatever ``country_names.iso3166_name``
        returns, with no fallback.
        """
        cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
        if not random.random() < cldr_country_prob:
            return None

        values = range(4)
        localized, iso_3166, alpha2, alpha3 = values
        weights = [
            float(self.get_property(prop_name, *configs))
            for prop_name in ('localized_name_probability',
                              'iso_3166_name_probability',
                              'iso_alpha_2_code_probability',
                              'iso_alpha_3_code_probability')
        ]
        country_type = weighted_choice(values, cdf(weights))

        upper_code = country_code.upper()
        if country_type == localized:
            return (country_names.localized_name(country_code, language)
                    or country_names.localized_name(country_code)
                    or upper_code)
        if country_type == iso_3166:
            return country_names.iso3166_name(country_code)
        if country_type == alpha3:
            return country_names.alpha3_code(country_code) or upper_code
        return upper_code
예제 #6
0
    def cldr_country_name(self, country_code, language, configs):
        """Return a randomly chosen country rendering, or ``None``.

        A first random draw against ``cldr_country_probability`` decides
        whether any name is produced. If so, the form (localized name,
        ISO 3166 name, alpha-2 code, alpha-3 code) is sampled from the four
        ``*_probability`` properties read via ``self.get_property``.
        """
        cldr_country_prob = float(
            self.get_property('cldr_country_probability', *configs))

        if not random.random() < cldr_country_prob:
            return None

        values = range(4)
        localized, iso_3166, alpha2, alpha3 = values

        localized_prob = float(
            self.get_property('localized_name_probability', *configs))
        iso_3166_prob = float(
            self.get_property('iso_3166_name_probability', *configs))
        alpha2_prob = float(
            self.get_property('iso_alpha_2_code_probability', *configs))
        alpha3_prob = float(
            self.get_property('iso_alpha_3_code_probability', *configs))

        distribution = cdf(
            [localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])
        chosen_form = weighted_choice(values, distribution)

        # The upper-cased code doubles as the alpha-2 form and as fallback.
        default_name = country_code.upper()

        if chosen_form == localized:
            in_language = country_names.localized_name(country_code, language)
            if in_language:
                return in_language
            return country_names.localized_name(country_code) or default_name

        if chosen_form == iso_3166:
            # No fallback here: the lookup result is returned as-is.
            return country_names.iso3166_name(country_code)

        if chosen_form == alpha3:
            return country_names.alpha3_code(country_code) or default_name

        return default_name
예제 #7
0
파일: phrases.py 프로젝트: BERENZ/libpostal
    def add_country_code(cls, postal_code, country):
        """Optionally prefix ``postal_code`` with a country-code phrase.

        The stripped postal code is returned unchanged when the
        ``add_country_code`` property is falsy for ``country``, when the
        random draw does not select a prefix, when the code is empty, or when
        its first character is not a digit. Otherwise the result is
        ``<country code phrase><separator><postal code>`` where the separator
        is either empty or a hyphen.
        """
        postal_code = postal_code.strip()
        if not postal_codes_config.get_property('add_country_code', country=country):
            return postal_code

        # NOTE(review): the key is spelled 'country_code_probablity' here;
        # confirm it matches the spelling used in the postal codes config.
        cc_probability = postal_codes_config.get_property('country_code_probablity', country=country, default=0.0)
        not_selected = random.random() >= cc_probability
        if not_selected or not postal_code or not postal_code[0].isdigit():
            return postal_code

        country_code_phrases = postal_codes_config.get_property('country_code_phrase', country=country, default=None)
        if country_code_phrases is not None:
            alternates, probs = alternative_probabilities(country_code_phrases)
            country_code_phrase = weighted_choice(alternates, cdf(probs))
        else:
            country_code_phrase = country.upper()

        cc_hyphen_probability = postal_codes_config.get_property('country_code_hyphen_probability', country=country, default=0.0)
        separator = u'-' if random.random() < cc_hyphen_probability else u''

        return u'{}{}{}'.format(country_code_phrase, separator, postal_code)
예제 #8
0
 def for_floor(cls, floor_number, num_digits=None):
     """Build a unit identifier made of a floor number and a padded unit.

     A unit is sampled from ``cls.positive_units_floors`` and zero-filled to
     ``num_digits`` characters; when ``num_digits`` is ``None`` the width
     comes from ``cls.sample_num_digits()``.
     """
     if num_digits is None:
         num_digits = cls.sample_num_digits()
     unit = weighted_choice(cls.positive_units_floors,
                            cls.positive_units_floors_cdf)
     padded_unit = safe_decode(unit).zfill(num_digits)
     return six.u('{}{}').format(floor_number, padded_unit)
예제 #9
0
    def join(cls, phrases, language, country=None):
        """Combine ``phrases`` into a single string.

        The joining phrase and its properties are sampled from the
        alternatives configured under ``cls.key``. Only the last
        ``max_phrase_join`` phrases (default 2) use the sampled phrase; the
        phrases before them are chained with ``default_join``.

        Raises:
            ValueError: if ``phrases`` has no ``__iter__`` attribute.
        """
        if not hasattr(phrases, '__iter__'):
            raise ValueError('Param phrases must be iterable')

        options, option_probs = address_config.alternative_probabilities(
            cls.key, language, country=country)
        joiner, props = weighted_choice(options, option_probs)

        uses_whitespace = props.get('whitespace', True)
        items = [safe_decode(p) for p in phrases]
        max_phrase_join = props.get('max_phrase_join', 2)

        if len(items) > max_phrase_join:
            if uses_whitespace:
                fallback = cls.DEFAULT_WHITESPACE_JOIN
            else:
                fallback = cls.DEFAULT_NON_WHITESPACE_JOIN
            default_join = safe_decode(props.get('default_join', fallback))
            # Appending an empty item leaves a trailing join string on the
            # prefix so it connects to the tail below.
            leading = items[:-max_phrase_join] + [six.u('')]
            prefix = default_join.join(leading)
        else:
            prefix = six.u('')

        if uses_whitespace:
            space = six.u(' ')
            joiner = six.u('{}{}{}').format(space, joiner, space)

        return six.u('').join([prefix, joiner.join(items[-max_phrase_join:])])
예제 #10
0
 def phrase(cls, language, country=None):
     """Sample an intersection phrase for ``language``/``country``.

     Alternatives are read from the ``cross_streets.intersection`` config
     key. Returns ``None`` when there are no alternatives.
     """
     alternatives, alternative_probs = address_config.alternative_probabilities(
         'cross_streets.intersection', language, country=country)
     if alternatives:
         # Each alternative is a (phrase, props) pair; only the phrase is used.
         intersection_phrase, _ = weighted_choice(alternatives,
                                                  alternative_probs)
         return intersection_phrase
     return None
예제 #11
0
    def choose_alphanumeric_type(cls, key, language, country=None):
        """Sample an alphanumeric type from the config found under ``key``.

        Every type whose ``<type>_probability`` entry is present (not
        ``None``) in the config is a candidate. One is sampled, and the
        config's sub-entry for that type is returned alongside it.

        Returns:
            ``(num_type, num_type_props)``, or ``(None, None)`` when the
            config is missing or defines no probabilities. ``num_type_props``
            defaults to an empty dict.
        """
        alphanumeric_props = address_config.get_property(
            key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        supported_types = (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC,
                           cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER,
                           cls.ROMAN_NUMERAL)
        looked_up = [
            (candidate,
             alphanumeric_props.get('{}_probability'.format(candidate)))
            for candidate in supported_types
        ]
        weighted = [(candidate, weight) for candidate, weight in looked_up
                    if weight is not None]
        if not weighted:
            return None, None

        candidates = [candidate for candidate, _ in weighted]
        distribution = cdf([weight for _, weight in weighted])
        num_type = weighted_choice(candidates, distribution)

        return num_type, alphanumeric_props.get(num_type, {})
예제 #12
0
파일: units.py 프로젝트: BERENZ/libpostal
    def phrase(cls, unit, language, country=None, zone=None):
        """Build a unit phrase for ``unit``, or a standalone unit phrase.

        When ``unit`` is given, the config key is ``units.alphanumeric`` (or
        ``units.zones.<zone>`` when ``zone`` is set). The unit may be replaced
        by the result of ``cls.add_direction`` or ``cls.add_quadrant`` before
        being handed to ``cls.numeric_phrase``. When ``unit`` is ``None``, a
        phrase is sampled from ``units.standalone`` and returned title-cased.

        Returns ``None`` when the config for the key is falsy, or when there
        are no standalone alternatives.
        """
        if unit is not None:
            key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)

            if not address_config.get_property(key, language, country=country):
                return None

            is_alpha = safe_decode(unit).isalpha()

            direction_unit = None
            add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
            if add_direction:
                direction_unit = cls.add_direction(key, unit, language, country=country)

            if direction_unit and direction_unit != unit:
                # A direction was actually attached, so the unit is no longer
                # treated as purely alphabetic.
                unit = direction_unit
                is_alpha = False
            else:
                add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
                if add_quadrant:
                    unit = cls.add_quadrant(key, unit, language, country=country)
                    is_alpha = False

            return cls.numeric_phrase(key, safe_decode(unit), language,
                                      dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
        else:
            key = 'units.standalone'
            values, probs = address_config.alternative_probabilities(key, language,
                                                                     dictionaries=['unit_types_standalone'],
                                                                     country=country)
            # Treat an empty alternatives list like a missing one so that
            # weighted_choice is never called without candidates.
            if not values:
                return None
            phrase, _ = weighted_choice(values, probs)
            return phrase.title()
예제 #13
0
파일: names.py 프로젝트: BERENZ/libpostal
    def name(self, country, language, component, name):
        """Randomly rewrite ``name`` using regex replacements and affixes.

        First, every regex replacement registered for ``country`` and for the
        ``None`` key is tried in order; a matching regex replaces ``name``
        with the given match group when a random draw falls below its
        probability. Then a prefix and, failing that, a suffix configured for
        ``(language, component)`` may be attached, unless the corresponding
        regex already matches the name.

        Returns the possibly modified name.
        """
        replacements = self.country_regex_replacements
        all_replacements = replacements.get(country, []) + replacements.get(None, [])

        lookup_key = (language, component)
        prefixes, prefix_probs = self.prefixes.get(lookup_key, (None, None))
        suffixes, suffix_probs = self.suffixes.get(lookup_key, (None, None))

        if not (all_replacements or prefixes or suffixes):
            return name

        for pattern, group, prob in all_replacements:
            found = pattern.match(name)
            if found and random.random() < prob:
                name = found.group(group)

        affix_specs = (
            (prefixes, prefix_probs, self.prefix_regexes, 'prefix', True),
            (suffixes, suffix_probs, self.suffix_regexes, 'suffix', False),
        )
        for affixes, affix_probs, regexes, affix_key, is_prefix in affix_specs:
            if affixes is None:
                continue
            # Skip this affix kind when its regex already matches the name.
            if regexes[lookup_key].match(name):
                continue

            affix = weighted_choice(affixes, affix_probs)
            if affix is None:
                continue

            space_val = six.u(' ') if affix.get('whitespace', True) else six.u('')
            affix_text = affix[affix_key]
            if is_prefix:
                return six.u('{}{}{}').format(affix_text, space_val, safe_decode(name))
            return six.u('{}{}{}').format(safe_decode(name), space_val, affix_text)

        return name
예제 #14
0
파일: floors.py 프로젝트: BERENZ/libpostal
    def random_int(cls, language, country=None, num_floors=None, num_basements=None):
        """Sample a floor number.

        Without ``num_floors`` (or when it cannot be converted to ``int``) the
        number is drawn from ``cls.numbered_floors``. Otherwise it comes from
        ``cls.sample_floors`` when ``num_floors`` is at most
        ``cls.max_floors``, and from ``cls.sample_floors_range`` above that.
        ``language`` and ``country`` are not used in this method.
        """
        if num_floors is None:
            return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

        try:
            num_floors = int(num_floors)
        except (ValueError, TypeError):
            # Unusable floor count: fall back to the default distribution.
            return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

        if num_floors <= cls.max_floors:
            return cls.sample_floors(num_floors, num_basements=num_basements or 0)
        return cls.sample_floors_range(cls.max_floors + 1, num_floors)
예제 #15
0
    def random(cls, language, country=None, cardinal_proability=0.5):
        """Return a random cardinal or relative direction phrase.

        ``cardinal_proability`` is used as the first entry of the cumulative
        distribution, so it is the chance of choosing a cardinal direction;
        otherwise a relative direction phrase is produced.
        """
        direction_kinds = [cls.CARDINAL, cls.RELATIVE]
        kind = weighted_choice(direction_kinds, [cardinal_proability, 1.0])

        phrase_source = CardinalDirection if kind == cls.CARDINAL else RelativeDirection
        return phrase_source.phrase(None, language, country=country)
예제 #16
0
    def random(cls, language, country=None):
        """Produce a random block identifier.

        The style comes from ``cls.choose_alphanumeric_type`` with the
        ``blocks.alphanumeric`` key. The result is a number, a letter, or a
        letter and number joined with an optional single space. ``None`` is
        returned when no style is available or the style is not handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'blocks.alphanumeric', language, country=country)
        if num_type is None:
            return None

        def draw_block_number():
            # Single sample from the class-level block distribution.
            return weighted_choice(cls.block_range, cls.block_range_cdf)

        if num_type == cls.NUMERIC:
            return safe_decode(draw_block_number())

        alphabet = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        if alphabet_probability is not None:
            # A draw at or above the probability switches to Latin letters.
            if random.random() >= alphabet_probability:
                alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = draw_block_number()

        whitespace_probability = float(
            num_type_props.get('whitespace_probability', 0.0))
        add_space = bool(whitespace_probability
                         and random.random() < whitespace_probability)
        separator = six.u(' ') if add_space else six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
예제 #17
0
    def random_from_int(cls, number, language, country=None):
        """Render the level ``number`` in a randomly chosen style.

        The style is picked via ``cls.choose_alphanumeric_type`` with the
        ``levels.alphanumeric`` key. Non-negative numbers are first shifted by
        the ``levels.numbering_starts_at`` property (default 0).

        Returns the rendered level, or ``None`` when no style can be chosen or
        the chosen style is not handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'levels.alphanumeric', language, country=country)
        if num_type is None:
            return None

        numbering_starts_at = int(
            address_config.get_property('levels.numbering_starts_at',
                                        language,
                                        country=country,
                                        default=0))

        if number >= 0:
            number += numbering_starts_at

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.ROMAN_NUMERAL:
            roman_numeral = numeric_expressions.roman_numeral(number)
            if roman_numeral is not None:
                return roman_numeral
            else:
                # No roman numeral available: fall back to plain digits.
                return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            # sample_floors_range is reached through cls, matching how
            # random_int calls it; the bare name is not defined in this scope.
            number2 = number + cls.sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, number2)
        else:
            alphabet = address_config.get_property('alphabet',
                                                   language,
                                                   country=country,
                                                   default=latin_alphabet)
            alphabet_probability = address_config.get_property(
                'alphabet_probability',
                language,
                country=country,
                default=None)
            if alphabet_probability is not None and random.random(
            ) >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return letter
            else:
                # NOTE(review): the incoming number is replaced by a sample
                # from cls.floors_letters here - confirm this is intended.
                number = weighted_choice(cls.floors_letters,
                                         cls.floors_letters_cdf)

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}').format(letter, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}').format(number, letter)

        return None
예제 #18
0
    def phrase(cls, unit, language, country=None, zone=None):
        """Build a unit phrase for ``unit``, or a standalone unit phrase.

        With a ``unit``, the config key is ``units.alphanumeric`` or, when
        ``zone`` is given, ``units.zones.<zone>``. The unit may be swapped for
        the result of ``cls.add_direction`` or ``cls.add_quadrant`` before
        ``cls.numeric_phrase`` is called. Without a ``unit``, a phrase is
        sampled from ``units.standalone`` and returned title-cased.

        Returns ``None`` when the config for the key is falsy, or when there
        are no standalone alternatives.
        """
        if unit is not None:
            key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(
                zone)

            if not address_config.get_property(key, language, country=country):
                return None

            is_alpha = safe_decode(unit).isalpha()

            direction_unit = None
            add_direction = address_config.get_property(
                '{}.add_direction'.format(key), language, country=country)
            if add_direction:
                direction_unit = cls.add_direction(key,
                                                   unit,
                                                   language,
                                                   country=country)

            if direction_unit and direction_unit != unit:
                # A direction was attached, so the unit is no longer treated
                # as purely alphabetic.
                unit = direction_unit
                is_alpha = False
            else:
                add_quadrant = address_config.get_property(
                    '{}.add_quadrant'.format(key), language, country=country)
                if add_quadrant:
                    unit = cls.add_quadrant(key,
                                            unit,
                                            language,
                                            country=country)
                    is_alpha = False

            return cls.numeric_phrase(key,
                                      safe_decode(unit),
                                      language,
                                      dictionaries=['unit_types_numbered'],
                                      country=country,
                                      is_alpha=is_alpha)
        else:
            key = 'units.standalone'
            values, probs = address_config.alternative_probabilities(
                key,
                language,
                dictionaries=['unit_types_standalone'],
                country=country)
            # An empty alternatives list is handled like a missing one so
            # weighted_choice is never called without candidates.
            if not values:
                return None
            phrase, _ = weighted_choice(values, probs)
            return phrase.title()
예제 #19
0
    def random_int(cls,
                   language,
                   country=None,
                   num_floors=None,
                   num_basements=None):
        """Pick a floor number, optionally bounded by a building's height.

        ``num_floors`` is coerced with ``int``. If it is ``None`` or cannot be
        coerced, the number is drawn from ``cls.numbered_floors``. Buildings
        with at most ``cls.max_floors`` floors use ``cls.sample_floors``;
        taller ones use ``cls.sample_floors_range``.
        """
        def default_sample():
            return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)

        if num_floors is None:
            return default_sample()

        try:
            num_floors = int(num_floors)
        except (ValueError, TypeError):
            return default_sample()

        if num_floors <= cls.max_floors:
            basements = num_basements or 0
            number = cls.sample_floors(num_floors, num_basements=basements)
        else:
            lowest = cls.max_floors + 1
            number = cls.sample_floors_range(lowest, num_floors)
        return number
예제 #20
0
    def pick_phrase_and_type(cls, number, language, country=None):
        """Pick a phrase and the numeric type to render ``number`` with.

        One ``(phrase, props)`` alternative is sampled from the config under
        ``cls.key``. The numeric type is sampled among ``cls.NUMERIC`` and
        ``cls.NUMERIC_AFFIX`` using the ``<type>_probability`` values found in
        the props, defaulting to ``cls.NUMERIC`` when none are present.

        Returns:
            ``(num_type, phrase, phrase_props[num_type])``, or
            ``(None, decoded_number_or_None, None)`` without alternatives.
        """
        options, option_probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not options:
            if number is not None:
                return None, safe_decode(number), None
            return None, None, None

        phrase, phrase_props = weighted_choice(options, option_probs)

        looked_up = [(kind, phrase_props.get('{}_probability'.format(kind), None))
                     for kind in (cls.NUMERIC, cls.NUMERIC_AFFIX)]
        weighted_kinds = [(kind, weight) for kind, weight in looked_up if weight is not None]

        num_type = cls.NUMERIC
        if weighted_kinds:
            kinds = [kind for kind, _ in weighted_kinds]
            weights = [weight for _, weight in weighted_kinds]
            num_type = weighted_choice(kinds, cdf(weights))

        return num_type, phrase, phrase_props[num_type]
예제 #21
0
    def random(cls, language, country=None):
        """Produce a random staircase identifier.

        The style comes from ``cls.choose_alphanumeric_type`` with the
        ``staircases.alphanumeric`` key. Numeric and hyphenated styles use
        samples from ``cls.staircase_range``; the remaining styles use a
        sampled letter, optionally combined with a number through an empty,
        space or hyphen separator. Returns ``None`` when no style is
        available or the style is not handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.staircase_range, cls.staircase_range_cdf))

        if num_type == cls.HYPHENATED_NUMBER:
            low = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
            # The upper value is the lower one plus a second sample.
            high = low + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
            return u'{}-{}'.format(low, high)

        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        switch_to_latin = alphabet_probability is not None and random.random() >= alphabet_probability
        if switch_to_latin:
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))

        # A single draw selects space, hyphen, or no separator at all.
        separator = u''
        draw = random.random()
        if draw < whitespace_probability:
            separator = u' '
        elif draw < (whitespace_probability + hyphen_probability):
            separator = u'-'

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            parts = (letter, separator, number)
        elif num_type == cls.NUMERIC_PLUS_ALPHA:
            parts = (number, separator, letter)
        else:
            return None
        return six.u('{}{}{}').format(*parts)
예제 #22
0
    def random(cls, language, country=None):
        """Sample a preposition phrase type for a category query.

        Candidate types are those with a ``<type>_probability`` entry in the
        ``categories`` config for ``language``/``country``.

        Returns the sampled type, or ``None`` when the ``categories`` config
        is missing or defines no probability for any type.
        """
        category_props = address_config.get_property('categories', language, country=country)
        if category_props is None:
            return None

        values = []
        probs = []

        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
            k = '{}_probability'.format(prep_phrase_type)
            prob = category_props.get(k, None)
            if prob is not None:
                values.append(prep_phrase_type)
                probs.append(prob)

        # With no configured probabilities there is nothing to sample from;
        # report "no preposition" instead of sampling from empty lists.
        if not values:
            return None

        probs = cdf(probs)

        return weighted_choice(values, probs)
예제 #23
0
    def random(cls, language, country=None):
        """Generate a random PO box identifier.

        The style comes from ``cls.choose_alphanumeric_type`` with the
        ``po_boxes.alphanumeric`` key. The alpha style returns a random
        letter. Every other style first builds a number whose digit count is
        sampled from the ``po_boxes.digits`` config and which is passed
        through ``Digits.rewrite``; the number is then returned on its own or
        combined with a letter and an optional single space.

        Returns ``None`` when no style can be chosen or the style is not
        handled here.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'po_boxes.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.ALPHA:
            return cls.random_letter(language, country=country)

        digit_config = address_config.get_property(
            'po_boxes.digits', language, country=country, default=[])
        length_options = [(entry['length'], entry['probability'])
                          for entry in digit_config]
        lengths = [length for length, _ in length_options]
        length_probs = cdf([prob for _, prob in length_options])

        num_digits = weighted_choice(lengths, length_probs)
        digits = cls.random_digits(num_digits)
        number = Digits.rewrite(digits, language, num_type_props)

        if num_type == cls.NUMERIC:
            return safe_decode(number)

        letter = cls.random_letter(language, country=country)

        whitespace_probability = float(
            num_type_props.get('whitespace_probability', 0.0))
        # The random draw only happens when the probability is non-zero.
        if whitespace_probability and random.random() < whitespace_probability:
            separator = six.u(' ')
        else:
            separator = six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
예제 #24
0
    def phrase(cls, language, key, value, is_plural=False, country=None):
        """Build a ``CategoryQuery`` for the category ``key``/``value``.

        Returns ``NULL_CATEGORY_QUERY`` when ``category_config`` has no phrase
        for the category. Otherwise a preposition type is sampled; when there
        is none (or no phrase is configured for it) the query carries no
        preposition and allows both a place name and an address. With a
        preposition, the place name is dropped for the "nearby" and "near me"
        types, and the address is dropped for those and for the "in" type.
        """
        category_phrase = category_config.get_phrase(
            language, key, value, is_plural=is_plural)
        if not category_phrase:
            return NULL_CATEGORY_QUERY

        category_phrase = safe_decode(category_phrase)

        def without_preposition():
            return CategoryQuery(category_phrase,
                                 prep=None,
                                 add_place_name=True,
                                 add_address=True)

        prep_phrase_type = CategoryPreposition.random(language,
                                                      country=country)
        if prep_phrase_type in (None, CategoryPreposition.NULL):
            return without_preposition()

        config_key = 'categories.{}'.format(prep_phrase_type)
        alternatives, alternative_probs = address_config.alternative_probabilities(
            config_key, language, country=country)
        if not alternatives:
            return without_preposition()

        prep_phrase, _ = weighted_choice(alternatives, alternative_probs)
        prep_phrase = safe_decode(prep_phrase)

        excludes_address = prep_phrase_type in (CategoryPreposition.NEARBY,
                                                CategoryPreposition.NEAR_ME,
                                                CategoryPreposition.IN)
        excludes_place_name = prep_phrase_type in (CategoryPreposition.NEARBY,
                                                   CategoryPreposition.NEAR_ME)

        return CategoryQuery(category_phrase,
                             prep=prep_phrase,
                             add_place_name=not excludes_place_name,
                             add_address=not excludes_address)
예제 #25
0
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Randomly re-render the number d in one of the configured digit styles.

        props may carry '<digit_type>_probability' entries for the SPELLOUT,
        UNICODE_FULL_WIDTH and ROMAN_NUMERAL styles. When those weights do not
        sum to 1.0, the remainder is given to ASCII, which returns the decoded
        value unchanged. Empty or missing props return d untouched.

        For Roman numerals an ordinal suffix may be appended, with probability
        props['ordinal_suffix_probability'] (default 0.0).
        '''
        if not props:
            return d

        d = safe_decode(d)

        styles = (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL)
        configured = [(style, '{}_probability'.format(style))
                      for style in styles]
        candidates = [style for style, prob_key in configured
                      if prob_key in props]
        weights = [props[prob_key] for _, prob_key in configured
                   if prob_key in props]

        # Whatever probability mass is left over means "keep plain digits"
        explicit_mass = sum(weights)
        if not isclose(explicit_mass, 1.0):
            candidates.append(cls.ASCII)
            weights.append(1.0 - explicit_mass)

        digit_type = weighted_choice(candidates, cdf(weights))

        if digit_type == cls.ASCII:
            return d

        if digit_type == cls.SPELLOUT:
            return cls.rewrite_spellout(d, lang, num_type, props)

        if digit_type == cls.ROMAN_NUMERAL:
            rendered = cls.rewrite_roman_numeral(d)
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                suffix = ordinal_expressions.get_suffix(
                    d, lang, gender=props.get('gender', None))
                if suffix:
                    rendered = six.u('{}{}').format(rendered, suffix)
            return rendered

        if digit_type == cls.UNICODE_FULL_WIDTH:
            return cls.rewrite_full_width(d)

        return d
예제 #26
0
파일: query.py 프로젝트: BERENZ/libpostal
    def phrase(cls, chain, language, country=None):
        '''
        Build a ChainQuery for the given chain name.

        Returns NULL_CHAIN_QUERY for an empty chain. Otherwise a preposition
        type is drawn with CategoryPreposition.random; if it is usable and has
        alternatives configured under 'categories.<type>', one is sampled and
        attached as the query's prep.

        Without a preposition both add_place_name and add_address are True.
        With one, add_place_name is False for NEARBY and NEAR_ME, and
        add_address is False for NEARBY, NEAR_ME and IN.
        '''
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        prep_type = CategoryPreposition.random(language, country=country)

        # Preposition phrases are only looked up for a real preposition type.
        candidates, weights = None, None
        if prep_type not in (None, CategoryPreposition.NULL):
            config_key = 'categories.{}'.format(prep_type)
            candidates, weights = address_config.alternative_probabilities(config_key, language, country=country)

        if not candidates:
            return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

        chosen_prep, _ = weighted_choice(candidates, weights)
        chosen_prep = safe_decode(chosen_prep)

        proximity_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        no_address_types = proximity_types + (CategoryPreposition.IN,)

        return ChainQuery(chain_phrase,
                          prep=chosen_prep,
                          add_place_name=prep_type not in proximity_types,
                          add_address=prep_type not in no_address_types)
예제 #27
0
    def name(self, country, language, component, name):
        '''
        Apply configured regex replacements and an optional prefix/suffix to
        a component name.

        Replacements registered for the country and under the None key are
        tried in order; each one that matches is applied with its own
        probability. Then the prefix and, failing that, the suffix
        configured for (language, component) is considered: it is skipped if
        the name already matches the corresponding regex, otherwise an affix
        is sampled and the combined string is returned immediately.

        The name is returned unchanged when nothing is configured, and as
        transformed by the replacements when no affix ends up being added.
        '''
        replacements = (self.country_regex_replacements.get(country, []) +
                        self.country_regex_replacements.get(None, []))

        lookup = (language, component)
        prefixes, prefix_probs = self.prefixes.get(lookup, (None, None))
        suffixes, suffix_probs = self.suffixes.get(lookup, (None, None))

        if not (replacements or prefixes or suffixes):
            return name

        for pattern, group, prob in replacements:
            found = pattern.match(name)
            if found and random.random() < prob:
                name = found.group(group)

        affix_sources = (
            ('prefix', prefixes, prefix_probs, self.prefix_regexes),
            ('suffix', suffixes, suffix_probs, self.suffix_regexes),
        )

        for kind, options, options_cdf, regexes in affix_sources:
            if options is None:
                continue

            # Leave the name alone if it already matches the affix regex
            if regexes[language, component].match(name):
                continue

            picked = weighted_choice(options, options_cdf)
            if picked is None:
                continue

            separator = six.u(' ') if picked.get('whitespace', True) else six.u('')
            affix_text = picked[kind]
            decoded = safe_decode(name)

            if kind == 'prefix':
                return six.u('{}{}{}').format(affix_text, separator, decoded)
            return six.u('{}{}{}').format(decoded, separator, affix_text)

        return name
예제 #28
0
파일: floors.py 프로젝트: BERENZ/libpostal
    def random_from_int(cls, number, language, country=None):
        '''
        Render an integer level number in a randomly chosen alphanumeric style.

        The style comes from cls.choose_alphanumeric_type with the
        'levels.alphanumeric' key; None is returned when no style is chosen.
        Non-negative numbers are first shifted by the
        'levels.numbering_starts_at' config property (default 0).

        Depending on the style the result is the plain number, a Roman
        numeral (falling back to the plain number when none is produced), a
        hyphenated "number-number2" range, a single letter, or a
        letter/number combination. None is returned for any other style.
        '''
        num_type, _ = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
        if num_type is None:
            return None

        first_level = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
        if number >= 0:
            number += first_level

        if num_type == cls.NUMERIC:
            return safe_decode(number)

        if num_type == cls.ROMAN_NUMERAL:
            roman = numeric_expressions.roman_numeral(number)
            return roman if roman is not None else safe_decode(number)

        if num_type == cls.HYPHENATED_NUMBER:
            upper = number + sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, upper)

        # Remaining styles all involve a letter
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        # Fall back to the Latin alphabet when the configured probability is not met
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet)
        if num_type == cls.ALPHA:
            return letter

        # For mixed styles the numeric part is re-sampled rather than taken from the input
        number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}').format(letter, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}').format(number, letter)

        return None
예제 #29
0
    def phrase(cls, chain, language, country=None):
        '''
        Build a ChainQuery for the given chain name.

        Returns NULL_CHAIN_QUERY for an empty chain. Otherwise a preposition
        type is drawn with CategoryPreposition.random; if it is usable and has
        alternatives configured under 'categories.<type>', one is sampled and
        attached as the query's prep.

        Without a preposition both add_place_name and add_address are True.
        With one, add_place_name is False for NEARBY and NEAR_ME, and
        add_address is False for NEARBY, NEAR_ME and IN.
        '''
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        prep_type = CategoryPreposition.random(language, country=country)

        # Preposition phrases are only looked up for a real preposition type.
        candidates, weights = None, None
        if prep_type not in (None, CategoryPreposition.NULL):
            candidates, weights = address_config.alternative_probabilities(
                'categories.{}'.format(prep_type),
                language,
                country=country)

        if not candidates:
            # No preposition drawn, or nothing configured for it
            return ChainQuery(chain_phrase,
                              prep=None,
                              add_place_name=True,
                              add_address=True)

        chosen_prep, _ = weighted_choice(candidates, weights)
        chosen_prep = safe_decode(chosen_prep)

        proximity_types = (CategoryPreposition.NEARBY,
                           CategoryPreposition.NEAR_ME)
        no_address_types = proximity_types + (CategoryPreposition.IN,)

        return ChainQuery(chain_phrase,
                          prep=chosen_prep,
                          add_place_name=prep_type not in proximity_types,
                          add_address=prep_type not in no_address_types)
예제 #30
0
파일: query.py 프로젝트: BERENZ/libpostal
    def phrase(cls, language, key, value, is_plural=False, country=None):
        '''
        Build a CategoryQuery for the category identified by (key, value).

        Returns NULL_CATEGORY_QUERY when category_config has no phrase for the
        category. Otherwise a preposition type is drawn with
        CategoryPreposition.random; if that type is usable and has
        alternatives configured under 'categories.<type>', one of them is
        sampled and attached to the query as its prep.

        Without a preposition both add_place_name and add_address are True.
        With one, add_place_name is False for NEARBY and NEAR_ME, and
        add_address is False for NEARBY, NEAR_ME and IN.
        '''
        raw_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
        if not raw_phrase:
            return NULL_CATEGORY_QUERY

        category_phrase = safe_decode(raw_phrase)
        prep_type = CategoryPreposition.random(language, country=country)

        # Preposition phrases are only looked up for a real preposition type.
        candidates, weights = None, None
        if prep_type not in (None, CategoryPreposition.NULL):
            config_key = 'categories.{}'.format(prep_type)
            candidates, weights = address_config.alternative_probabilities(config_key, language, country=country)

        if not candidates:
            return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)

        chosen_prep, _ = weighted_choice(candidates, weights)
        chosen_prep = safe_decode(chosen_prep)

        proximity_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        no_address_types = proximity_types + (CategoryPreposition.IN,)

        return CategoryQuery(category_phrase,
                             prep=chosen_prep,
                             add_place_name=prep_type not in proximity_types,
                             add_address=prep_type not in no_address_types)
예제 #31
0
    def choose_alphanumeric_type(cls, key, language, country=None):
        '''
        Sample an alphanumeric style from the config section stored under key.

        Each style (NUMERIC, ALPHA, ALPHA_PLUS_NUMERIC, NUMERIC_PLUS_ALPHA,
        HYPHENATED_NUMBER, ROMAN_NUMERAL) takes part only if the section has
        a '<style>_probability' entry for it.

        Returns a (num_type, num_type_props) pair, where num_type_props is
        the section's entry for the chosen style ({} if absent). Returns
        (None, None) when the section is missing or lists no probabilities.
        '''
        alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
        if alphanumeric_props is None:
            return None, None

        candidate_types = (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC,
                           cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER,
                           cls.ROMAN_NUMERAL)

        configured = []
        for candidate in candidate_types:
            weight = alphanumeric_props.get('{}_probability'.format(candidate))
            if weight is not None:
                configured.append((candidate, weight))

        if not configured:
            return None, None

        values = [candidate for candidate, _ in configured]
        probs = [weight for _, weight in configured]

        num_type = weighted_choice(values, cdf(probs))
        return num_type, alphanumeric_props.get(num_type, {})
예제 #32
0
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Randomly re-render the number d in one of the configured digit styles.

        props may carry '<digit_type>_probability' entries for the SPELLOUT,
        UNICODE_FULL_WIDTH and ROMAN_NUMERAL styles. When those weights do not
        sum to 1.0, the remainder is given to ASCII, which returns the decoded
        value unchanged. Empty or missing props return d untouched.

        For Roman numerals an ordinal suffix may be appended, with probability
        props['ordinal_suffix_probability'] (default 0.0).
        '''
        if not props:
            return d

        d = safe_decode(d)

        styles = (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL)
        configured = [(style, '{}_probability'.format(style)) for style in styles]
        candidates = [style for style, prob_key in configured if prob_key in props]
        weights = [props[prob_key] for _, prob_key in configured if prob_key in props]

        # Whatever probability mass is left over means "keep plain digits"
        explicit_mass = sum(weights)
        if not isclose(explicit_mass, 1.0):
            candidates.append(cls.ASCII)
            weights.append(1.0 - explicit_mass)

        digit_type = weighted_choice(candidates, cdf(weights))

        if digit_type == cls.ASCII:
            return d

        if digit_type == cls.SPELLOUT:
            return cls.rewrite_spellout(d, lang, num_type, props)

        if digit_type == cls.ROMAN_NUMERAL:
            rendered = cls.rewrite_roman_numeral(d)
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
                if suffix:
                    rendered = six.u('{}{}').format(rendered, suffix)
            return rendered

        if digit_type == cls.UNICODE_FULL_WIDTH:
            return cls.rewrite_full_width(d)

        return d
예제 #33
0
def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the beginning
    of the alphabet are much more likely to be selected. Letters toward the end can
    still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in order
    of frequency.

    :param alphabet: iterable of items to sample from, most frequent first
    :param b: value passed through to zipfian_distribution for this alphabet
    :return: one item of the alphabet

    The computed CDF is memoized in the module-level ``alphabets`` dict.
    '''
    alphabet = tuple(alphabet)

    # The distribution is built from both the alphabet and b, so both belong
    # in the cache key. Keying on the alphabet alone would make every later
    # call reuse the CDF of whichever b happened to be seen first.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        alphabets[cache_key] = cdf(zipfian_distribution(len(alphabet), b))

    return weighted_choice(alphabet, alphabets[cache_key])
예제 #34
0
def sample_alphabet(alphabet, b=1.5):
    '''
    Sample an "alphabet" using a Zipfian distribution (frequent items are very
    frequent, long tail of infrequent items). If we look at something like
    unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
    "Unit Z" simply because most dwellings only have a few units. Sampling
    letters from a Zipfian distribution rather than uniformly means that instead
    of every letter having the same likelihood (1/26), letters toward the beginning
    of the alphabet are much more likely to be selected. Letters toward the end can
    still be selected sometimes, but are not very likely.

    Note letters don't necessarily need to be sorted alphabetically, just in order
    of frequency.

    :param alphabet: iterable of items to sample from, most frequent first
    :param b: value passed through to zipfian_distribution for this alphabet
    :return: one item of the alphabet

    The computed CDF is memoized in the module-level ``alphabets`` dict.
    '''
    alphabet = tuple(alphabet)

    # The distribution is built from both the alphabet and b, so both belong
    # in the cache key. Keying on the alphabet alone would make every later
    # call reuse the CDF of whichever b happened to be seen first.
    cache_key = (alphabet, b)
    if cache_key not in alphabets:
        alphabets[cache_key] = cdf(zipfian_distribution(len(alphabet), b))

    return weighted_choice(alphabet, alphabets[cache_key])
예제 #35
0
    def random(cls, language, country=None):
        '''
        Generate a random PO box identifier.

        The style comes from cls.choose_alphanumeric_type with the
        'po_boxes.alphanumeric' key; None is returned when no style is chosen.
        The ALPHA style yields a single random letter. Every other style
        first samples a digit count from the 'po_boxes.digits' config
        (entries with 'length' and 'probability'), generates that many random
        digits and passes them through Digits.rewrite.

        NUMERIC returns the number alone. ALPHA_PLUS_NUMERIC and
        NUMERIC_PLUS_ALPHA combine it with a random letter, separated by a
        space with probability 'whitespace_probability' (default 0.0). Any
        other style returns None.
        '''
        num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.ALPHA:
            return cls.random_letter(language, country=country)

        digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
        length_weights = [(entry['length'], entry['probability']) for entry in digit_config]
        lengths = [length for length, _ in length_weights]
        weights = [weight for _, weight in length_weights]

        num_digits = weighted_choice(lengths, cdf(weights))
        number = Digits.rewrite(cls.random_digits(num_digits), language, num_type_props)

        if num_type == cls.NUMERIC:
            return safe_decode(number)

        letter = cls.random_letter(language, country=country)

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        # A zero probability short-circuits so no random draw is consumed
        use_space = whitespace_probability and random.random() < whitespace_probability
        separator = six.u(' ') if use_space else six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)

        return None
예제 #36
0
파일: names.py 프로젝트: BERENZ/libpostal
 def name_key(self, props, component):
     '''
     Sample one name key for the component from the distribution returned
     by self.name_key_dist(props, component).
     '''
     candidate_keys, key_weights = self.name_key_dist(props, component)
     chosen_key = weighted_choice(candidate_keys, key_weights)
     return chosen_key
예제 #37
0
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        '''
        Combine a number (or alphanumeric identifier) with a phrase sampled
        from the address config, e.g. "Floor 2", "2/F" or "2nd Floor".

        :param key: config key whose alternatives supply the phrase
        :param num: the value to render; may be None, an integer-like or
            float-like value, or an alphanumeric string
        :param language: language code used for the config lookups
        :param country: optional country used for the config lookups
        :param dictionaries: passed to address_config.alternative_probabilities
        :param strict_numeric: if True, a value containing letters is
            returned as-is (decoded) without any phrase
        :param is_alpha: if True, alternatives under '<key>.alpha' are tried
            before those under key

        :return: the rendered unicode string. Returns None when num is None
            and no phrase is configured, when an alphanumeric value has no
            applicable rendering type, or when an integer falls outside the
            phrase's number_min_abs_value / number_max_abs_value bounds.
        '''
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Neither int nor float: inspect the tokens to find out
                    # whether the value has numeric and/or alphabetic parts
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without an explicit probability is taken
                # with weight 1.0 and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            usable = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            # zip(*[]) cannot be unpacked into two names, so an empty filter
            # result is handled like a zero total weight below
            if not usable:
                return None

            values, probs = zip(*usable)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            # The affix replaces the sampled phrase
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Occasionally return the bare number without any phrase
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
예제 #38
0
파일: config.py 프로젝트: BERENZ/libpostal
    def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
        '''
        Return a copy of components with admin components randomly dropped
        and configured default admin values randomly added.

        :param components: mapping of component name to value; not modified
        :param boundaries: iterable of dicts; their ('type', 'id') pairs are
            passed to self.include_component as the containing ids
        :param country: country used for config lookups
        :param population: forwarded to self.include_component
        :param unambiguous_city: forwarded to self.include_component
        :return: the new components mapping, after
            self.drop_invalid_components has been applied to it
        '''
        containing_ids = set()
        for boundary in boundaries:
            boundary_type = boundary.get('type')
            boundary_id = safe_encode(boundary.get('id', ''))
            if boundary_type and boundary_id:
                containing_ids.add((boundary_type, boundary_id))

        original_bitset = ComponentDependencies.component_bitset(components)

        # Group the admin components by value to find ones sharing a name
        admin_components = [c for c in components if c in self.ADMIN_COMPONENTS]
        names = defaultdict(list)
        for c in admin_components:
            names[components[c]].append(c)

        same_name = set()
        for _, sharing in six.iteritems(names):
            if len(sharing) > 1:
                same_name.update(sharing)

        new_components = components.copy()

        if AddressFormatter.CITY in components:
            city_replacements = set()
        else:
            city_replacements = self.city_replacements(country)

        include_kwargs = dict(country=country, population=population, unambiguous_city=unambiguous_city)

        for component in admin_components:
            include = self.include_component(component, containing_ids, **include_kwargs)
            if include or component in city_replacements:
                continue

            # Note: this check is for cities that have the same name as their admin
            # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
            # city, we don't want to include country on its own. This should help the parser
            # default to the city in ambiguous cases where only one component is specified.
            if component == AddressFormatter.CITY and component in same_name:
                for twin in names[components[component]]:
                    new_components.pop(twin, None)
            else:
                new_components.pop(component, None)

        for component in self.ADMIN_COMPONENTS:
            value = self.get_property(('components', component, 'value'), country=country, default=None)

            if not value:
                # No fixed value: sample one from the configured alternatives,
                # building and caching their CDF on first use
                cache_key = (country, component)
                values, probs = self.cdf_cache.get(cache_key, (None, None))
                if values is None:
                    configured = self.get_property(('components', component, 'values'), country=country, default=None)
                    if configured is not None:
                        values, probs = zip(*[(v['value'], float(v['probability'])) for v in configured])
                        probs = cdf(probs)
                        self.cdf_cache[cache_key] = (values, probs)

                if values is not None:
                    value = weighted_choice(values, probs)

            if value is None or component in components:
                continue

            if self.include_component(component, containing_ids, **include_kwargs):
                new_components[component] = value

        self.drop_invalid_components(new_components, country, original_bitset=original_bitset)

        return new_components
예제 #39
0
    def revised_template(self, template, components, country, language=None):
        '''
        Return a copy of the address template with randomly sampled
        revisions: an optional inversion plus per-component insertions.

        :param template: the template to revise; a falsy value returns None
        :param components: collection of component names present in the
            address; each one is considered for insertion in
            self.component_order order
        :param country: country code used to look up insertion config
        :param language: optional language code; enables the
            '<country>_<language>' and language-level lookups

        Intermediate results are memoized in self.template_cache, keyed by
        the sorted tuple of revisions sampled so far in this call.
        '''
        if not template:
            return None

        country_language = None
        if language:
            country_language = '{}_{}'.format(country, language)

        # Use the aliased country only if neither the country nor the
        # country_language key has insertions/conditionals of its own
        alias_country = self.country_aliases.get(country.upper(), country).lower()
        for term in (country, country_language):
            if term in self.country_insertions or term in self.country_conditionals:
                break
        else:
            country = alias_country

        # Accumulates an identifier for every revision applied in this call
        cache_keys = []

        invert_probability = self.country_invert_probabilities.get(country, self.global_invert_probability)
        if random.random() < invert_probability:
            cache_keys.append('inverted')
            cache_key = tuple(sorted(cache_keys))
            if cache_key in self.template_cache:
                template = self.template_cache[cache_key]
            else:
                template = self.inverted(template)
                self.template_cache[cache_key] = template

        for component in sorted(components, key=self.component_order.get):
            # Lookup order for both insertions and conditionals:
            # country, then country_language, then language, then global.
            # scope records which level the insertions lookup last tried.
            scope = country
            insertions = nested_get(self.country_insertions, (country, component), default=None)
            conditionals = nested_get(self.country_conditionals, (country, component), default=None)

            if insertions is None and language:
                insertions = nested_get(self.country_insertions, (country_language, component), default=None)
                scope = country_language

            if conditionals is None and language:
                conditionals = nested_get(self.country_conditionals, (country_language, component), default=None)

            if insertions is None and language:
                insertions = nested_get(self.language_insertions, (language, component), default=None)
                scope = 'lang:{}'.format(language)

            if conditionals is None and language:
                conditionals = nested_get(self.language_conditionals, (language, component), default=None)

            if insertions is None:
                insertions = nested_get(self.global_insertions, (component,), default=None)
                scope = None

            if conditionals is None:
                conditionals = nested_get(self.global_conditionals, (component,), default=None)

            if insertions is not None:
                # Use the first conditional whose key is among the components
                conditional_insertions = None
                if conditionals is not None:
                    for k, v in six.iteritems(conditionals):
                        if k in components:
                            conditional_insertions = v
                            break

                order, other = None, None

                # Check the conditional probabilities first
                if conditional_insertions is not None:
                    values, probs = conditional_insertions
                    order, other = weighted_choice(values, probs)

                # If there are no conditional probabilities or the "default" value was chosen, sample from the marginals
                if other is None:
                    values, probs = insertions
                    order, other = weighted_choice(values, probs)

                # Even though we may change the value of "other" below, use
                # the original cache key because changes from here on are
                # deterministic and should be cached.
                insertion_id = (scope, component, order, other)
                cache_keys.append(insertion_id)

                cache_key = tuple(sorted(cache_keys))

                # Cache hit: reuse the stored template and move on to the next component
                if cache_key in self.template_cache:
                    template = self.template_cache[cache_key]
                    continue

                other_token = self.tag_token(other)

                # Don't allow insertions between road and house_number
                # This can happen if e.g. "level" is supposed to be inserted
                # after house number assuming that it's a continental European
                # address where house number comes after road. If in a previous
                # insertion we were to swap house_number and road to create an
                # English-style address, the final ordering would be
                # house_number, unit, road, which we don't want. So effectively
                # treat house_number and road as an atomic unit.

                if other == self.HOUSE_NUMBER and component != self.ROAD:
                    road_tag = self.tag_token(self.ROAD)
                    house_number_tag = other_token

                    if house_number_tag in template and road_tag in template:
                        road_after_house_number = template.index(road_tag) > template.index(house_number_tag)

                        if road_after_house_number and order == self.AFTER:
                            other = self.ROAD
                        elif not road_after_house_number and order == self.BEFORE:
                            other = self.ROAD
                elif other == self.ROAD and component != self.HOUSE_NUMBER:
                    house_number_tag = self.tag_token(self.HOUSE_NUMBER)
                    road_tag = other_token

                    if house_number_tag in template and road_tag in template:
                        road_before_house_number = template.index(road_tag) < template.index(house_number_tag)

                        if road_before_house_number and order == self.AFTER:
                            other = self.HOUSE_NUMBER
                        elif not road_before_house_number and order == self.BEFORE:
                            other = self.HOUSE_NUMBER

                # Note: the presence check uses the token of the originally
                # sampled "other", while the insertion uses the possibly
                # swapped one
                if order == self.BEFORE and other_token in template:
                    template = self.insert_component(template, component, before=other)
                elif order == self.AFTER and other_token in template:
                    template = self.insert_component(template, component, after=other)
                elif order == self.LAST:
                    template = self.insert_component(template, component, last=True)
                elif order == self.FIRST:
                    template = self.insert_component(template, component, first=True)
                else:
                    # Nothing was inserted, so nothing is cached for this key
                    continue

                self.template_cache[cache_key] = template

        return template
예제 #40
0
    def numeric_phrase(cls,
                       key,
                       num,
                       language,
                       country=None,
                       dictionaries=(),
                       strict_numeric=False,
                       is_alpha=False):
        """Combine ``num`` with a phrase sampled from the address config.

        A phrase is drawn from the alternatives configured under ``key``
        (``'<key>.alpha'`` is consulted first when ``is_alpha`` is set), then
        one way of expressing the number is sampled from that phrase's
        ``*_probability`` properties.

        :param key: config key whose alternatives are sampled
        :param num: the number to render; may be ``None``
        :param language: language passed to the config lookups
        :param country: optional country passed to the config lookups
        :param dictionaries: forwarded to
            ``address_config.alternative_probabilities``
        :param strict_numeric: when true, a non-numeric ``num`` that contains
            letters is returned decoded, without any phrase
        :param is_alpha: when true, try the ``'<key>.alpha'`` alternatives
            before falling back to ``key``
        :return: the rendered text, or ``None`` when nothing can be produced
            (no number and no phrase, an integer outside the configured
            min/max bounds, or no usable number type for a value containing
            letters)
        """
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Neither an int nor a float: look at the individual tokens
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any(ch.isalpha() for ch in t):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities(
                '{}.alpha'.format(key),
                language,
                dictionaries=dictionaries,
                country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(
                key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix',
                         'ordinal'):
            # Named prob_key so the ``key`` parameter is not shadowed
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability is taken with
                # weight 1.0 and ends the scan
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            kept = [(v, p) for v, p in zip(values, probs)
                    if v in ('numeric', 'null', 'standalone')]
            # zip(*[]) cannot be unpacked into two names, so bail out before
            # it, the same way as for a zero total probability
            if not kept:
                return None
            values, probs = zip(*kept)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props[
                    'number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props[
                    'number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Work on a copy so the shared config mapping is not mutated
            digits_props = dict(digits_props)
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num,
                                 language,
                                 digits_props,
                                 num_type=Digits.CARDINAL
                                 if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random(
        ) < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            # The affix replaces the sampled phrase
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(
                num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Optionally return the number alone, without the phrase
        if 'null_phrase_probability' in props and (
                num_type == 'ordinal' or
            (has_alpha and
             (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
예제 #41
0
파일: units.py 프로젝트: BERENZ/libpostal
 def for_floor(cls, floor_number, num_digits=None):
     """Return ``floor_number`` followed by a sampled, zero-filled unit.

     When ``num_digits`` is not given, the pad width is drawn with
     ``cls.sample_num_digits()``.
     """
     if num_digits is None:
         num_digits = cls.sample_num_digits()
     sampled_unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
     padded_unit = safe_decode(sampled_unit).zfill(num_digits)
     return six.u('{}{}').format(floor_number, padded_unit)
예제 #42
0
파일: units.py 프로젝트: BERENZ/libpostal
 def sample_num_digits(cls):
     """Pick one entry of ``cls.num_digits`` via ``weighted_choice`` and ``cls.num_digits_cdf``."""
     digit_counts, digit_weights = cls.num_digits, cls.num_digits_cdf
     return weighted_choice(digit_counts, digit_weights)
예제 #43
0
파일: units.py 프로젝트: BERENZ/libpostal
    def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
        """Generate a random unit identifier of a sampled alphanumeric type.

        The type (numeric, hyphenated, alpha, or a letter/number combination)
        comes from ``cls.choose_alphanumeric_type('units.alphanumeric', ...)``.
        The numeric part is either sampled directly or, with probability
        ``use_floor_probability`` when floor information is available, built
        from a floor number via ``cls.for_floor``.

        :param language: language passed to the config lookups
        :param country: optional country passed to the config lookups
        :param num_floors: forwarded to ``Floor.random_int`` when a floor must
            be sampled; its presence also enables floor-based numbering
        :param num_basements: forwarded to ``Floor.random_int``
        :param floor: optional floor to base the unit on; replaced by a
            sampled floor unless it consists only of digits
        :return: the unit text, or ``None`` when no type is configured or the
            sampled type matches none of the handled kinds
        """
        num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
        if num_type is None:
            return None

        use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)

        use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)

        if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
            if random.random() >= use_positive_numbers_prob:
                number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
            else:
                number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
        else:
            # Decode first so a non-string floor (e.g. an int) does not break
            # the digit check
            if floor is None or not safe_decode(floor).isdigit():
                floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)

            floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
            ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)

            if ground_floor_starts_at is not None:
                try:
                    floor = int(floor)
                    if floor >= floor_numbering_starts_at:
                        floor -= floor_numbering_starts_at
                    floor += ground_floor_starts_at
                    floor = safe_decode(floor)
                except (TypeError, ValueError):
                    # Leave a floor that cannot be shifted as it is
                    pass

            use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
            if use_floor_affix_prob and random.random() < use_floor_affix_prob:
                floor_phrase = Floor.phrase(floor, language, country=country)
                # Only works if the floor phrase is strictly numeric e.g. "1" or "H1"
                if is_numeric_strict(floor_phrase):
                    unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)

                    unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
                    if unit_num_digits is not None:
                        unit = safe_decode(unit).zfill(unit_num_digits)

                    return six.u('{}{}').format(floor_phrase, unit)

            floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
            # The floor may still be a non-string here (no ground-floor offset
            # applied), so decode before using string methods
            floor = safe_decode(floor)
            if floor_num_digits is not None and floor.isdigit():
                floor = floor.zfill(floor_num_digits)

            number = cls.for_floor(floor)

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
            range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
            direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
            direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))

            if random.random() < direction_prob:
                direction = 'left' if direction == 'right' else 'right'

            direction_right = direction == 'right'

            if random.random() < range_prob:
                # A floor-based number is text; do the range arithmetic on its
                # integer value and skip the adjustment if it has none
                try:
                    base = int(number)
                except (TypeError, ValueError):
                    base = None

                if base is not None:
                    if direction_right:
                        number2 += base
                    else:
                        number2 = max(0, base - number2)
            if direction == 'right':
                return u'{}-{}'.format(number, number2)
            else:
                return u'{}-{}'.format(number2, number)
        else:
            alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
            alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
            if alphabet_probability is not None and random.random() >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return safe_decode(letter)
            else:
                if num_floors is None:
                    number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)

                whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
                hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
                # One draw decides between a space, a hyphen, or no separator
                whitespace_phrase = u''
                r = random.random()
                if r < whitespace_probability:
                    whitespace_phrase = u' '
                elif r < (whitespace_probability + hyphen_probability):
                    whitespace_phrase = u'-'

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}{}').format(letter, whitespace_phrase, number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}{}').format(number, whitespace_phrase, letter)
예제 #44
0
    def dropout_components(self,
                           components,
                           boundaries=(),
                           country=None,
                           population=None,
                           unambiguous_city=False):
        """Return a copy of ``components`` with admin components dropped or added.

        Each admin component present is kept or removed according to
        ``self.include_component``. Admin components that are absent may be
        filled in from the configured ``value``/``values`` properties. The
        result is finally passed through ``self.drop_invalid_components``.

        :param components: mapping of component name to value (not modified)
        :param boundaries: iterable of mappings with ``'type'`` and ``'id'``
        :param country: country used for config lookups
        :param population: forwarded to ``include_component``
        :param unambiguous_city: forwarded to ``include_component``
        :return: the new components mapping
        """
        # Collect (type, id) pairs for boundaries that carry both fields
        containing_ids = set()
        for boundary in boundaries:
            object_type = boundary.get('type')
            object_id = safe_encode(boundary.get('id', ''))
            if object_type and object_id:
                containing_ids.add((object_type, object_id))

        original_bitset = ComponentDependencies.component_bitset(components)

        admin_components = [
            c for c in components if c in self.ADMIN_COMPONENTS
        ]

        # Group admin components by their value to find shared names
        names = defaultdict(list)
        for admin_component in admin_components:
            names[components[admin_component]].append(admin_component)

        same_name = set()
        for sharing in names.values():
            if len(sharing) > 1:
                same_name.update(sharing)

        new_components = components.copy()

        replacements = set()
        if AddressFormatter.CITY not in components:
            replacements = self.city_replacements(country)

        for component in admin_components:
            include = self.include_component(component,
                                             containing_ids,
                                             country=country,
                                             population=population,
                                             unambiguous_city=unambiguous_city)

            if include or component in replacements:
                continue

            # Note: this check is for cities that have the same name as their admin
            # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
            # city, we don't want to include country on its own. This should help the parser
            # default to the city in ambiguous cases where only one component is specified.
            if component == AddressFormatter.CITY and component in same_name:
                for namesake in names[components[component]]:
                    new_components.pop(namesake, None)
            else:
                new_components.pop(component, None)

        for component in self.ADMIN_COMPONENTS:
            value = self.get_property(('components', component, 'value'),
                                      country=country,
                                      default=None)

            if not value:
                # No fixed value: sample from the configured alternatives,
                # caching their cumulative distribution per (country, component)
                cache_id = (country, component)
                values, probs = self.cdf_cache.get(cache_id, (None, None))
                if values is None:
                    values = self.get_property(
                        ('components', component, 'values'),
                        country=country,
                        default=None)
                    if values is not None:
                        pairs = [(v['value'], float(v['probability']))
                                 for v in values]
                        values, probs = zip(*pairs)
                        probs = cdf(probs)
                        self.cdf_cache[cache_id] = (values, probs)

                if values is not None:
                    value = weighted_choice(values, probs)

            if value is None or component in components:
                continue

            if self.include_component(component,
                                      containing_ids,
                                      country=country,
                                      population=population,
                                      unambiguous_city=unambiguous_city):
                new_components[component] = value

        self.drop_invalid_components(new_components,
                                     country,
                                     original_bitset=original_bitset)

        return new_components
예제 #45
0
    def revised_template(self, template, components, country, language=None):
        """Return ``template`` with the given components inserted.

        The template may first be inverted (with the per-country or global
        invert probability). Then, for each component in ``component_order``
        order, an insertion position is sampled from the most specific
        configuration available (country, country_language, language, then
        global) and applied. Each intermediate template is cached in
        ``self.template_cache`` under the sorted tuple of decisions taken so
        far.

        :param template: the format template; a falsy value yields ``None``
        :param components: the components to insert
        :param country: country code used to look up insertions/conditionals
        :param language: optional language used for the more specific lookups
        :return: the revised template, or ``None`` if ``template`` is falsy
        """
        if not template:
            return None

        country_language = None
        if language:
            country_language = '{}_{}'.format(country, language)

        # Fall back to the aliased country only when neither the country nor
        # the country_language pair has a configuration of its own
        alias_country = self.country_aliases.get(country.upper(),
                                                 country).lower()
        for term in (country, country_language):
            if term in self.country_insertions or term in self.country_conditionals:
                break
        else:
            country = alias_country

        cache_keys = []

        invert_probability = self.country_invert_probabilities.get(
            country, self.global_invert_probability)
        if random.random() < invert_probability:
            cache_keys.append('inverted')
            # Keys mix the string 'inverted' with tuples (whose first element
            # may be None or a string); sort by repr so the canonical ordering
            # also works on Python 3, where such values are not orderable.
            cache_key = tuple(sorted(cache_keys, key=repr))
            if cache_key in self.template_cache:
                template = self.template_cache[cache_key]
            else:
                template = self.inverted(template)
                self.template_cache[cache_key] = template

        for component in sorted(components, key=self.component_order.get):
            scope = country
            insertions = nested_get(self.country_insertions,
                                    (country, component),
                                    default=None)
            conditionals = nested_get(self.country_conditionals,
                                      (country, component),
                                      default=None)

            if insertions is None and language:
                insertions = nested_get(self.country_insertions,
                                        (country_language, component),
                                        default=None)
                scope = country_language

            if conditionals is None and language:
                conditionals = nested_get(self.country_conditionals,
                                          (country_language, component),
                                          default=None)

            if insertions is None and language:
                insertions = nested_get(self.language_insertions,
                                        (language, component),
                                        default=None)
                scope = 'lang:{}'.format(language)

            if conditionals is None and language:
                conditionals = nested_get(self.language_conditionals,
                                          (language, component),
                                          default=None)

            if insertions is None:
                insertions = nested_get(self.global_insertions, (component, ),
                                        default=None)
                scope = None

            if conditionals is None:
                conditionals = nested_get(self.global_conditionals,
                                          (component, ),
                                          default=None)

            if insertions is not None:
                # Use the first conditional whose key is among the components
                conditional_insertions = None
                if conditionals is not None:
                    for k, v in six.iteritems(conditionals):
                        if k in components:
                            conditional_insertions = v
                            break

                order, other = None, None

                # Check the conditional probabilities first
                if conditional_insertions is not None:
                    values, probs = conditional_insertions
                    order, other = weighted_choice(values, probs)

                # If there are no conditional probabilities or the "default" value was chosen, sample from the marginals
                if other is None:
                    values, probs = insertions
                    order, other = weighted_choice(values, probs)

                # Even though we may change the value of "other" below, use
                # the original cache key because changes from here on are
                # deterministic and should be cached.
                insertion_id = (scope, component, order, other)
                cache_keys.append(insertion_id)

                cache_key = tuple(sorted(cache_keys, key=repr))

                if cache_key in self.template_cache:
                    template = self.template_cache[cache_key]
                    continue

                other_token = self.tag_token(other)

                # Don't allow insertions between road and house_number
                # This can happen if e.g. "level" is supposed to be inserted
                # after house number assuming that it's a continental European
                # address where house number comes after road. If in a previous
                # insertion we were to swap house_number and road to create an
                # English-style address, the final ordering would be
                # house_number, unit, road, which we don't want. So effectively
                # treat house_number and road as an atomic unit.

                if other == self.HOUSE_NUMBER and component != self.ROAD:
                    road_tag = self.tag_token(self.ROAD)
                    house_number_tag = other_token

                    if house_number_tag in template and road_tag in template:
                        road_after_house_number = template.index(
                            road_tag) > template.index(house_number_tag)

                        if road_after_house_number and order == self.AFTER:
                            other = self.ROAD
                        elif not road_after_house_number and order == self.BEFORE:
                            other = self.ROAD
                elif other == self.ROAD and component != self.HOUSE_NUMBER:
                    house_number_tag = self.tag_token(self.HOUSE_NUMBER)
                    road_tag = other_token

                    if house_number_tag in template and road_tag in template:
                        road_before_house_number = template.index(
                            road_tag) < template.index(house_number_tag)

                        if road_before_house_number and order == self.AFTER:
                            other = self.HOUSE_NUMBER
                        elif not road_before_house_number and order == self.BEFORE:
                            other = self.HOUSE_NUMBER

                # Note: the presence check uses the originally sampled
                # component's token, while the insertion targets ``other``
                if order == self.BEFORE and other_token in template:
                    template = self.insert_component(template,
                                                     component,
                                                     before=other)
                elif order == self.AFTER and other_token in template:
                    template = self.insert_component(template,
                                                     component,
                                                     after=other)
                elif order == self.LAST:
                    template = self.insert_component(template,
                                                     component,
                                                     last=True)
                elif order == self.FIRST:
                    template = self.insert_component(template,
                                                     component,
                                                     first=True)
                else:
                    # Nothing could be inserted; leave the cache untouched
                    continue

                self.template_cache[cache_key] = template

        return template
예제 #46
0
 def name_key(self, props, component):
     """Sample one name key from the distribution returned by ``self.name_key_dist``."""
     candidates, weights = self.name_key_dist(props, component)
     chosen = weighted_choice(candidates, weights)
     return chosen
예제 #47
0
    def random(cls,
               language,
               country=None,
               num_floors=None,
               num_basements=None,
               floor=None):
        """Generate a random unit identifier of a sampled alphanumeric type.

        The type (numeric, hyphenated, alpha, or a letter/number combination)
        comes from ``cls.choose_alphanumeric_type('units.alphanumeric', ...)``.
        The numeric part is either sampled directly or, with probability
        ``use_floor_probability`` when floor information is available, built
        from a floor number via ``cls.for_floor``.

        :param language: language passed to the config lookups
        :param country: optional country passed to the config lookups
        :param num_floors: forwarded to ``Floor.random_int`` when a floor must
            be sampled; its presence also enables floor-based numbering
        :param num_basements: forwarded to ``Floor.random_int``
        :param floor: optional floor to base the unit on; replaced by a
            sampled floor unless it consists only of digits
        :return: the unit text, or ``None`` when no type is configured or the
            sampled type matches none of the handled kinds
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'units.alphanumeric', language, country=country)
        if num_type is None:
            return None

        use_floor_prob = address_config.get_property(
            'units.alphanumeric.use_floor_probability',
            language,
            country=country,
            default=0.0)

        use_positive_numbers_prob = address_config.get_property(
            'units.alphanumeric.use_positive_numbers_probability',
            language,
            country=country,
            default=0.0)

        if (num_floors is None
                and floor is None) or random.random() >= use_floor_prob:
            if random.random() >= use_positive_numbers_prob:
                number = weighted_choice(cls.numbered_units,
                                         cls.unit_probs_cdf)
            else:
                number = weighted_choice(cls.positive_units,
                                         cls.positive_units_cdf)
        else:
            # Decode first so a non-string floor (e.g. an int) does not break
            # the digit check
            if floor is None or not safe_decode(floor).isdigit():
                floor = Floor.random_int(language,
                                         country=country,
                                         num_floors=num_floors,
                                         num_basements=num_basements)

            floor_numbering_starts_at = address_config.get_property(
                'levels.numbering_starts_at',
                language,
                country=country,
                default=0)
            ground_floor_starts_at = address_config.get_property(
                'units.alphanumeric.use_floor_ground_starts_at',
                language,
                country=country,
                default=None)

            if ground_floor_starts_at is not None:
                try:
                    floor = int(floor)
                    if floor >= floor_numbering_starts_at:
                        floor -= floor_numbering_starts_at
                    floor += ground_floor_starts_at
                    floor = safe_decode(floor)
                except (TypeError, ValueError):
                    # Leave a floor that cannot be shifted as it is
                    pass

            use_floor_affix_prob = address_config.get_property(
                'units.alphanumeric.use_floor_numeric_affix_probability',
                language,
                country=country,
                default=0.0)
            if use_floor_affix_prob and random.random() < use_floor_affix_prob:
                floor_phrase = Floor.phrase(floor, language, country=country)
                # Only works if the floor phrase is strictly numeric e.g. "1" or "H1"
                if is_numeric_strict(floor_phrase):
                    unit = weighted_choice(cls.positive_units,
                                           cls.positive_units_cdf)

                    unit_num_digits = address_config.get_property(
                        'units.alphanumeric.use_floor_unit_num_digits',
                        language,
                        country=country,
                        default=None)
                    if unit_num_digits is not None:
                        unit = safe_decode(unit).zfill(unit_num_digits)

                    return six.u('{}{}').format(floor_phrase, unit)

            floor_num_digits = address_config.get_property(
                'units.alphanumeric.use_floor_floor_num_digits',
                language,
                country=country,
                default=None)
            # The floor may still be a non-string here (no ground-floor offset
            # applied), so decode before using string methods
            floor = safe_decode(floor)
            if floor_num_digits is not None and floor.isdigit():
                floor = floor.zfill(floor_num_digits)

            number = cls.for_floor(floor)

        if num_type == cls.NUMERIC:
            return safe_decode(number)
        elif num_type == cls.HYPHENATED_NUMBER:
            number2 = weighted_choice(cls.positive_units,
                                      cls.positive_units_cdf)
            range_prob = float(
                address_config.get_property(
                    'units.alphanumeric.hyphenated_number.range_probability',
                    language,
                    country=country,
                    default=0.5))
            direction = address_config.get_property(
                'units.alphanumeric.hyphenated_number.direction',
                language,
                country=country,
                default='right')
            direction_prob = float(
                address_config.get_property(
                    'units.alphanumeric.hyphenated_number.direction_probability',
                    language,
                    country=country,
                    default=0.0))

            if random.random() < direction_prob:
                direction = 'left' if direction == 'right' else 'right'

            direction_right = direction == 'right'

            if random.random() < range_prob:
                # A floor-based number is text; do the range arithmetic on its
                # integer value and skip the adjustment if it has none
                try:
                    base = int(number)
                except (TypeError, ValueError):
                    base = None

                if base is not None:
                    if direction_right:
                        number2 += base
                    else:
                        number2 = max(0, base - number2)
            if direction == 'right':
                return u'{}-{}'.format(number, number2)
            else:
                return u'{}-{}'.format(number2, number)
        else:
            alphabet = address_config.get_property('alphabet',
                                                   language,
                                                   country=country,
                                                   default=latin_alphabet)
            alphabet_probability = address_config.get_property(
                'alphabet_probability',
                language,
                country=country,
                default=None)
            if alphabet_probability is not None and random.random(
            ) >= alphabet_probability:
                alphabet = latin_alphabet
            letter = sample_alphabet(alphabet)
            if num_type == cls.ALPHA:
                return safe_decode(letter)
            else:
                if num_floors is None:
                    number = weighted_choice(cls.positive_units_letters,
                                             cls.positive_units_letters_cdf)

                whitespace_probability = float(
                    num_type_props.get('whitespace_probability', 0.0))
                hyphen_probability = float(
                    num_type_props.get('hyphen_probability', 0.0))
                # One draw decides between a space, a hyphen, or no separator
                whitespace_phrase = u''
                r = random.random()
                if r < whitespace_probability:
                    whitespace_phrase = u' '
                elif r < (whitespace_probability + hyphen_probability):
                    whitespace_phrase = u'-'

                if num_type == cls.ALPHA_PLUS_NUMERIC:
                    return six.u('{}{}{}').format(letter, whitespace_phrase,
                                                  number)
                elif num_type == cls.NUMERIC_PLUS_ALPHA:
                    return six.u('{}{}{}').format(number, whitespace_phrase,
                                                  letter)
예제 #48
0
파일: query.py 프로젝트: BERENZ/libpostal
 def phrase(cls, language, country=None):
     """Sample a phrase from the ``'cross_streets.intersection'`` alternatives.

     Returns ``None`` when no alternatives are configured for the given
     language/country.
     """
     candidates, weights = address_config.alternative_probabilities('cross_streets.intersection', language, country=country)
     if candidates:
         # Each candidate is a (phrase, properties) pair; only the phrase is used
         chosen, _ = weighted_choice(candidates, weights)
         return chosen
     return None
예제 #49
0
 def sample_num_digits(cls):
     """Pick one entry of ``cls.num_digits`` via ``weighted_choice`` and ``cls.num_digits_cdf``."""
     digit_counts, digit_weights = cls.num_digits, cls.num_digits_cdf
     return weighted_choice(digit_counts, digit_weights)