Exemplo n.º 1
0
def init_disambiguation():
    global char_scripts, script_languages
    char_scripts[:] = []
    char_scripts.extend(get_chars_by_script())
    script_languages.update({
        script: set(langs)
        for script, langs in get_script_languages().iteritems()
    })
Exemplo n.º 2
0
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries)

        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (zs, qs):
            for i, (props, poly) in enumerate(idx.polygons):
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.iteritems():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            level = props.get(QuattroshapesReverseGeocoder.LEVEL)
                            if is_neighborhood and level != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                attrs['source'] = 'osm'
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
Exemplo n.º 3
0
    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries)

        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        logger = logging.getLogger('neighborhoods')

        qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)

        logger.info('Creating ClickThatHood neighborhoods')
        cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()

        logger.info('Creating OSM neighborhoods')
        osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)

        logger.info('Creating Quattroshapes neighborhoods')
        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)

        country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)

        osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
        osm_admin_rtree.cache_size = 1000

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (cth, qs, osmn):
            for i in xrange(idx.i):
                props = idx.get_properties(i)
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in six.iteritems(attrs):
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        for i in six.moves.xrange(osmn.i):
            props = osmn.get_properties(i)
            poly = osmn.get_polygon(i)

            props['source'] = 'osm'
            props['component'] = AddressFormatter.SUBURB
            props['polygon_type'] = 'neighborhood'

            index.index_polygon(poly.context)
            index.add_polygon(poly.context, props)

        qs.matched = [False] * qs.i
        cth.matched = [False] * cth.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons
        num_polys = 0
        for element_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            id_type, element_id = element_id.split(':')
            element_id = long(element_id)

            props['type'] = id_type
            props['id'] = element_id

            possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
            is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)

            country, candidate_languages = country_rtree.country_and_languages(lat, lon)

            component_name = None

            component_name = osm_address_components.component_from_properties(country, attrs)

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])

            for idx in (cth, qs):
                candidates = idx.get_candidate_polygons(lat, lon, return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
                                                   for c in safe_decode(osm_name)))

                        for i in candidates:
                            props = idx.get_properties(i)
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                poly = idx.get_polygon(i)
                                arg_max = (max_sim, props, poly.context, idx, i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
                existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)

                skip_node = False

                for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
                    for poly_index, osm_props in enumerate(boundaries):
                        containing_component = None
                        name = osm_props.get('name')
                        # Only exact name matches here since we're comparins OSM to OSM
                        if name and name.lower() != attrs.get('name', '').lower():
                            continue

                        if boundaries is existing_neighborhood_boundaries:
                            containing_component = AddressFormatter.SUBURB
                            skip_node = True
                            break
                        else:
                            containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]

                            containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)

                        if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
                            skip_node = True
                            break
                    if skip_node:
                        break

                # Skip this element
                if skip_node:
                    continue

                if idx is cth:
                    if props['component'] == AddressFormatter.SUBURB:
                        attrs['polygon_type'] = 'neighborhood'
                    elif props['component'] == AddressFormatter.CITY_DISTRICT:
                        attrs['polygon_type'] = 'local_admin'
                    else:
                        continue
                    source = 'osm_cth'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)

                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
                component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
                attrs['component'] = component

                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')):
            for i in xrange(idx.i):
                props = idx.get_properties(i)
                poly = idx.get_polygon(i)
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is cth:
                    component = props['component']
                    if component == AddressFormatter.SUBURB:
                        props['polygon_type'] = 'neighborhood'
                    elif component == AddressFormatter.CITY_DISTRICT:
                        props['polygon_type'] = 'local_admin'
                    else:
                        continue
                elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
                    component = AddressFormatter.SUBURB
                    name = props.get('name')
                    if not name:
                        continue
                    for pattern, repl in cls.regex_replacements:
                        name = pattern.sub(repl, name)

                    props['name'] = name

                    if cls.quattroshapes_city_district_regex.match(name):
                        component = AddressFormatter.CITY_DISTRICT

                    props['component'] = component
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
Exemplo n.º 4
0
def init_disambiguation():
    global char_scripts, script_languages
    char_scripts[:] = []
    char_scripts.extend(get_chars_by_script())
    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
Exemplo n.º 5
0
    def create_from_osm_and_quattroshapes(cls,
                                          filename,
                                          quattroshapes_dir,
                                          output_dir,
                                          scratch_dir=SCRATCH_DIR):
        '''
        Given an OSM file (planet or some other bounds) containing neighborhoods
        as points (some suburbs have boundaries)

        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other geometries.
        '''
        index = cls(save_dir=output_dir)

        ensure_dir(scratch_dir)

        logger = logging.getLogger('neighborhoods')
        logger.setLevel(logging.INFO)

        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
        ensure_dir(qs_scratch_dir)
        logger.info('Creating Quattroshapes neighborhoods')

        qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(
            quattroshapes_dir, qs_scratch_dir)
        logger.info('Creating Zetashapes neighborhoods')
        zs = cls.create_zetashapes_neighborhoods_index()

        logger.info('Creating IDF index')
        idf = IDFIndex()

        char_scripts = get_chars_by_script()

        for idx in (zs, qs):
            for i, (props, poly) in enumerate(idx.polygons):
                name = props.get('name')
                if name is not None:
                    doc = cls.count_words(name)
                    idf.update(doc)

        for key, attrs, deps in parse_osm(filename):
            for k, v in attrs.iteritems():
                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
                    doc = cls.count_words(v)
                    idf.update(doc)

        qs.matched = [False] * qs.i
        zs.matched = [False] * zs.i

        logger.info('Matching OSM points to neighborhood polygons')
        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
        num_polys = 0
        for node_id, attrs, deps in parse_osm(filename):
            try:
                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
            except ValueError:
                continue

            osm_name = attrs.get('name')
            if not osm_name:
                continue

            is_neighborhood = attrs.get('place') == 'neighbourhood'

            ranks = []
            osm_names = []

            for key in OSM_NAME_TAGS:
                name = attrs.get(key)
                if name:
                    osm_names.append(name)

            for name_key in OSM_NAME_TAGS:
                osm_names.extend([
                    v for k, v in attrs.iteritems()
                    if k.startswith('{}:'.format(name_key))
                ])

            for idx in (zs, qs):
                candidates = idx.get_candidate_polygons(lat,
                                                        lon,
                                                        return_all=True)

                if candidates:
                    max_sim = 0.0
                    arg_max = None

                    normalized_qs_names = {}

                    for osm_name in osm_names:

                        contains_ideographs = any(
                            ((char_scripts[ord(c)] or '').lower()
                             in ideographic_scripts
                             for c in safe_decode(osm_name)))

                        for i in candidates:
                            props, poly = idx.polygons[i]
                            name = normalized_qs_names.get(i)
                            if not name:
                                name = props.get('name')
                                if not name:
                                    continue
                                for pattern, repl in cls.regex_replacements:
                                    name = pattern.sub(repl, name)
                                normalized_qs_names[i] = name

                            if is_neighborhood and idx is qs and props.get(
                                    QuattroshapesReverseGeocoder.LEVEL
                            ) != 'neighborhood':
                                continue

                            if not contains_ideographs:
                                sim = NeighborhoodDeduper.compare(
                                    osm_name, name, idf)
                            else:
                                # Many Han/Hangul characters are common, shouldn't use IDF
                                sim = NeighborhoodDeduper.compare_ideographs(
                                    osm_name, name)

                            if sim > max_sim:
                                max_sim = sim
                                arg_max = (max_sim, props, poly.context, idx,
                                           i)

                    if arg_max:
                        ranks.append(arg_max)

            ranks.sort(key=operator.itemgetter(0), reverse=True)
            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
                score, props, poly, idx, i = ranks[0]

                if idx is zs:
                    attrs['polygon_type'] = 'neighborhood'
                    source = 'osm_zeta'
                else:
                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
                    source = 'osm_quattro'
                    if level == 'neighborhood':
                        attrs['polygon_type'] = 'neighborhood'
                    else:
                        attrs['polygon_type'] = 'local_admin'

                attrs['source'] = source
                index.index_polygon(poly)
                index.add_polygon(poly, attrs)
                idx.matched[i] = True

            num_polys += 1
            if num_polys % 1000 == 0 and num_polys > 0:
                logger.info('did {} neighborhoods'.format(num_polys))

        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
            for i, (props, poly) in enumerate(idx.polygons):
                if idx.matched[i]:
                    continue
                props['source'] = source
                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL,
                                          None) == 'neighborhood':
                    props['polygon_type'] = 'neighborhood'
                else:
                    # We don't actually care about local admin polygons unless they match OSM
                    continue
                index.index_polygon(poly.context)
                index.add_polygon(poly.context, props)

        return index
Exemplo n.º 6
0
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield (token_types.PHRASE, [(c, ) + t], prefix_search)
                    continue
            yield c, t, data


street_types_gazetteer = DictionaryPhraseFilter(
    'street_types.txt',
    'directionals.txt',
    'concatenated_suffixes_separable.txt',
    'concatenated_suffixes_inseparable.txt',
    'concatenated_prefixes_separable.txt',
    'stopwords.txt',
)

char_scripts = get_chars_by_script()
script_languages = {
    script: set(langs)
    for script, langs in get_script_languages().iteritems()
}

UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
MAX_ASCII = 127


def get_string_script(s):
    s = safe_decode(s)
    str_len = len(s)
    script = last_script = UNKNOWN_SCRIPT
    is_ascii = True
Exemplo n.º 7
0
# For toponyms, we want to limit the countries we consider to those where
# the place names can themselves be considered training examples of the language
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
    'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
    'fr': set(['fr']),
    'it': set(['it']),
    'de': set(['de', 'at']),
    'nl': set(['nl']),
    'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
               've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
               'ni', 'hn']),
    'pt': set(['pt', 'br']),
}

char_scripts = get_chars_by_script()
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
lang_scripts = defaultdict(set)

for script, langs in six.iteritems(script_languages):
    for lang in langs:
        lang_scripts[lang].add(script)

lang_scripts = dict(lang_scripts)

UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
MAX_ASCII = 127


def get_string_script(s):