Exemplo n.º 1
0
def fill_lang_geo(backend, alt_names_file):
    """
    Attach per-language alternate names to the documents stored in backend.

    Rows are read through ``file_utils.txt_to_dict(alt_names_file)`` and
    grouped as ``{geoname_id: {iso_language: [alt_name, ...]}}``. Each
    geoname_id then gets one ``update_one`` call that ``$set``s its ``langs``
    field to that mapping.

    Names are de-duplicated per language and kept in the order they first
    appear in the file, so repeated runs store identical documents.
    """
    langs_by_id = {}
    for line in file_utils.txt_to_dict(alt_names_file):
        # Rows with 8 or more columns are treated as historical names and
        # ignored (presumably the trailing "historic" column is only present
        # when set -- confirm against txt_to_dict's output).
        if len(line) >= 8:
            continue
        geoname_id, iso_language, alt_name = line[1], line[2], line[3]
        if not iso_language or not alt_name or iso_language in AVOID_LANGS:
            continue
        names = langs_by_id.setdefault(geoname_id, {}).setdefault(
            iso_language, [])
        # Keep each name once, preserving first-seen order
        if alt_name not in names:
            names.append(alt_name)

    for geoname_id, langs in langs_by_id.items():
        backend.collection.update_one({'geoname_id': geoname_id},
                                      {'$set': {
                                          'langs': langs
                                      }})
Exemplo n.º 2
0
def correct_es_hierarchy(backend, spain_pc_file=None):
    """
    Try to connect ES cities & provinces as far as we can.

    Spanish provinces found in the backend are indexed by their iso code
    (without the ``ES-`` prefix when present). Each row of spain_pc_file
    (read through ``file_utils.txt_to_dict``) gives a city name (column 2,
    possibly several names separated by ``/``) and a province code
    (column 4). A city stored in the backend whose ``parent_id`` equals the
    province's own ``parent_id`` and whose lower-cased name matches one of
    those names is re-parented to the province's ``dolead_id``.

    All replacements are sent in a single ``bulk_write``; nothing is written
    when no city matched. Does nothing when spain_pc_file is None.
    """
    if spain_pc_file is None:
        return

    provs = {}
    cities = {}
    ops = []

    def _get_all_names(line):
        # Full name first, then every part of an "A/B" style name
        line_name = line[2]
        yield line_name
        if '/' in line_name:
            for name in line_name.split('/'):
                yield name.strip()

    for prov in backend.collection.find({'country_code': 'ES',
                                         'geo_type': 'Province'}):
        iso_code = prov['iso_code']
        if 'ES-' in iso_code:
            iso_code = iso_code.split('-')[1]
        # Always map the code to the province document itself
        provs[iso_code] = prov

    # Constructing mapping: "<province parent id>-<city name>" -> province code
    for row in file_utils.txt_to_dict(spain_pc_file):
        province_code = row[4]
        try:
            province_parent = provs[province_code]['parent_id']
        except KeyError:
            # Province did not exist in our Adwords file
            continue
        for name in _get_all_names(row):
            cities['%s-%s' % (province_parent, name.lower())] = province_code

    # Browsing collection to update when possible
    total_cities = 0
    for city in backend.collection.find({'country_code': 'ES',
                                         'geo_type': 'City'}):
        # Counted here so the summary needs no second (count) query
        total_cities += 1
        province_code = cities.get(
            '%s-%s' % (city['parent_id'], city['name'].lower()))
        if province_code is None:
            continue
        city['parent_id'] = provs[province_code]['dolead_id']
        ops.append(ReplaceOne({'_id': city['_id']}, city))

    if ops:
        print('%s/%s ES hierarchy corrected' % (len(ops), total_cities))
        backend.collection.bulk_write(ops)
Exemplo n.º 3
0
def parse_alt_names(alt_names_txt):
    """
    Group the alternate names of a GeoNames alternate-names file by geoname id.

    Rows are read through ``file_utils.txt_to_dict``; column 1 is used as the
    geoname id, column 2 (upper-cased) as the language code and column 3 as
    the name.

    Returns a dict mapping each geoname id to a list of
    ``{'lang': ..., 'name': ...}`` dicts, in file order.
    """
    # Geonames also stores wikipedia links and postal codes as "languages"
    pseudo_languages = ('LINK', 'POST')

    names_by_geoname = {}
    for row in file_utils.txt_to_dict(alt_names_txt):
        geoname_id, language, name = row[1], row[2].upper(), row[3]
        # Incomplete rows carry nothing usable
        if not geoname_id or not language or not name:
            continue
        if language in pseudo_languages:
            continue
        names_by_geoname.setdefault(geoname_id, []).append(
            {'lang': language, 'name': name})
    return names_by_geoname
Exemplo n.º 4
0
def parse_cities(cities_txt):
    """
    Index the populated places of a GeoNames cities file by name and country.

    Rows are read through ``file_utils.txt_to_dict``. Only rows whose feature
    class (column 6) is ``'P'`` and that have a geoname id, a name and a
    country are kept.

    Returns a dict mapping ``(lowercase name, uppercase country)`` to
    ``(geoname_id, population)``. Each place is registered under both its
    name and its ascii name; population is kept as the raw column text.
    """
    index = {}
    for row in file_utils.txt_to_dict(cities_txt):
        geoname_id = row[0]
        name = row[1].lower()
        ascii_name = row[2].lower()
        feature_class = row[6]
        country = row[8].upper()
        population = row[14].upper()
        # Skip incomplete rows and anything that is not a city
        # (feature class P in geoNames)
        if (not geoname_id or not name or not country
                or feature_class != 'P'):
            continue
        entry = (geoname_id, population)
        # When both spellings are equal the second write hits the same key
        for spelling in (name, ascii_name):
            index[(spelling, country)] = entry
    return index
Exemplo n.º 5
0
def fill_geo_data(backend, cities_file):
    """
    Load every row of a GeoNames cities file into the backend, in batches.

    Rows are read through ``file_utils.txt_to_dict``, turned into documents
    and handed to ``backend.insert_many`` each time ``WRITE_BATCH_SIZE``
    documents have piled up; whatever remains at the end is inserted last.

    The column layout of cities_file is described at:
    http://download.geonames.org/export/dump/
    """
    def _int_or_zero(raw):
        # Empty columns are stored as 0
        return int(raw) if raw else 0

    def _translate_geonames(row):
        # Positional columns -> named fields; only these columns are kept
        return {
            'geoname_id': row[0],
            'name': row[1],
            'name_lower': unidecode(row[1].lower()),
            'asciiname': row[2],
            'alternate_names': unidecode(row[3].lower()),
            'latitude': row[4],
            'longitude': row[5],
            'feature_class': row[6],
            'country_code': row[8],
            'alt_country_code': row[9],
            'admin1_code': row[10],
            'admin2_code': row[11],
            'admin3_code': row[12],
            'admin4_code': row[13],
            'population': _int_or_zero(row[14]),
            'elevation': _int_or_zero(row[15]),
            'timezone': row[17],
        }

    batch = []
    for row in file_utils.txt_to_dict(cities_file):
        batch.append(_translate_geonames(row))
        if len(batch) < WRITE_BATCH_SIZE:
            continue
        # Batch is full: flush it and start a fresh list
        logger.info('writing %d lines into %s', WRITE_BATCH_SIZE,
                    backend.network)
        backend.insert_many(batch)
        batch = []

    # Flush the last, partially filled batch
    if batch:
        backend.insert_many(batch)
        logger.info('writing %d lines into %s', len(batch), backend.network)