def _get_region(
        french_regions_tsv: str, prefix_tsv: str, region_id: str,
        default: Dict[str, str]) -> Dict[str, str]:
    """Look up a région by its ID, loading the régions table on first use.

    The table is read through cleaned_data.french_regions the first time this
    function is called and stored in the module-level _REGIONS list. Later calls
    reuse that stored table, whatever file paths they are given.

    Args:
        french_regions_tsv: path passed as `filename` to
            cleaned_data.french_regions.
        prefix_tsv: path passed as `prefix_filename` to
            cleaned_data.french_regions.
        region_id: ID of the région to look up.
        default: value returned when the ID is not in the table.

    Returns:
        The table's entry for this région ID, or `default` if there is none.
    """

    if _REGIONS:
        regions_by_id = _REGIONS[0]
    else:
        regions_frame = cleaned_data.french_regions(
            filename=french_regions_tsv, prefix_filename=prefix_tsv)
        regions_by_id = regions_frame.to_dict(orient='index')
        _REGIONS.append(regions_by_id)
    return regions_by_id.get(region_id, default)
def make_dicts(french_regions_tsv, prefix_tsv):
    """Prepare régions info for import in MongoDB.

    Args:
        french_regions_tsv: path to a TSV file containing the main information
            about régions from INSEE.
        prefix_tsv: path to a TSV file containing the prefix for each région.

    Returns:
        A list of dicts, one per région, each holding the keys "_id" (the value
        of the régions table index), "name" and "prefix".
    """

    regions = cleaned_data.french_regions(
        filename=french_regions_tsv, prefix_filename=prefix_tsv)
    # Expose the index as a regular column so that it ends up in every record.
    regions['_id'] = regions.index
    exported_columns = ['_id', 'name', 'prefix']
    return regions[exported_columns].to_dict('records')
def prepare_cities(data_folder='data', stats_filename=None, urban_entities_filename=None,
                   transport_scores_filename=None):
    """Prepare cities for upload to Algolia.

    Args:
        data_folder: the root of the data folder.
        stats_filename: path to a file containing more stats about cities. It is
            read as a header-less CSV of which columns 8, 10 and 14 are used as
            zip code, city ID and population.
        urban_entities_filename: path to an excel file containing the
            description about French urban entities.
        transport_scores_filename: path to an html file containing the scores
            for public transportation in some French cities.

    Returns:
        A list of dict JSON-like objects each containing properties of a French
        city. The optional keys "zipCode", "population", "urban" and "transport"
        are only present when the corresponding file was given.
    """

    cities = cleaned_data.french_cities(data_folder)
    useful_columns = [
        'objectID', 'cityId', 'name', 'departementId', 'departementName',
        'departementPrefix', 'regionId', 'regionName',
    ]

    # Keep only cities that are still cities on 2016-01-01 and arrondissements.
    # Copy the selection so that the columns added below are set on a frame of
    # our own rather than on a slice of the source table.
    cities = cities[cities.current | cities.arrondissement].copy()

    # Set city ID on objectID as this is what Algolia uses.
    cities['objectID'] = cities.index

    # Get département's names.
    cities['departementId'] = cities.departement_id
    departements = cleaned_data.french_departements(data_folder)
    cities['departementName'] = cities.departement_id.map(departements.name)
    cities['departementPrefix'] = cities.departement_id.map(departements.prefix)

    # Get région's names.
    cities['regionId'] = cities.region_id
    cities['regionName'] = cities.region_id.map(
        cleaned_data.french_regions(data_folder).name)

    if stats_filename:
        city_stats = pandas.read_csv(
            stats_filename,
            sep=',', header=None, usecols=[8, 10, 14],
            names=['zipCode', 'city_id', 'population'],
            dtype={'zipCode': str, 'city_id': str, 'population': int})
        cities = cities.join(city_stats.set_index('city_id'))
        # Assign the filled columns back: an in-place fillna on a column
        # accessed as an attribute is not guaranteed to update the frame.
        cities['zipCode'] = cities.zipCode.fillna('')
        cities['population'] = cities.population.fillna(0)
        useful_columns.extend(['zipCode', 'population'])

    cities['cityId'] = cities['objectID']
    # Treat arrondissements specifically: remove the full cities and use the
    # full city ID for the arrondissements.
    arrondissement_prefix_to_city = (
        ('132', '13055'),
        ('751', '75056'),
        ('6938', '69123'),
    )
    for arrondissement_prefix, full_city_id in arrondissement_prefix_to_city:
        cities.loc[cities.objectID.str.startswith(arrondissement_prefix), 'cityId'] = \
            full_city_id
    cities.drop(['13055', '75056', '69123'], errors='ignore', inplace=True)

    # The urban score is 0 for rural, 1 for cities in urban areas between
    # 2k and 5k inhabitants, 2 for urban areas below 10k, 3 for 20k, 4 for
    # 50k, 5 for 100k, 6 for 200k, 7 for 2M, and 8 for Paris urban area.
    if urban_entities_filename:
        urban = cleaned_data.french_urban_entities(filename=urban_entities_filename)
        # astype returns a new Series (it has no in-place mode), so the
        # converted values must be stored back in the column.
        cities['urban'] = cities.cityId.map(urban.urban).fillna(0).astype(int)
        useful_columns.append('urban')

    if transport_scores_filename:
        transport = cleaned_data.transport_scores(filename=transport_scores_filename)
        cities['transport'] = cities.objectID.map(transport).fillna(0)
        useful_columns.append('transport')

    return cities[useful_columns].to_dict(orient='records')
def prepare_cities(
        data_folder: str = 'data',
        stats_filename: Optional[str] = None,
        urban_entities_filename: Optional[str] = None,
        transport_scores_filename: Optional[str] = None) \
        -> list[dict[str, Any]]:
    """Prepare cities for upload to Algolia.

    Args:
        data_folder: the root of the data folder.
        stats_filename: path to a file containing more stats about cities. It is
            read as a header-less CSV of which columns 8, 10, 14, 19 and 20 are
            used as zip code, city ID, population, longitude and latitude.
        urban_entities_filename: path to an excel file containing the
            description about French urban entities.
        transport_scores_filename: path to an html file containing the scores
            for public transportation in some French cities.

    Returns:
        A list of dict JSON-like objects each containing properties of a French
        city, ordered by city index. The optional keys "zipCode", "population",
        "latitude", "longitude", "urban" and "transport" are only present when
        the corresponding file was given.
    """

    cities = cleaned_data.french_cities(data_folder)
    useful_columns = [
        'objectID', 'cityId', 'name', 'departementId', 'departementName',
        'departementPrefix', 'regionId', 'regionName']

    # Keep only cities that are still cities on 2016-01-01. Copy the selection
    # so that the columns added below are set on a frame of our own rather than
    # on a slice of the source table.
    cities = cities[cities.current].copy()

    # Set city ID on objectID as this is what Algolia uses.
    cities['objectID'] = cities.index

    # Get département's names.
    cities['departementId'] = cities.departement_id
    departements = cleaned_data.french_departements(data_folder)
    cities['departementName'] = cities.departement_id.map(departements.name)
    cities['departementPrefix'] = cities.departement_id.map(departements.prefix)

    # Get région's names.
    cities['regionId'] = cities.region_id
    cities['regionName'] = cities.region_id.map(
        cleaned_data.french_regions(data_folder).name)

    if stats_filename:
        city_stats = pandas.read_csv(
            stats_filename,
            sep=',', header=None, usecols=[8, 10, 14, 19, 20],
            names=['zipCode', 'city_id', 'population', 'longitude', 'latitude'],
            dtype={
                'zipCode': str,
                'city_id': str,
                'population': int,
                'latitude': float,
                'longitude': float,
            })
        cities = cities.join(city_stats.set_index('city_id'))
        # Assign the filled columns back: an in-place fillna on a column
        # accessed as an attribute is not guaranteed to update the frame.
        missing_stat_values = {'zipCode': '', 'population': 0, 'latitude': 0, 'longitude': 0}
        for stat_column, missing_value in missing_stat_values.items():
            cities[stat_column] = cities[stat_column].fillna(missing_value)
        useful_columns.extend(missing_stat_values)

    # cityId is the ID used throughout the app.
    cities['cityId'] = cities['objectID']

    # The urban score is 0 for rural, 1 for cities in urban areas between
    # 2k and 5k inhabitants, 2 for urban areas below 10k, 3 for 20k, 4 for
    # 50k, 5 for 100k, 6 for 200k, 7 for 2M, and 8 for Paris urban area.
    if urban_entities_filename:
        urban = cleaned_data.french_urban_entities(filename=urban_entities_filename)
        # astype returns a new Series, so the converted values must be stored
        # back in the column for the cast to have any effect.
        cities['urban'] = cities.cityId.map(urban.urban).fillna(0).astype(int)
        useful_columns.append('urban')

    if transport_scores_filename:
        transport = cleaned_data.transport_scores(filename=transport_scores_filename)
        cities['transport'] = cities.objectID.map(transport).fillna(0)
        useful_columns.append('transport')

    return typing.cast(
        list[dict[str, Any]],
        cities.sort_index()[useful_columns].to_dict(orient='records'))