def batch_geocode_gouv(df, l_cols=None, chunk_size=20000):
    """
    Geocode addresses in bulk through the French government open-data API.

    Per the original author's notes: free, no request limit and faster than
    the row-by-row geocoder, but only usable for French addresses.

    :param df: dataframe holding the address columns. If it does not already
        contain the ``ADRESS_COL_NAME`` column, that column is built with
        ``add_adress`` from ``l_cols``.
    :param l_cols: ordered list of columns defining the address. Defaults to
        ``['num_niv_type_voie', 'cd_postal', 'nom_ville']``.
    :param chunk_size: maximum number of rows sent per API request; frames
        with at least this many rows are split into consecutive chunks.
    :return: dataframe restricted to the columns ``df`` had on entry plus two
        new columns, ``lat`` and ``lng``. The address column added here (if
        any) is not part of the result.
    :raises ValueError: if ``chunk_size`` is not strictly positive.
    """
    if l_cols is None:
        # Built per call so the default list is never shared between calls.
        l_cols = ['num_niv_type_voie', 'cd_postal', 'nom_ville']
    if chunk_size <= 0:
        raise ValueError(
            "chunk_size must be a positive integer, got {}".format(chunk_size))

    # Captured before the address column is added so it is filtered out below.
    original_cols = list(df)
    if ADRESS_COL_NAME not in df.columns:
        df = add_adress(df=df, l_cols=l_cols)

    n_rows = df.shape[0]
    if n_rows >= chunk_size:
        # One request per slice of at most `chunk_size` rows, in row order.
        chunk_results = [
            apigouv_prepocess_request(df=df.iloc[start:start + chunk_size])
            for start in range(0, n_rows, chunk_size)
        ]
        df_api_res = pd.concat(chunk_results, ignore_index=True)
    else:
        df_api_res = apigouv_prepocess_request(df=df)

    coord_cols = ['latitude', 'longitude']
    # NOTE(review): this assignment aligns on index labels. The chunked path
    # relabels the API result 0..n-1 (ignore_index=True), so an input frame
    # without a default RangeIndex would presumably receive misaligned or
    # missing coordinates -- confirm against apigouv_prepocess_request.
    # NOTE(review): when the address column is already present, this writes
    # the two columns into the caller's dataframe object as well.
    df[coord_cols] = df_api_res[coord_cols]

    df = df.filter(items=original_cols + coord_cols)
    return df.rename(columns={"latitude": "lat", "longitude": "lng"})
def get_local_adress_dataset(N=100,
                             path="gpm/data/address-01-sample-2000.csv",
                             random_state=None):
    """
    Load a random sample of the local reference address dataset.

    Reads the CSV at ``path``, zero-pads ``code_postal`` to 5 characters,
    adds the full-address column with ``add_adress`` and renames the
    ``lat`` / ``lng`` columns to ``true_lat`` / ``true_lng``.

    :param N: number of rows to sample (without replacement, so it must not
        exceed the number of rows in the file).
    :param path: location of the CSV file; the default is relative to the
        current working directory.
    :param random_state: optional seed forwarded to ``DataFrame.sample`` to
        make the sample reproducible; ``None`` keeps it random.
    :return: sampled dataframe with a fresh 0..N-1 index.
    """
    df = pd.read_csv(path)

    # Normalise postal codes to 5-character strings (left-padded with zeros).
    df['code_postal'] = df.code_postal.apply(str).str.zfill(5)

    address_cols = ['numero', 'nom_voie', 'nom_commune', 'code_postal']
    df = add_adress(df=df, l_cols=address_cols)

    # Rename the file's own coordinates to true_lat / true_lng.
    df = df.rename(columns={"lat": "true_lat", "lng": "true_lng"})
    return df.sample(N, random_state=random_state).reset_index(drop=True)
def get_iris_batch(csv_path=data_path,
                   sep=',',
                   save=False,
                   df_iris=None,
                   l_cols=None,
                   N=None):
    """
    Read a csv of addresses, geocode it and attach the IRIS code of each row.

    :param csv_path: path of the input csv file.
    :param sep: field delimiter of the input csv.
    :param save: if True, also write the result next to the input file as
        ``<input name>_iris.csv``.
    :param df_iris: IRIS GeoDataFrame (e.g. from load_iris_local) to reuse;
        when it is not a GeoDataFrame the IRIS data is loaded with
        ``load_iris_url`` (the original author notes this takes ~30 seconds).
    :param l_cols: ordered list of columns defining the address. Defaults to
        ``['num_niv_type_voie', 'cd_postal', 'nom_ville']``.
    :param N: number of rows to read from the csv; a falsy value (``None`` or
        ``0``) reads the whole file.
    :return: the input columns plus ``code_iris`` (presumably provided by the
        IRIS dataset through the spatial join -- confirm). Rows that could
        not be geocoded are kept, without a value from the join.
    """
    import os  # function-scope import: the file's import block is not in view

    if l_cols is None:
        # Built per call so the default list is never shared between calls.
        l_cols = ['num_niv_type_voie', 'cd_postal', 'nom_ville']

    # `N or None` keeps the original rule that a falsy N reads everything.
    df = pd.read_csv(csv_path, sep=sep, nrows=N or None)
    cols = list(df)

    df = add_adress(df=df, l_cols=l_cols)
    df = preprocess(df, to_geopandas=True, geocode=True, l_cols=l_cols)

    if isinstance(df_iris, geopandas.GeoDataFrame):
        places_iris = df_iris
    else:
        # places_iris = load_iris_local()
        places_iris = load_iris_url()

    not_geocoded = df.lat.isnull()
    bad_geocod = df[not_geocoded].full_address.values
    if len(bad_geocod) > 0:
        print(
            "##WARNING## \n {} adresses were not able to be geocoded : \n {}".
            format(len(bad_geocod), bad_geocod))

    # Only rows with coordinates take part in the spatial join; the others
    # are added back afterwards so no input row is lost.
    located = geopandas.tools.sjoin(df[~not_geocoded], places_iris,
                                    how="left")
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    result = pd.concat([located, df[not_geocoded]], sort=True)
    result = result.sort_index()

    cols.append('code_iris')
    result = result[cols]

    if save:
        # splitext strips only the final extension, so dots elsewhere in the
        # path (e.g. "./data/x.csv") no longer truncate the output name.
        root, _ext = os.path.splitext(csv_path)
        output_name = root + '_iris.csv'
        # Save the frame that carries the IRIS codes, i.e. what is returned.
        result.to_csv(output_name, index=False)
        print(
            colored(
                "output file saved with IRIS codes: \n {}".format(output_name),
                "blue"))
    return result
# NOTE(review): everything down to `return df` is the tail of a function whose
# `def` line is outside this view (presumably `preprocess`, given the call in
# the __main__ block below). The nesting was reconstructed from a
# whitespace-mangled line -- confirm against the full file.
    if geocode:
        if batch:
            # Bulk path: geocode the whole frame via batch_geocode_gouv.
            df = batch_geocode_gouv(df=df, l_cols=l_cols)
        else:
            kind = "here"
            print("Geocoding using {} API in process".format(kind))
            tic = time.time()
            # apply function one by one
            geocod = geo_coder(offline=False, kind=kind)
            # One geocoder call per row; the first element of the value
            # returned by run() is stored (presumably a "lat,lng" string,
            # since it is split on ',' below -- confirm).
            df['latlng'] = df.apply(lambda x: geocod.run(x[ADRESS_COL_NAME])[0], axis=1)
            t = round(time.time() - tic, 2)
            print("in {} seconds \n".format(t))
            # Split at the first comma into two columns, then cast to float.
            df['lat'], df['lng'] = df['latlng'].str.split(',', 1).str
            df['lat'], df['lng'] = df['lat'].apply(float), df['lng'].apply(float)
    if to_geopandas:
        # Build one point per row (x = lng, y = lat) and wrap the frame in a
        # GeoDataFrame whose CRS is set to EPSG:4326.
        df['geometry'] = df.apply(lambda row: Point(row['lng'], row['lat']), axis=1)
        df = geopandas.GeoDataFrame(df, geometry="geometry")
        df.crs = {"init": "epsg:4326"}
    # cols2keep = ['full_address', 'lat', 'lng', 'geometry']
    return df


# Script entry point: reads the first N rows of a ';'-separated csv, builds the
# address column and runs preprocess with batch geocoding and geopandas output.
if __name__ == '__main__':
    # NOTE(review): absolute path on the original author's machine.
    data_path = "/Users/jeanbizot/Documents/projets/GROUPAMA/gpm/gpm/data/data2_code.csv"
    adress_cols = ['num_niv_type_voie', 'Code postal', 'nom_ville']
    N = 110
    df = pd.read_csv(data_path, sep=";", nrows=N)
    # NOTE(review): `cols` is not used again within the visible code.
    cols = list(df)
    df = add_adress(df=df, l_cols=adress_cols)
    df = preprocess(df, to_geopandas=True, geocode=True, batch=True, l_cols=adress_cols)