Пример #1
0
def batch_geocode_gouv(df, l_cols=None):
    """
    Geocode addresses in bulk through the French government open-data API.

    Advantages: free, no limit, and faster.
    Disadvantages: only works for French addresses.

    This function itself adds no column to the frame it receives; the
    coordinates are attached to a new frame.

    :param df: frame holding the address; when it has no ``ADRESS_COL_NAME``
        column, one is built from ``l_cols`` through ``add_adress``
    :param l_cols: ordered list of columns defining the address, defaults to
        ``['num_niv_type_voie', 'cd_postal', 'nom_ville']``
    :return: frame with the columns of the input plus ``lat`` and ``lng``
    """
    if l_cols is None:
        l_cols = ['num_niv_type_voie', 'cd_postal', 'nom_ville']

    original_cols = list(df.columns)
    if ADRESS_COL_NAME not in df.columns:
        df = add_adress(df=df, l_cols=l_cols)

    chunk_size = 20000  # max number of rows sent per API request
    n_rows = df.shape[0]
    if n_rows >= chunk_size:
        chunks = (df.iloc[start:start + chunk_size]
                  for start in range(0, n_rows, chunk_size))
        df_api_res = pd.concat(
            [apigouv_prepocess_request(df=chunk) for chunk in chunks],
            ignore_index=True)
    else:
        df_api_res = apigouv_prepocess_request(df=df)

    latitude = df_api_res['latitude']
    longitude = df_api_res['longitude']
    if len(df_api_res) == n_rows:
        # One result row per input row: attach by position.  Aligning on
        # index labels breaks as soon as `df` does not carry a default
        # 0..n-1 index, because the chunked path above rebuilds the result
        # index from scratch.
        # NOTE(review): assumes apigouv_prepocess_request keeps the row
        # order of its input -- confirm against the helper.
        latitude, longitude = latitude.values, longitude.values
    # Otherwise keep the historical label-aligned assignment (rows without a
    # matching label end up as NaN).

    # `assign` builds a new frame, so the caller's frame never receives the
    # coordinate columns as a side effect.
    return df.filter(items=original_cols).assign(lat=latitude, lng=longitude)
Пример #2
0
def get_local_adress_dataset(N=100):
    """
    Get a random sample of the local address dataset.

    Reads ``gpm/data/address-01-sample-2000.csv`` (path relative to the
    current working directory), left-pads ``code_postal`` with zeros to
    5 characters, adds the full-address column through ``add_adress`` and
    renames the reference coordinates ``lat``/``lng`` to
    ``true_lat``/``true_lng``.

    :param N: number of rows to draw; pandas samples without replacement, so
        it raises ``ValueError`` when N exceeds the number of rows in the file
    :return: DataFrame of N sampled rows with a fresh 0..N-1 index
    """
    path = "gpm/data/address-01-sample-2000.csv"
    df = pd.read_csv(path)
    # Postal codes are parsed as numbers by read_csv, which drops leading
    # zeros; restore the 5-character form.
    df['code_postal'] = df['code_postal'].astype(str).str.zfill(5)
    df = add_adress(
        df=df,
        l_cols=['numero', 'nom_voie', 'nom_commune', 'code_postal'])
    df = df.rename(columns={"lat": "true_lat", "lng": "true_lng"})
    return df.sample(N).reset_index(drop=True)
Пример #3
0
def get_iris_batch(csv_path=data_path,
                   sep=',',
                   save=False,
                   df_iris=None,
                   l_cols=None,
                   N=None):
    """
    Read a csv with a predefined set of address columns and add the IRIS code.

    Rows are geocoded through ``preprocess`` and spatially joined with the
    IRIS polygons.  Rows that could not be geocoded are reported on stdout
    and kept in the output (they are not part of the spatial join).

    :param csv_path: path of the input csv
    :param sep: field separator of the input csv
    :param save: when True, also write the returned frame next to the input
        as ``<csv_path without extension>_iris.csv``
    :param df_iris: iris GeoDataFrame from load_iris_local function (saves
        time); when it is not a GeoDataFrame the data is fetched with
        ``load_iris_url``
    :param l_cols: ordered list of columns defining the address, defaults to
        ``['num_niv_type_voie', 'cd_postal', 'nom_ville']``
    :param N: number of rows to read from the csv; falsy means all rows
    :return: frame with the csv columns plus ``code_iris``
    """
    import os  # local import: only needed to build the output file name

    if l_cols is None:
        l_cols = ['num_niv_type_voie', 'cd_postal', 'nom_ville']

    df = pd.read_csv(csv_path, sep=sep, nrows=N if N else None)
    cols = list(df)
    df = add_adress(df=df, l_cols=l_cols)
    df = preprocess(df, to_geopandas=True, geocode=True, l_cols=l_cols)

    if isinstance(df_iris, geopandas.geodataframe.GeoDataFrame):
        places_iris = df_iris
    else:
        # places_iris = load_iris_local()  # 30 seconds to load
        places_iris = load_iris_url()  # 30 seconds to load

    not_geocoded = df.lat.isnull()
    bad_geocod = df[not_geocoded].full_address.values
    if len(bad_geocod) > 0:
        print(
            "##WARNING## \n {} adresses were not able to be geocoded : \n {}".
            format(len(bad_geocod), bad_geocod))

    # Only geocoded rows can be joined; the others are added back afterwards
    # and the original row order is restored through the index.
    result = geopandas.tools.sjoin(df[~not_geocoded],
                                   places_iris,
                                   how="left")
    # pd.concat replaces DataFrame.append, which pandas 2.x no longer has.
    result = pd.concat([result, df[not_geocoded]], sort=True).sort_index()

    cols.append('code_iris')
    result = result[cols]

    if save:
        # splitext only strips the extension, so dots elsewhere in the path
        # (e.g. "./data/in.csv") do not truncate the output name.
        output_name = os.path.splitext(csv_path)[0] + '_iris.csv'
        # Save the joined frame: the pre-join frame has no IRIS code.
        result.to_csv(output_name, index=False)
        print(
            colored(
                "output file saved with IRIS codes: \n {}".format(output_name),
                "blue"))
    return result
Пример #4
0
    # NOTE(review): tail of a function whose `def` line is not part of this
    # excerpt; `geocode`, `batch`, `to_geopandas`, `l_cols` and `df` are
    # presumably its parameters -- confirm against the full definition.

    # Step 1 (optional): geocode the addresses into `lat` / `lng` columns.
    if geocode:
        if batch:
            # Bulk geocoding in a single helper call.
            df = batch_geocode_gouv(df=df, l_cols=l_cols)
        else:
            kind = "here"
            print("Geocoding using {} API in process".format(kind))
            tic = time.time()
            # apply function one by one
            geocod = geo_coder(offline=False, kind=kind)
            # Keep only the first element of what `run` returns for each
            # address; the split below treats it as a "lat,lng" string.
            df['latlng'] = df.apply(lambda x: geocod.run(x[ADRESS_COL_NAME])[0], axis=1)
            t = round(time.time() - tic, 2)
            print("in {} seconds \n".format(t))
            # Split on the first comma only, then unpack the two parts.
            # NOTE(review): unpacking the `.str` accessor and passing `n`
            # positionally are not accepted by recent pandas releases --
            # confirm against the pinned pandas version.
            df['lat'], df['lng'] = df['latlng'].str.split(',', 1).str
            df['lat'], df['lng'] = df['lat'].apply(float), df['lng'].apply(float)
    # Step 2 (optional): turn the frame into a GeoDataFrame of points.
    if to_geopandas:
        # Point takes (x, y), i.e. longitude first, then latitude.
        df['geometry'] = df.apply(lambda row: Point(row['lng'], row['lat']), axis=1)
        df = geopandas.GeoDataFrame(df, geometry="geometry")
        # Tag the coordinates as EPSG:4326.
        df.crs = {"init": "epsg:4326"}
    # cols2keep = ['full_address', 'lat', 'lng', 'geometry']
    return df


# Manual run: batch-geocode the first N rows of a local csv file.
if __name__ == '__main__':
    # NOTE(review): absolute path under a developer's home directory; the
    # run only works where this file exists.
    data_path = "/Users/jeanbizot/Documents/projets/GROUPAMA/gpm/gpm/data/data2_code.csv"
    # Ordered columns that make up the address in this file.
    adress_cols = ['num_niv_type_voie', 'Code postal', 'nom_ville']
    N = 110  # number of rows read from the csv
    df = pd.read_csv(data_path, sep=";", nrows=N)
    # Column names of the raw file (not used again in the lines shown here).
    cols = list(df)
    df = add_adress(df=df, l_cols=adress_cols)
    df = preprocess(df, to_geopandas=True, geocode=True, batch=True, l_cols=adress_cols)