Example 1
def main(top_left, bottom_left, bottom_right, top_right, config_id):

    # ----- #
    # SETUP #
    # ----- #
    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    # connect to db and read config table
    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(config_id), engine)

    raster = config["satellite_grid"][0]
    nightlights_date = config.get("nightlights_date")[0]
    base_raster = "../tmp/local_raster.tif"
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    aggregate(raster, base_raster, 1)

    # -------------------  #
    # CLIP RASTER TO SCOPE #
    geoms = [{'type': 'Polygon', 'coordinates': [[top_left, bottom_left, bottom_right, top_right, top_left]]}]

    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, geoms, crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform
                     })

    with rasterio.open(base_raster, "w", **out_meta) as dest:
        dest.write(out_image)

    # load the new clipped raster to the img_lib
    GRID = RasterGrid(base_raster)
    with rasterio.open(base_raster) as src:
        list_j, list_i = np.where(src.read()[0] != src.nodata)
    print("INFO: downloading images in scope ...")
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(config_id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')
            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline='scoring')
            # save the features to disk
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, config_id, 'scoring'), index=False)

        g_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Google", config_id, 'scoring'))
        s_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Sentinel", config_id, 'scoring'))

        data = pd.merge(g_features, s_features, on=['i', 'j', 'index'])
        data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(config_id, 'scoring'), index=False)

        print('INFO: features extracted.')

    else:
        data = pd.DataFrame({'gpsLongitude': coords_x, 'gpsLatitude': coords_y, 'j': list_j, 'i': list_i})
    # --------------- #
    # add nightlights #
    # --------------- #
    from geojson import Polygon
    from nightlights import Nightlights

    area = Polygon([[top_left, bottom_left, bottom_right, top_right, top_left]])

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date)
    data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------------- #
    # LOAD MODEL AND PREDICT #
    print("INFO: load model and predict ...")
    # drop non-feature columns; 'index' is only present when satellite features were merged
    X = data.drop(['index', 'i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1, errors='ignore')
    # load model and predict
    try:
        RmSense = joblib.load('../Models/RmSense_model_config_id_{}.pkl'.format(config_id))
        kNN = joblib.load('../Models/kNN_model_config_id_{}.pkl'.format(config_id))
    except FileNotFoundError:
        print('ERROR: model not found')
        raise

    yhat = (RmSense.predict(X) + kNN.predict(data[['i','j']])) / 2.
    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': yhat})

    outfile = "../Data/Results/scalerout_{}.tif".format(config_id)
    tifgenerator(outfile=outfile,
                 raster_path=base_raster,
                 df=results)
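
A minimal, hypothetical invocation of the entry point above; the corner coordinates and the config id are illustrative values, not taken from the source.

# Hypothetical invocation: corners are (lon, lat) pairs tracing the bounding box.
if __name__ == "__main__":
    main(top_left=(30.0, 1.5),
         bottom_left=(30.0, 0.5),
         bottom_right=(31.0, 0.5),
         top_right=(31.0, 1.5),
         config_id=1)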
Example 2
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # save the features to disk
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')

    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)
    # --------------- #
    # save features   #
    # --------------- #
    # features to be used in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Normalize features: subtract the mean, divide by the max
    # data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("INFO: normalizing by the max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    # if set in the config, take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list, spatial_features=["gpsLatitude", "gpsLongitude"], scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        Ensemble_R2_mean, Ensemble_R2_std, kNN_R2_mean, kNN_R2_std, Ridge_R2_mean, Ridge_R2_std, 0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #

    print('INFO: writing predictions to disk ...')

    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ', Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ', Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
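
The Modeller class used above is project-specific. As a rough, self-contained sketch of the same idea (cross-validated kNN and Ridge predictions averaged into an ensemble) on synthetic data with plain scikit-learn:

# Sketch only: reproduces the kNN + Ridge averaging concept, not the project's Modeller.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_predict

X, y = make_regression(n_samples=200, n_features=5, noise=10, random_state=0)
knn_hat = cross_val_predict(KNeighborsRegressor(n_neighbors=5), X, y, cv=5)
ridge_hat = cross_val_predict(Ridge(alpha=1.0), X, y, cv=5)
ensemble_hat = (knn_hat + ridge_hat) / 2.0  # simple average of the two models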
Example 3
def main(id, aggregate_factor, min_pop, minlat, maxlat, minlon, maxlon, shapefile):

    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["satellite_grid"][0]

    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    if aggregate_factor is None:
        aggregate_factor = config["base_raster_aggregation"][0]

    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    dataset_df = pd.read_csv(dataset)
    data_cols = dataset_df.columns.values

    # create geometry
    if (minlat is None) and (maxlat is None) and (minlon is None) and (maxlon is None):
        minlat, maxlat, minlon, maxlon = df_boundaries(dataset_df, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")

    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform
                     })

    final_raster = "../tmp/final_raster.tif"
    print('INFO: removing tiles with population under {}'.format(min_pop))  # only score tiles with at least min_pop people
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = RasterGrid(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    data = pd.DataFrame({"i": list_i, "j": list_j})
    data["gpsLatitude"] = coords_y
    data["gpsLongitude"] = coords_x

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'scoring'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')

            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
            # save the features to disk
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')

    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # --------------- #
    # save features   #
    # --------------- #

    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Normalize features: subtract the mean, divide by the max
    # TODO: use mean and max from training
    print("INFO: Normalizing by the max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

    # Open model
    ensemble_pipeline = joblib.load('../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data[features_list + ["gpsLatitude", "gpsLongitude"]]
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    # if take log of indicator
    if config['log'][0]:
        ensemble_predictions = np.exp(ensemble_predictions)

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': ensemble_predictions})

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"

        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)
        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
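
tifgenerator is project code; its exact behavior is not shown here. A minimal sketch of what such a helper plausibly does (burning (i, j, yhat) rows into an array aligned with the reference raster) using rasterio; the function name and signature below are illustrative:

# Sketch under assumptions: write_predictions mirrors the tifgenerator call pattern.
import numpy as np
import rasterio

def write_predictions(outfile, raster_path, df, value="yhat"):
    with rasterio.open(raster_path) as src:
        meta = src.meta.copy()
        nodata = -99.0 if src.nodata is None else src.nodata
        arr = np.full((src.height, src.width), nodata, dtype="float32")
    arr[df["j"].values, df["i"].values] = df[value].values  # j indexes rows, i columns
    meta.update(driver="GTiff", count=1, dtype="float32", nodata=nodata)
    with rasterio.open(outfile, "w", **meta) as dst:
        dst.write(arr, 1)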
Example 4
def run(id):

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config where id = {}".format(id),
                               engine)

    dataset = config["dataset_filename"][0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    indicator_log = config['indicator_log'][0]

    ## load data

    hh_data = pd.read_csv(dataset)

    GRID = RasterGrid(raster)
    list_i, list_j = GRID.get_gridcoordinates(hh_data)

    data = hh_data
    data["i"] = list_i
    data["j"] = list_j

    cluster_N = 'countbyEA'

    try:
        data = data.groupby(["i", "j"]).apply(
            lambda x: np.average(x[indicator], weights=x[cluster_N])).to_frame(
                name=indicator).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data.groupby(["i", "j"]).mean().reset_index()

    X = pd.DataFrame({"i": data["i"], "j": data["j"]})
    y = data[indicator].values

    # Log-normal distribution
    if indicator_log:
        y = np.log(y)

    # TRAIN MODEL
    outer_cv = KFold(5, shuffle=True, random_state=75788)
    inner_cv = KFold(5, shuffle=True, random_state=1673)
    print(str(np.datetime64('now')), " INFO: training model ...")

    from sklearn.neighbors import KNeighborsRegressor

    k = np.arange(20) + 1
    parameters = {'n_neighbors': k}

    model = KNeighborsRegressor(weights='distance')
    clf = GridSearchCV(estimator=model,
                       param_grid=parameters,
                       cv=inner_cv,
                       scoring=r2_pearson)

    score = cross_val_score(clf, X, y, scoring=r2_pearson, cv=outer_cv)
    score_r2 = cross_val_score(clf, X, y, scoring=r2, cv=outer_cv)
    score_MAPE = cross_val_score(clf, X, y, scoring=MAPE, cv=outer_cv)

    print('INFO: Pearson score: ', score.mean())

    clf.fit(X, y)
    print('INFO: best parameter: ', clf.best_params_)

    ##  Create list of i,j

    src = rasterio.open(raster)
    list_j, list_i = np.where(src.read()[0] != src.nodata)

    src.close()

    ## Score images

    X = pd.DataFrame({"i": list_i, "j": list_j})

    y_hat = clf.predict(X)

    outfile = "../Data/Outputs/config_id_{}_KNN.tif".format(id)

    ds = gdal.Open(raster)
    band = ds.GetRasterBand(1)
    arr = band.ReadAsArray()
    [cols, rows] = arr.shape
    arr_out = np.zeros(arr.shape) - 99
    arr_out[list_j, list_i] = y_hat
    driver = gdal.GetDriverByName("GTiff")
    outdata = driver.Create(outfile, rows, cols, 1, gdal.GDT_Float32)

    outdata.SetGeoTransform(
        ds.GetGeoTransform())  # sets same geotransform as input
    outdata.SetProjection(ds.GetProjection())  # sets same projection as input

    outdata.GetRasterBand(1).SetNoDataValue(-99)
    outdata.GetRasterBand(1).WriteArray(arr_out)

    outdata.FlushCache()  # saves to disk!!
    outdata = None
    band = None
    ds = None
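
The nested cross-validation pattern above (an inner KFold for the grid search, an outer KFold for scoring) can be reproduced on synthetic data; r2_pearson and MAPE are project-defined scorers, so the standard "r2" scorer stands in for them here:

# Self-contained sketch of the nested-CV kNN tuning shown above.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

X, y = make_regression(n_samples=300, n_features=2, noise=5, random_state=0)
inner_cv = KFold(5, shuffle=True, random_state=1673)
outer_cv = KFold(5, shuffle=True, random_state=75788)
clf = GridSearchCV(KNeighborsRegressor(weights='distance'),
                   {'n_neighbors': np.arange(20) + 1}, cv=inner_cv, scoring='r2')
print(cross_val_score(clf, X, y, scoring='r2', cv=outer_cv).mean())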
Example 5
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["aggregation"][0]

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster ...')
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # tile centroids; optionally replace the cluster coordinates by the grid-center coordinates
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # --------------------------- #
    # GROUP CLUSTERS IN SAME TILE #
    # --------------------------- #
    # TODO: refactor this grouping logic
    cluster_N = 'n'
    print("Number of clusters: {} ".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.loc[g.index, weight_series]
        return (g * w).sum() / w.sum()

    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)

    try:
        data = data.groupby(["i", "j"]).agg({indicator: fnc, 'gpsLatitude': fnc, 'gpsLongitude': fnc}).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'n', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # save the features to disk
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', axis=1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            #density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            #data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            #osm_features.append('density_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    # TODO: Use efficiently maxNDBImaxNDVImaxNDWI_sum_todf
    print('INFO: getting NDBI, NDVI, NDWI ...')

    start_date = "2017-01-01"  # TODO: Add to config, be careful no image before 2015
    end_date = "2018-01-01"
    for i in date_range(start_date, end_date, 3):
        print('INFO: getting max NDVI between dates: {}'.format(i))
        gee_ndvi_max_raster = gee_sentinel_raster(i[0], i[1], area, ind="NDVI")
        data["max_NDVI_{}_{}".format(i[0], i[1])] = data.apply(gee_raster_mean, args=(gee_ndvi_max_raster, "gpsLatitude", "gpsLongitude", "NDVI"), axis=1)

    print('INFO: getting max NDBI')
    gee_ndbi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDBI")
    data["max_NDBI"] = data.apply(gee_raster_mean, args=(gee_ndbi_max_raster, "gpsLatitude", "gpsLongitude", "NDBI"), axis=1)

    print('INFO: getting max NDWI')
    gee_ndwi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDWI")
    data["max_NDWI"] = data.apply(gee_raster_mean, args=(gee_ndwi_max_raster, "gpsLatitude", "gpsLongitude", "NDWI"), axis=1)

    # --------------- #
    # save features   #
    # --------------- #

    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j']))

    # Standardize Features (0 mean and 1 std)
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    data_features = data[features_list]

    # if take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])
    from modeller import Modeller
    md = Modeller(['kNN', 'Kriging', 'RmSense', 'Ensamble'], data_features)
    cv_loops = 20
    md.compute(data[['i', 'j']], data[indicator].values, cv_loops)

    # save model for production
    md.save_models(id)
    print(str(np.datetime64('now')), 'INFO: model saved.')

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    r2, r2_var = np.mean(md.scores['Ensamble']), np.var(md.scores['Ensamble'])
    r2_knn, r2_var_knn = np.mean(md.scores['kNN']), np.var(md.scores['kNN'])
    r2_rmsense, r2_var_rmsense = np.mean(md.scores['RmSense']), np.var(md.scores['RmSense'])
    y_duplicated = np.repeat(data[indicator], cv_loops)
    mape_rmsense = np.mean(np.abs([item for sublist in md.results['RmSense'] for item in sublist] - y_duplicated) / y_duplicated)
    if np.isinf(mape_rmsense):
        mape_rmsense = 0

    query = """
    insert into results_new (run_date, config_id, r2, r2_var, r2_knn, r2_var_knn, r2_features, r2_var_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        r2, r2_var, r2_knn, r2_var_knn, r2_rmsense, r2_var_rmsense, mape_rmsense)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    results = pd.DataFrame({
        #'yhat': [item for sublist in md.results['kNN'] for item in sublist],
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)
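
The per-tile weighted average computed by wavg above boils down to a groupby over (i, j). A tiny self-contained sketch of the same computation, with hypothetical column names:

# Sketch: cluster rows weighted by 'n' and averaged within each tile.
import pandas as pd

df = pd.DataFrame({"i": [0, 0, 1], "j": [0, 0, 2],
                   "indicator": [10.0, 20.0, 30.0], "n": [1, 3, 2]})

def weighted_mean(g):
    return (g["indicator"] * g["n"]).sum() / g["n"].sum()

tiles = df.groupby(["i", "j"]).apply(weighted_mean).rename("indicator").reset_index()
print(tiles)  # tile (0, 0) -> 17.5, tile (1, 2) -> 30.0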
Example 6
def downscale(config, request):

    country = request.form['country']
    algorithm = request.form['algorithm']

    # country raster --------------------------------------
    # use the country-to-raster app to generate new ones: https://countrytoraster.herokuapp.com/
    raster = '{}_0.01_4326_1.tif'.format(country)
    local_raster = 'temp/' + raster
    print('-> getting raster ', raster)
    # download from AWS S3
    import boto3
    bucket_name = config['rasters_bucket']
    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(raster, local_raster)
    print('-> raster loaded.')

    # load dataset from input -------------------------------
    print('-> loading dataset from input form...')
    data = pd.read_csv(request.files['file'])

    # load relative raster
    print('-> loading raster ', local_raster)
    GRID = RasterGrid(local_raster)
    try:
        data['i'], data['j'] = GRID.get_gridcoordinates(data)
    except IndexError:
        print('ERROR: raster and data are not from the same country!')
        raise
    # ------------------------------------

    # Grouping clusters that belong to the same tile.
    cluster_N = 'countbyEA'
    print("Number of clusters: {} ".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.loc[g.index, weight_series]
        return (g * w).sum() / w.sum()

    import functools
    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)

    try:
        data = data.groupby(["i", "j"]).agg({
            'Indicator': fnc,
            'gpsLatitude': fnc,
            'gpsLongitude': fnc
        }).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'gpsLatitude', 'gpsLongitude',
                     'Indicator']].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {} ".format(len(data)))

    # train model ------------------------------------
    X = pd.DataFrame({"i": data["i"], "j": data["j"]})
    y = data.Indicator.values

    from model import IndicatorScaler

    model = IndicatorScaler(algorithm, X, y)

    # all country predictions ------------
    print('-> loading all grid points in the country')
    import rasterio
    src = rasterio.open(local_raster)
    list_j, list_i = np.where(src.read()[0] > 0)
    src.close()

    # also add the gps coordinates to the data for later use
    coords_x, coords_y = GRID.get_gpscoordinates(list_i, list_j)
    res = pd.DataFrame({
        "i": list_i,
        "j": list_j,
        "gpsLongitude": coords_x,
        "gpsLatitude": coords_y
    })

    # ------------------------------------

    # filter on built areas -------------
    # use the WorldPop layer to filter on inhabited locations.
    pop_raster = '{}_worldpop.tif'.format(country)
    local_pop_raster = 'temp/' + pop_raster
    print('-> getting population from WorldPop ({})'.format(local_pop_raster))
    if not os.path.exists(local_pop_raster):
        s3.Bucket(bucket_name).download_file(pop_raster, local_pop_raster)

    from img_utils import getRastervalue
    res = getRastervalue(res, local_pop_raster)
    # ------------------------------------

    # predictions for all data left -------
    print('-> running predictions...')
    res['yhat'] = model.model.predict(res[['i', 'j']])
    # ------------------------------------

    # saves to disk ---------------------
    # write the predictions into a GeoTIFF aligned with the source raster
    from exporter import tifgenerator
    outfile = "temp/scalerout_{}_{}.tif".format(country, algorithm)
    tifgenerator(outfile=outfile, raster_path=local_raster, df=res)
    # -------------------------------------

    print('-> return file to client.')
    return send_file('../' + outfile,
                     mimetype='image/tiff',
                     as_attachment=True,
                     attachment_filename=country + "_" + algorithm + ".tif")
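
A hypothetical client call for this endpoint; the URL, route and file name are assumptions inferred from the request.form and request.files usage above:

# Sketch: post a survey CSV and save the returned GeoTIFF.
import requests

with open("survey.csv", "rb") as f:
    r = requests.post("http://localhost:5000/downscale",
                      data={"country": "Senegal", "algorithm": "kNN"},
                      files={"file": f})
with open("Senegal_kNN.tif", "wb") as out:
    out.write(r.content)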
Example 7
def run(id):

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config where id = {}".format(id),
                               engine)

    dataset = config["dataset_filename"][0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    step = config["satellite_step"][0]
    provider = config["satellite_source"][0]
    start_date = config["sentinel_start"][0]
    end_date = config["sentinel_end"][0]
    land_use_raster = config["land_use_raster"][0]
    network_model = config['network_model'][0]
    custom_weights = config['custom_weights'][0]
    indicator_log = config['indicator_log'][0]
    model_pca = config['model_pca'][0]
    output = config['output'][0]
    model_grid_parameters = config['model_grid_parameters'][0]

    ## 1. Rasterize Country Shapefile (one-off step, run manually with the gdal_rasterize command below)

    country_shp = "../Data/Geofiles/Shapefiles/ADM0/sen_admbnda_adm0_1m_gov_ocha_04082017/sen_admbnda_adm0_1m_gov_ocha_04082017.shp"
    cell_size = 0.05
    no_data = -99
    raster_out = "../Data/Geofiles/Rasters/Senegal_raster_nodata_lowres.tif"  # target of the manual rasterization step

    #gdal_rasterize -a_nodata -99 -burn 1 -tr 0.05 0.05 -l sen_admbnda_adm0_1m_gov_ocha_04082017 "/Users/pasquierjb/Google Drive/WFP_Shared/Projects/HRM/Data/Shapefiles/ADM0/sen_admbnda_adm0_1m_gov_ocha_04082017/sen_admbnda_adm0_1m_gov_ocha_04082017.shp" /Users/pasquierjb/Desktop/test6.tif

    ## 2. Create list of i,j

    #raster="../Data/Geofiles/Rasters/Senegal_raster_nodata.tif"
    src = rasterio.open(raster)
    list_j, list_i = np.where(src.read()[0] != src.nodata)

    src.close()

    ## 3. Download images

    GRID = RasterGrid(raster)

    data = pd.DataFrame({"i": list_i, "j": list_j})

    for sat in provider.split(","):
        data = download_score_merge(id,
                                    data,
                                    GRID,
                                    list_i,
                                    list_j,
                                    raster,
                                    step,
                                    sat,
                                    start_date,
                                    end_date,
                                    network_model,
                                    custom_weights,
                                    pipeline="prediction")

    data.to_csv(
        "../Data/Features/features_all_id_{}_prediction.csv".format(id),
        index=False)

    # sort for a deterministic column order matching the one used at training time
    X = data[sorted(set(data.columns) - {'index', 'index_x', 'index_y', 'i', 'j'})]
    # X = data.drop(['index', 'index_x', 'index_y', 'i', 'j'], axis=1)
    clf = joblib.load('../Models/ridge_model_config_id_{}.pkl'.format(id))
    y_hat = clf.predict(X)

    outfile = "../Data/Outputs/{}.tif".format(id)

    ds = gdal.Open(raster)
    band = ds.GetRasterBand(1)
    arr = band.ReadAsArray()
    [cols, rows] = arr.shape
    arr_out = np.zeros(arr.shape) - 99
    arr_out[list_j, list_i] = y_hat
    driver = gdal.GetDriverByName("GTiff")
    outdata = driver.Create(outfile, rows, cols, 1, gdal.GDT_Float32)

    outdata.SetGeoTransform(
        ds.GetGeoTransform())  # sets same geotransform as input
    outdata.SetProjection(ds.GetProjection())  # sets same projection as input

    outdata.GetRasterBand(1).SetNoDataValue(-99)
    outdata.GetRasterBand(1).WriteArray(arr_out)

    outdata.FlushCache()  # saves to disk!!
    outdata = None
    band = None
    ds = None
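
The ridge model loaded above was persisted with joblib in an earlier training run. A minimal, self-contained sketch of that dump/load round trip; note that recent scikit-learn versions ship joblib as a separate package rather than sklearn.externals, and the file path and data here are illustrative:

# Sketch: train, persist, reload, predict.
import numpy as np
import joblib
from sklearn.linear_model import Ridge

X = np.random.RandomState(0).rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5])
joblib.dump(Ridge().fit(X, y), "/tmp/ridge_demo.pkl")
clf = joblib.load("/tmp/ridge_demo.pkl")
print(clf.predict(X[:5]))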
Example 8
feature_columns = list(set(features.columns.values) - set(non_feature_columns))
feature_matrix = features[feature_columns]
feature_matrix = feature_matrix.reindex(sorted(feature_matrix.columns), axis=1)

# ----------------------------------------------------------------------
# apply PCA
from sklearn.decomposition import PCA
pc = PCA(n_components=2)
x = pc.fit_transform(feature_matrix)
dfx = pd.DataFrame(x, columns=['x', 'y'])

# ----------------------------------------------------------------------
# get raster and retrieve relevant raster coordinates
GRID = RasterGrid(
    raster='../Data/Satellite/F182013.v4c_web.stable_lights.avg_vis.tif',
    image_dir='../Data/Satellite/Google')
dfx['i'], dfx['j'] = features['i'], features['j']

# convert tile indices to GPS coordinates
dfx['lon'], dfx['lat'] = GRID.get_gpscoordinates(dfx['i'], dfx['j'], step=0)
dfx['lonlat'] = dfx[['lon',
                     'lat']].round(4).astype(str).apply(lambda x: ','.join(x),
                                                        axis=1)

# ----------------------------------------------------------------------
# add indicators score
hh_data = pd.read_csv(
    "../Data/datasets/VAM_ENSA_Nigeria_national_2017_indiv_reduced.csv")[[
        'FCS', 'i', 'j'
    ]]
preprocess_input = keras.applications.resnet50.preprocess_input

dataset = "../Data/datasets/WFP_ENSAN_Senegal_2013_individual.csv"
indicator = "FCS"
raster = "../Data/Geofiles/Rasters/Senegal_0005_4326_1.tif"
step = 0
provider = "Google"
start_date = None
end_date = None

data = pd.read_csv(dataset)
data = data.loc[data[indicator] > 0]
data = data.sample(frac=1,
                   random_state=1783).reset_index(drop=True)  # shuffle data

GRID = RasterGrid(raster)
list_i, list_j = GRID.get_gridcoordinates(data)

data["i"] = list_i
data["j"] = list_j

print("Number of survey records: {} ".format(len(data)))

# Aggregate survey points at the grid level
data = data[['i', 'j', 'gpsLatitude', 'gpsLongitude',
             indicator]].groupby(["i", "j"]).mean()

print("Number of unique tiles: {} ".format(len(data)))

print(data.head())
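
A self-contained sketch of the 2-component PCA projection used at the top of this example, on random features instead of the project's CNN feature matrix:

# Sketch: project an n x 8 feature matrix onto two principal components.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

features = pd.DataFrame(np.random.RandomState(0).rand(100, 8))
xy = PCA(n_components=2).fit_transform(features)
dfx = pd.DataFrame(xy, columns=['x', 'y'])
print(dfx.head())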