def expression_summary(dbs, confs):
    """Return the expression summary."""
    chart = {}
    stats, failed = aggregate(dbs, confs['configurations'],
                              _expression_summary, lambda x, y: x + y)
    average_by = len(confs['configurations']) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one %s' % confs['resolution']['id']
    else:
        label = 'Average over %s %ss' % (average_by, confs['resolution']['id'])
    description = [(label, 'string'),
                   ('Total', 'number'),
                   ('Detected', 'number'),
                   ('Percent', 'number'), ]
    chart['table_description'] = description
    chart['table_data'] = _percentage_expression_summary(stats, average_by)
    return chart
def splicing_summary(dbs, confs):
    """Fetch the splicing summary chart."""
    chart = {}

    def adding(x, y):
        """Add the values, ignoring the absence of a value for the total."""
        z = {'detected': x['detected'] + y['detected']}
        if x['total'] is None:
            z['total'] = None
        else:
            z['total'] = x['total'] + y['total']
        return z

    stats, failed = aggregate(dbs, confs['configurations'], _splicing_summary, adding)
    average_by = len(confs['configurations']) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s %ss' % (average_by, confs['resolution']['id'])
    chart['table_description'] = [(label, 'string'),
                                  ('Total', 'number'),
                                  ('Percent', 'number'), ]
    chart['table_data'] = _percentage_splicing_summary(stats, average_by)
    return chart
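# Hedged sketch: the summary functions in this collection all call an
# aggregate(dbs, confs, method, strategy, **kwargs) helper that returns a
# (stats, failed) pair.  The helper itself is not shown here, so the version
# below is only an assumed reconstruction of that contract (the name
# aggregate_fold_sketch is hypothetical): run `method` once per configuration,
# fold the per-configuration results with `strategy`, and count configurations
# that yield nothing as failed.
def aggregate_fold_sketch(dbs, confs, method, strategy, **kwargs):
    stats = None
    failed = 0
    for conf in confs:
        try:
            result = method(dbs, conf, **kwargs)
        except Exception:
            result = None
        if result is None:
            failed += 1                      # this configuration contributed nothing
        elif stats is None:
            stats = result                   # first successful result seeds the total
        else:
            stats = strategy(stats, result)  # e.g. lambda x, y: x + y
    return stats, failed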
def mapping_summary(dbs, confs):
    """Return an overview of the results after mapping"""
    chart = {}
    stats, failed = aggregate(dbs, confs['configurations'], _mapping_summary,
                              lambda x, y: x + y)
    average_by = len(confs['configurations']) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by, confs['resolution']['id'])
    description = [(label, 'string'),
                   ('Total', 'number'),
                   ('Percent', 'number'), ]
    chart['table_description'] = description
    chart['table_data'] = _percentage_mapping_summary(stats, average_by)
    return chart
def _p_reads_containing_only_unambiguous_nucleotides(dbs, confs, partition_id):
    """Return reads containing only unambiguous nucleotides of the partition"""
    method = _reads_containing_only_unambiguous_nucleotides
    stats, failed = aggregate(dbs, confs, method, lambda x, y: x + y)
    if len(confs) - failed == 0:
        percent = None
    else:
        only = float(stats['reads_containing_only_unambiguous_nucleotides'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = only / total_number_of_reads * 100.0
    return [partition_id, percent]
def _partition_reads_containing_ambiguous_nucleotides(dbs, confs, partition_id):
    """Return reads containing ambiguous nucleotides for the partition"""
    method = _reads_containing_ambiguous_nucleotides
    stats, failed = aggregate(dbs, confs, method, lambda x, y: x + y)
    if len(confs) - failed == 0:
        percent = None
    else:
        containing = float(stats['reads_containing_ambiguous_nucleotides'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = containing / total_number_of_reads * 100.0
    return [partition_id, percent]
def _partition_average_and_average_unique_reads(dbs, confs, partition_id):
    """Return the average and average unique reads for the partition"""
    stats, failed = aggregate(dbs, confs, _average_and_average_unique_reads,
                              lambda x, y: x + y)
    average_by = len(confs) - failed
    if average_by == 0:
        unique = None
        total = None
    else:
        unique = float(stats['unique']) / average_by
        total = float(stats['total']) / average_by
    return [partition_id, total, unique]
def _partition_total_ambiguous_and_unambiguous_reads(dbs, confs, partition_id):
    """Return the total ambiguous and unambiguous reads for the partition"""
    stats, failed = aggregate(dbs, confs, _total_ambiguous_and_unambiguous_reads,
                              lambda x, y: x + y)
    if failed:
        unambiguous = None
        ambiguous = None
        total = None
    else:
        unambiguous = float(stats['unambiguous'])
        ambiguous = float(stats['ambiguous'])
        total = float(stats['total'])
    return [partition_id, total, unambiguous, ambiguous]
def _partition_average_percentage_of_unique_reads(dbs, confs, partition_id):
    """Return the average percentage of unique reads for the partition"""
    stats, failed = aggregate(dbs, confs, _average_percentage_of_unique_reads,
                              lambda x, y: x + y)
    average_by = len(confs) - failed
    if average_by == 0:
        percent = None
    else:
        unique_reads = float(stats['unique_reads'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = unique_reads / total_number_of_reads * 100.0
    return [partition_id, percent]
def _mapped_reads(dbs, confs, partition, tableid):
    """Calculate read mappings using different SQL tables"""
    stats, failed = aggregate(dbs, confs, _raw_mapped_reads, lambda x, y: x + y,
                              tableid=tableid)
    average_by = len(confs) - failed
    if average_by == 0:
        return [partition, None, None, None, None]
    total = float(stats['totalReads']) / average_by
    mapped = float(stats['mappedReads']) / average_by
    unique = float(stats['uniqueReads']) / average_by
    onezerozero = float(stats['100uniqueReads']) / average_by
    return [partition, total, mapped, unique, onezerozero]
def __init__(self, length=5, train=True):
    super(my_dataset, self).__init__()
    self.database = []
    self.length = length
    self.train = train
    if self.train:
        self.size = (60 - self.length) * 2
    else:
        self.size = (15 - self.length) * 2
    s = ['j', 'k', 's', 'y']
    for s1 in range(4):
        for s2 in range(4):
            data = loadcsv('szy_double/%s%s.csv' % (s[s1], s[s2]),
                           [2, 3, 8, 13, 17, 18], begining=10, ending=85)
            _, _, data = to_DFS(data, 100, 100, cut=[-6, 6])
            data = aggregate(data, [2, 3, 8, 13, 17, 18])
            data = torch.tensor(data, dtype=torch.float32)
            self.database.append((data, s1 * 10 + s2))
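# Hypothetical usage of the dataset class above (the class name my_dataset is
# taken from the super() call; loadcsv, to_DFS, aggregate and the
# szy_double/*.csv files are assumed to be provided by the surrounding project):
#
#   train_set = my_dataset(length=5, train=True)
#   data, label = train_set.database[0]   # (tensor of aggregated DFS features, class id)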
def read_summary(dbs, confs):
    """Return the read summary table"""
    chart = {}
    method = _read_summary
    configurations = confs['configurations']
    stats, failed = aggregate(dbs, configurations, method, lambda x, y: x + y)
    average_by = len(configurations) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by, confs['resolution']['id'])
    chart['table_description'] = [(label, 'string'),
                                  ('Total', 'number'),
                                  ('Percent', 'number'), ]
    chart['table_data'] = _percentage_read_summary(stats, average_by)
    return chart
def plot_loss(dpath, list_dname, output_path, low=.05, high=.95, com=10):
    """Plot loss and accuracy curves from TensorBoard logs.

    Args:
        dpath (str): path to the folder containing the logs (e.g. saved/logs).
        list_dname (list(str)): list of run_ids to plot.
        output_path (str): path to save the csv file built by concatenating
            logs from different runs.
    """
    ax = plt.gca()
    dict_data_frame = aggregate(dpath, list_dname, output_path, False)
    color = ['red', 'green']
    index = 0
    for key in dict_data_frame.keys():
        df = dict_data_frame[key]
        quant_df = df.quantile([low, high])
        df = df[(df['Value'] > quant_df.loc[low, 'Value']) &
                (df['Value'] < quant_df.loc[high, 'Value'])]
        df['Value'] = df['Value'].ewm(com=com).mean()
        df.plot.line(x='Step', y='Value', label=key, color=color[index], ax=ax)
        index += 1
    plt.show()
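# Hypothetical usage of plot_loss above, assuming TensorBoard logs live under
# saved/logs/<run_id>/ and that the aggregate() helper it calls returns a dict
# of DataFrames with 'Step' and 'Value' columns (column names taken from the
# snippet itself; the paths and run ids below are made up):
#
#   plot_loss(dpath='saved/logs',
#             list_dname=['run_01', 'run_02'],
#             output_path='saved/logs/merged.csv',
#             low=0.05, high=0.95, com=10)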
def interpolate():
    traindataset = Train_dataset(1)
    iterations = math.ceil((len(traindataset.subject_list) * 0.2))
    # 817 subjects total: 0 to 654 for training, 654 to 817 for test.
    totalpsnr = 0
    totalssim = 0
    array_psnr = np.empty(iterations)
    array_ssim = np.empty(iterations)
    batch_size = 1
    div_patches = 4
    num_patches = traindataset.num_patches
    img_width = 32  # 64
    img_height = 32  # 64
    img_depth = 23  # 46
    for i in range(0, iterations):
        XT_total = traindataset.data_true(654 + i)
        XT_mask = traindataset.mask(654 + i)
        volume_real = XT_total[0][:, :, :, np.newaxis]
        # volume_real_down = zoom(gaussian_filter(volume_real, sigma=1),
        #                         [0.5, 0.5, 0.5, 1], prefilter=False, order=1)
        volume_real_down = zoom(volume_real, [0.5, 0.5, 0.5, 1])
        volume_generated = zoom(volume_real_down, [2, 2, 2, 1])
        # volume_generated = volume_generated[:, :, :, np.newaxis]
        # volume_real = XT_total[0][:, :, :, np.newaxis]
        volume_mask = aggregate(XT_mask)
        # compute metrics
        max_gen = np.amax(volume_generated)
        max_real = np.amax(volume_real)
        if max_gen > max_real:
            max = max_gen
        else:
            max = max_real
        min_gen = np.amin(volume_generated)
        min_real = np.amin(volume_real)
        if min_gen < min_real:
            min = min_gen
        else:
            min = min_real
        val_psnr = psnr(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=max - min)
        # val_psnr = psnr(volume_real, volume_generated,
        #                 dynamic_range=max - min)
        array_psnr[i] = val_psnr
        totalpsnr += val_psnr
        val_ssim = ssim(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=max - min, multichannel=True)
        array_ssim[i] = val_ssim
        totalssim += val_ssim
        print(val_psnr)
        print(val_ssim)
        # save volumes
        filename_gen = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS, str(i) + 'gen.nii.gz')
        img_volume_gen = nib.Nifti1Image(volume_generated, np.eye(4))
        img_volume_gen.to_filename(filename_gen)
        filename_real = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS, str(i) + 'real.nii.gz')
        img_volume_real = nib.Nifti1Image(volume_real, np.eye(4))
        img_volume_real.to_filename(filename_real)
        filename_down = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS, str(i) + 'down.nii.gz')
        img_volume_down = nib.Nifti1Image(volume_real_down, np.eye(4))
        img_volume_down.to_filename(filename_down)
    return array_psnr, array_ssim
        return (obs["_starter_K"] / obs["_starter_BB"])

    @staticmethod
    def calc_AVGIP(obs):
        return (obs._starter_IP / obs.n_starts)


if __name__ == "__main__":
    print("Beginning current season stat aggregation...")
    current_season = pd.read_csv("./all_data/current_season.csv")
    season_totals = aggregate(current_season)

    print("Beginning starting pitcher stat aggregation...")
    career_data = pd.read_csv("./all_data/past_raw.csv.gz", compression="gzip")
    career_starter = aggregate_starter_career(current_season, career_data)
    season_starter = aggregate_starter_season(current_season, career_data)

    print("Beginning metric calculation...")
    calc = Calculator(season_totals, season_starter, career_starter)
    calc.create_metrics()
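# Worked example of the two ratios computed above (numbers are illustrative
# only, not taken from any real dataset): a starter with 120 strikeouts and
# 40 walks over 150 innings in 25 starts has
#   K/BB  = 120 / 40 = 3.0
#   AVGIP = 150 / 25 = 6.0 innings per start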
def detected_genes(dbs, confs):
    """Return a list of detected genes."""
    chart = {}

    def adding(x, y):
        """Add detected and keep biotype and reliability."""
        return {'detected': x['detected'] + y['detected'],
                'biotype': x['biotype'],
                'reliability': x['reliability'], }

    stats, failed = aggregate(dbs, confs['configurations'], _detected_genes,
                              strategy=adding)
    if stats is None:
        chart['table_description'] = ['Type']
        chart['table_data'] = [[None]]
    else:
        biotypes = set()
        reliabilities = set()
        replicateids = set()
        for expid, biotype, reliability in stats.keys():
            replicateids.add(expid)
            reliabilities.add(reliability)
            biotypes.add(biotype)
        replicateids = list(replicateids)
        replicateids.sort()
        reliabilities = list(reliabilities)
        reliabilities.sort()
        biotypes = list(biotypes)
        biotypes.sort()
        description = [('Type', 'string'), ]
        # c:[{v:'miRNA'},{v:'NOVEL'},{v:27}]
        for reliability in reliabilities:
            description.append((reliability, 'number'))
        description.append((confs['resolution']['title'], 'string'))
        chart['table_description'] = description
        results = []
        for expid in replicateids:
            for biotype in biotypes:
                row = [biotype]
                for reliability in reliabilities:
                    detected = stats.get((expid, biotype, reliability), None)
                    if detected is None:
                        row.append(None)
                    else:
                        row.append(int(detected['detected']))
                if row[1] or row[2]:
                    results.append(row + [expid])
        results.sort()
        chart['table_data'] = results
    return chart
def run(id):
    # ----------------- #
    # SETUP             #
    # ----------------- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["aggregation"][0]

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster ...')
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # to use the centroid from the tile instead
    # coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # --------------------------- #
    # GROUP CLUSTERS IN SAME TILE #
    # --------------------------- #
    # TODO: clean this up
    cluster_N = 'n'
    print("Number of clusters: {} ".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.ix[g.index][weight_series]
        return (g * w).sum() / w.sum()

    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)
    try:
        data = data.groupby(["i", "j"]).agg({indicator: fnc, 'gpsLatitude': fnc, 'gpsLongitude': fnc}).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'n', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {} ".format(len(data)))
    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')
                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # normalize the features
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights
    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------- #
    #  NDBI,NDVI,NDWI  #
    # ---------------- #
    # TODO: use maxNDBImaxNDVImaxNDWI_sum_todf efficiently
    print('INFO: getting NDBI, NDVI, NDWI ...')
    start_date = "2017-01-01"  # TODO: add to config; note there is no imagery before 2015
    end_date = "2018-01-01"
    for i in date_range(start_date, end_date, 3):
        print('INFO: getting max NDVI between dates: {}'.format(i))
        gee_ndvi_max_raster = gee_sentinel_raster(i[0], i[1], area, ind="NDVI")
        data["max_NDVI_{}_{}".format(i[0], i[1])] = data.apply(gee_raster_mean, args=(gee_ndvi_max_raster, "gpsLatitude", "gpsLongitude", "NDVI"), axis=1)

    print('INFO: getting max NDBI')
    gee_ndbi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDBI")
    data["max_NDBI"] = data.apply(gee_raster_mean, args=(gee_ndbi_max_raster, "gpsLatitude", "gpsLongitude", "NDBI"), axis=1)

    print('INFO: getting max NDWI')
    gee_ndwi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDWI")
    data["max_NDWI"] = data.apply(gee_raster_mean, args=(gee_ndwi_max_raster, "gpsLatitude", "gpsLongitude", "NDWI"), axis=1)

    # ------------- #
    # save features #
    # ------------- #
    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j']))

    # Standardize Features (0 mean and 1 std)
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data
    data_features = data[features_list]

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    md = Modeller(['kNN', 'Kriging', 'RmSense', 'Ensamble'], data_features)

    cv_loops = 20
    md.compute(data[['i', 'j']], data[indicator].values, cv_loops)

    # save model for production
    md.save_models(id)
    print(str(np.datetime64('now')), 'INFO: model saved.')

    # ------------------ #
    # write scores to DB #
    # ------------------ #
    r2, r2_var = np.mean(md.scores['Ensamble']), np.var(md.scores['Ensamble'])
    r2_knn, r2_var_knn = np.mean(md.scores['kNN']), np.var(md.scores['kNN'])
    r2_rmsense, r2_var_rmsense = np.mean(md.scores['RmSense']), np.var(md.scores['RmSense'])

    y_duplicated = np.repeat(data[indicator], cv_loops)
    mape_rmsense = np.mean(np.abs([item for sublist in md.results['RmSense'] for item in sublist] - y_duplicated) / y_duplicated)
    if mape_rmsense == float("inf") or mape_rmsense == float("-inf"):
        mape_rmsense = 0

    query = """
    insert into results_new (run_date, config_id, r2, r2_var, r2_knn, r2_var_knn, r2_features, r2_var_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0], r2, r2_var, r2_knn, r2_var_knn, r2_rmsense, r2_var_rmsense, mape_rmsense)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    results = pd.DataFrame({
        # 'yhat': [item for sublist in md.results['kNN'] for item in sublist],
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)
def main(id, aggregate_factor, min_pop, minlat, maxlat, minlon, maxlon, shapefile):
    # ----------------- #
    # SETUP             #
    # ----------------- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["satellite_grid"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    if aggregate_factor is None:
        aggregate_factor = config["base_raster_aggregation"][0]

    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    dataset_df = pd.read_csv(dataset)
    data_cols = dataset_df.columns.values

    # create geometry
    if (minlat is None) and (maxlat is None) and (minlon is None) and (maxlon is None):
        minlat, maxlat, minlon, maxlon = df_boundaries(dataset_df, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")

    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    final_raster = "../tmp/final_raster.tif"
    print('INFO: Removing tiles with population under {}'.format(min_pop))
    # only score tiles whose (aggregated) population is at least min_pop
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = RasterGrid(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    data = pd.DataFrame({"i": list_i, "j": list_j})
    data["gpsLatitude"] = coords_y
    data["gpsLongitude"] = coords_x

    print("Number of clusters: {} ".format(len(data)))
    list_i, list_j, pipeline = data["i"], data["j"], 'scoring'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')
            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
            # normalize the features
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights
    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #  NDBI,NDVI,NDWI  #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # ------------- #
    # save features #
    # ------------- #
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Standardize Features (0 mean and 1 std)
    # TODO: use mean and max from training
    print("INFO: Normalizing by the max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()
    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

    # Open model
    ensemble_pipeline = joblib.load('../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data[features_list + ["gpsLatitude", "gpsLongitude"]]
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    # if the indicator was modelled in log space, transform predictions back
    if config['log'][0]:
        ensemble_predictions = np.exp(ensemble_predictions)

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': ensemble_predictions})

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"
        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)

        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
def main(top_left, bottom_left, bottom_right, top_right, config_id):
    # ----- #
    # SETUP #
    # ----- #
    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    # connect to db and read config table
    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(config_id), engine)
    raster = config["satellite_grid"][0]
    nightlights_date = config.get("nightlights_date")[0]
    base_raster = "../tmp/local_raster.tif"
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    aggregate(raster, base_raster, 1)

    # -------------------- #
    # CLIP RASTER TO SCOPE #
    geoms = [{'type': 'Polygon', 'coordinates': [[top_left, bottom_left, bottom_right, top_right]]}]

    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, geoms, crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    with rasterio.open(base_raster, "w", **out_meta) as dest:
        dest.write(out_image)

    # load the new clipped raster to the img_lib
    GRID = RasterGrid(base_raster)
    with rasterio.open(base_raster) as src:
        list_j, list_i = np.where(src.read()[0] != src.nodata)

    print("INFO: downloading images in scope ...")
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')
            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline='scoring')
            # normalize the features
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, config_id, 'scoring'), index=False)

        g_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Google", config_id, 'scoring'))
        s_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Sentinel", config_id, 'scoring'))
        data = pd.merge(g_features, s_features, on=['i', 'j', 'index'])
        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(config_id), index=False)
        print('INFO: features extracted.')
    else:
        data = pd.DataFrame({'gpsLongitude': coords_x, 'gpsLatitude': coords_y, 'j': list_j, 'i': list_i})

    # --------------- #
    # add nightlights #
    # --------------- #
    from geojson import Polygon
    from nightlights import Nightlights

    area = Polygon([[top_left, bottom_left, bottom_right, top_right]])
    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date)
    data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------------- #
    # LOAD MODEL AND PREDICT #
    # ---------------------- #
    print("INFO: load model and predict ...")
    try:
        X = data.drop(['index', 'i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)
    except ValueError:
        X = data.drop(['i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)

    # load model and predict
    try:
        RmSense = joblib.load('../Models/RmSense_model_config_id_{}.pkl'.format(config_id))
        kNN = joblib.load('../Models/kNN_model_config_id_{}.pkl'.format(config_id))
    except FileNotFoundError:
        print('ERROR: model not found')

    yhat = (RmSense.predict(X) + kNN.predict(data[['i', 'j']])) / 2.

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': yhat})
    outfile = "../Data/Results/scalerout_{}.tif".format(config_id)
    tifgenerator(outfile=outfile, raster_path=base_raster, df=results)
def AggregateSlopes(self):
    """Remove sequential entries which have the same slope."""
    self.entries['slope'] = utils.aggregate(self.entries['slope'])
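# Hedged sketch of what the utils.aggregate call above appears to do, based only
# on the docstring ("remove sequential entries which have the same slope"); the
# real helper may differ, and collapse_runs is a hypothetical stand-in name.
def collapse_runs(values):
    """Collapse consecutive duplicates, keeping the first of each run."""
    collapsed = []
    for value in values:
        if not collapsed or collapsed[-1] != value:
            collapsed.append(value)
    return collapsed

# e.g. collapse_runs([1, 1, 2, 2, 2, 3, 1]) -> [1, 2, 3, 1]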
def evaluate(img_width, img_height, img_depth, upsampling_factor):
    # dataset & variables
    traindataset = Train_dataset(1)
    iterations = math.ceil((len(traindataset.subject_list) * 0.2))
    # 817 subjects total: 0 to 654 for training, 654 to 817 for test.
    print(len(traindataset.subject_list))
    print(iterations)
    totalpsnr = 0
    totalssim = 0
    array_psnr = np.empty(iterations)
    array_ssim = np.empty(iterations)
    batch_size = 1
    div_patches = 4
    num_patches = traindataset.num_patches
    img_width = img_width  # 224
    img_height = img_height  # 224
    img_depth = img_depth  # 152

    # define model
    t_input_gen = tf.placeholder('float32', [1, None, None, None, 1],
                                 name='t_image_input_to_SRGAN_generator')
    srgan_network = generator(t_input_gen, kernel=3, nb=6,
                              upscaling_factor=upsampling_factor,
                              is_train=False, reuse=False,
                              img_width=img_width, img_height=img_height,
                              img_depth=img_depth)

    # restore g
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False))
    saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="SRGAN_g"))
    saver.restore(sess, tf.train.latest_checkpoint('/work/isanchez/g/ds4-gdl-lrdecay/subpixel'))

    for i in range(0, iterations):
        # extract volumes
        xt_total = traindataset.data_true(654 + i)  # [[self.batch_size, 224, 224, 152]]
        xt_mask = traindataset.mask(654 + i)
        xg_generated = np.empty([1, 224, 224, 152, 1])
        normfactor = (np.amax(xt_total[0])) / 2
        x_generator = ((xt_total[0] - normfactor) / normfactor)
        res = 1 / upsampling_factor
        x_generator = x_generator[:, :, :, np.newaxis]
        x_generator = zoom(x_generator, [res, res, res, 1])
        # x_generator = gaussian_filter(x_generator, sigma=1)
        xg_generated[0] = sess.run(srgan_network.outputs,
                                   {t_input_gen: x_generator[np.newaxis, :]})
        xg_generated[0] = ((xg_generated[0] + 1) * normfactor)
        volume_real = xt_total[0]
        volume_real = volume_real[:, :, :, np.newaxis]
        volume_generated = xg_generated[0]
        volume_mask = aggregate(xt_mask)
        # compute metrics
        max_gen = np.amax(volume_generated)
        max_real = np.amax(volume_real)
        if max_gen > max_real:
            val_max = max_gen
        else:
            val_max = max_real
        min_gen = np.amin(volume_generated)
        min_real = np.amin(volume_real)
        if min_gen < min_real:
            val_min = min_gen
        else:
            val_min = min_real
        val_psnr = psnr(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=val_max - val_min)
        array_psnr[i] = val_psnr
        totalpsnr += val_psnr
        val_ssim = ssim(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=val_max - val_min, multichannel=True)
        array_ssim[i] = val_ssim
        totalssim += val_ssim
        print(val_psnr)
        print(val_ssim)
        # save volumes
        filename_gen = os.path.join(args.path_volumes, str(i) + 'gen.nii.gz')
        img_volume_gen = nib.Nifti1Image(volume_generated, np.eye(4))
        img_volume_gen.to_filename(filename_gen)
        filename_real = os.path.join(args.path_volumes, str(i) + 'real.nii.gz')
        img_volume_real = nib.Nifti1Image(volume_real, np.eye(4))
        img_volume_real.to_filename(filename_real)

    print('{}{}'.format('Mean PSNR: ', array_psnr.mean()))
    print('{}{}'.format('Mean SSIM: ', array_ssim.mean()))
    print('{}{}'.format('Variance PSNR: ', array_psnr.var()))
    print('{}{}'.format('Variance SSIM: ', array_ssim.var()))
    print('{}{}'.format('Max PSNR: ', array_psnr.max()))
    print('{}{}'.format('Min PSNR: ', array_psnr.min()))
    print('{}{}'.format('Max SSIM: ', array_ssim.max()))
    print('{}{}'.format('Min SSIM: ', array_ssim.min()))
domains = utils.read_json(utils.data_location / subfolder / 'domains.json')
all_domains.extend(domains)

if config['rebuttals']:
    rebuttals = utils.read_json(utils.data_location / subfolder / 'rebuttals.json')
    for source_url, rebuttal_l in rebuttals.items():
        for rebuttal_url, source in rebuttal_l.items():
            all_rebuttals[source_url][rebuttal_url].append(source)

urls_cnt = len(all_urls)
domains_cnt = len(all_domains)
fake_urls_cnt = len([el for el in all_urls if el['label'] == 'fake'])
fake_domains_cnt = len([el for el in all_domains if el['label'] == 'fake'])

print('#urls', urls_cnt, ': fake', fake_urls_cnt, 'true', urls_cnt - fake_urls_cnt)
print('#domains', domains_cnt, ': fake', fake_domains_cnt, 'true', domains_cnt - fake_domains_cnt)

aggregated_urls = utils.aggregate(all_urls)
aggregated_domains = utils.aggregate(all_domains, 'domain')

utils.write_json_with_path(aggregated_urls, utils.data_location, 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, utils.data_location, 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, utils.data_location, 'aggregated_rebuttals.json')

# copy to backend
utils.write_json_with_path(aggregated_urls, Path('../backend'), 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, Path('../backend'), 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, Path('../backend'), 'aggregated_rebuttals.json')

utils.print_stats(aggregated_urls)
utils.print_stats(aggregated_domains)

print('updating mappings, it may take a while')
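# Hedged sketch of the utils.aggregate(all_urls) / utils.aggregate(all_domains,
# 'domain') calls above: based only on how the inputs are used (records carrying
# a 'fake' or 'true' label per url or domain), a minimal stand-in could tally
# label votes per key.  aggregate_labels is a hypothetical name and the dict
# layout is an assumption, not the project's actual implementation.
from collections import defaultdict

def aggregate_labels(records, key='url'):
    votes = defaultdict(lambda: {'fake': 0, 'true': 0})
    for record in records:
        label = 'fake' if record['label'] == 'fake' else 'true'
        votes[record[key]][label] += 1
    return dict(votes)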
def callback(ch, method, properties, body):
    data_dict = json.loads(body)
    fickle_file = os.path.join(MRB_TOP, 'MC_genie_numu_CC_seed-service.fcl')
    print data_dict
    job_id = str(data_dict.get('job_id'))
    batch_id = str(data_dict.get('batch_id'))
    batch_object = Batch.objects.get(job_id=job_id, batch_id=batch_id)
    if not job_id or not batch_id or not batch_object:
        print('No id!')
        ch.basic_ack(delivery_tag=method.delivery_tag)
        return
    print("<received_job>\n <job_id> {0}\n "
          "<batch_id> {1}\n".format(job_id, batch_id))

    # Create log and error file
    if data_dict.get('log_dir'):
        log_path = os.path.join(LOG_TOP, data_dict.get('log_dir'))
    else:
        log_path = os.path.join(LOG_TOP, job_id)
    mkdir_p(log_path)
    log_file_path = os.path.join(log_path, str(batch_id) + '.log')
    err_file_path = os.path.join(log_path, 'err_' + str(batch_id) + '.log')
    log_file = open(log_file_path, 'w')
    err_file = open(err_file_path, 'w')
    batch_object.log_path = log_file_path
    batch_object.err_path = err_file_path
    batch_object.start_time = datetime.datetime.now()

    # if a new file is given, copy it from DATA_TOP to MRB_TOP
    try:
        if data_dict.get('new_file'):
            new_file = str(data_dict.get('new_file')).replace(" ", "")
            new_file_path = os.path.join(DATA_TOP, new_file)
            shutil.copy(new_file_path, MRB_TOP)
            print >> log_file, "<new_file> '{0}' copied to $MRB_TOP".format(new_file)
            fickle_file = os.path.join(MRB_TOP, new_file.split('/')[-1])

        # check whether out_path exists: create it if needed, then cd into it
        # ($DATA_TOP/out_dir/id)
        if data_dict.get('out_dir'):
            out_path = os.path.join(DATA_TOP, data_dict.get('out_dir'), job_id)
            if not os.path.exists(out_path):
                mkdir_p(out_path)
            os.chdir(out_path)
        else:
            out_path = os.path.join(DATA_TOP, job_id)
            mkdir_p(out_path)
            os.chdir(out_path)
        batch_object.out_path = out_path
        batch_object.save()
    except Exception as e:
        update_error(batch_id, job_id, e)
        print >> err_file, '<error> {0}'.format(e)
        print(e)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        return

    exec_list = [
        'lar -c {0} -n {1} -o single_gen_{2}.root'.format(fickle_file, data_dict.get('events'), batch_id),
        'lar -c /products/dev/WireDump_numu_NC-1.fcl -s single_gen_{0}.root -T wire_dump_out_{0}.root'.format(batch_id),
        'python /products/dev/ProcessRootFile.py ./wire_dump_out_{0}.root out_{0}'.format(batch_id)]

    print >> log_file, '<working dir> {0}'.format(out_path)
    for cmd in exec_list:
        print >> log_file, '<cmd> {0}'.format(cmd)
        try:
            update_status(batch_id, job_id, cmd)
            sp.call(cmd, shell=True, stdout=log_file, stderr=err_file)
        except Exception as e:
            print(e)
            print >> err_file, '<error> {0}'.format(e)
            update_error(batch_id, job_id, e)

    # Change permissions
    for r, d, f in os.walk(os.getcwd()):
        os.chmod(r, 0777)

    # Aggregate output
    aggregate(out_path, LINK_PATH)

    print "<job end>"
    print >> log_file, "<job end>"
    log_file.close()
    err_file.close()
    update_complete(batch_id, job_id)
    ch.basic_ack(delivery_tag=method.delivery_tag)
    return
def process_record(data_line_, prediction_line_, neg_gap_, feature_dir_, record_dir_,
                   match_fn, all_doc_scores, all_ans_scores, z_scores):
    missing_count_ = 0
    total_count_ = 0
    stop_count_ = 0
    data = json.loads(data_line_)
    question = data['question']
    q_id = slugify(question)
    q_path = os.path.join(feature_dir_, '%s.json' % q_id)
    n_q = [0 for _ in Tokenizer.FEAT]
    if os.path.exists(q_path):
        q_data = open(q_path, encoding=ENCODING).read()
        record = json.loads(q_data)
        q_ner = record['ner']
        q_pos = record['pos']
        for feat in q_ner + q_pos:
            n_q[Tokenizer.FEAT_DICT[feat]] += 1
    else:
        print('question feature file %s does not exist!' % q_path)
        sys.stdout.flush()
        missing_count_ += 1
        return missing_count_, total_count_, stop_count_

    answer = [normalize(a) for a in data['answer']]
    prediction = json.loads(prediction_line_)
    # MAKE SURE REVERSE IS TRUE
    ranked_prediction = sorted(prediction, key=lambda k: k['doc_score'], reverse=True)
    correct_rank = get_rank(prediction, answer, match_fn)
    if correct_rank > 150:
        # if correct_rank < 50 or correct_rank > 150:
        return missing_count_, total_count_, stop_count_

    all_corr_rank.append(correct_rank - 1)
    all_n_p = []
    all_n_a = []
    all_p_scores = []
    all_a_scores = []
    all_probs = []
    all_spans = []
    repeats = 0
    for i, entry in enumerate(ranked_prediction):
        doc_id = entry['doc_id']
        start = int(entry['start'])
        end = int(entry['end'])
        doc_score = entry['doc_score']
        ans_score = entry['span_score']
        prob = entry['prob']
        span = entry['span']
        # RESTRICT TO MAX 1000000000
        # print("Threshold 1000000")
        # ans_score = min(ans_score, 1000000)  # restrict to a maximum of a million
        if span in all_spans:
            repeats += 1
        all_spans.append(span)

        # Calculate the sample z-score (t statistic) for the answer score
        if all_a_scores == [] or len(all_a_scores) == 1:
            # don't use the a_zscore feature at the beginning, or when there is only one score
            a_zscore = 0
        else:
            # Take the sample mean of the previous scores and compute the z-score
            # of the current one with respect to that.
            # sample_mean = np.mean(all_a_scores + [ans_score])
            sample_mean = np.mean(all_a_scores)
            # sample_std = np.std(all_a_scores + [ans_score])
            sample_std = np.std(all_a_scores)
            # if sample_std != 0:
            a_zscore = (ans_score - sample_mean) / sample_std
            # else:
            #     a_zscore = 0
        z_scores.append(a_zscore)

        # THESE ARE FOR STATISTICS OVER THE ENTIRE DATA SET, IGNORE
        all_doc_scores.append(doc_score)
        all_ans_scores.append(ans_score)

        corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD
        corr_ans_mean_score = (np.mean(all_a_scores + [ans_score]) - ANS_MEAN) / ANS_STD
        all_probs.append(prob)

        p_pos = dict()
        p_ner = dict()
        feat_file = os.path.join(feature_dir_, '%s.json' % doc_id)
        if os.path.exists(feat_file):
            record = json.load(open(feat_file))
            p_ner[doc_id] = record['ner']
            p_pos[doc_id] = record['pos']
        n_p = [0 for _ in Tokenizer.FEAT]
        n_a = [0 for _ in Tokenizer.FEAT]
        for feat in p_ner[doc_id] + p_pos[doc_id]:
            n_p[Tokenizer.FEAT_DICT[feat]] += 1
        for feat in p_ner[doc_id][start:end + 1] + p_pos[doc_id][start:end + 1]:
            n_a[Tokenizer.FEAT_DICT[feat]] += 1

        all_n_p.append(n_p)
        all_n_a.append(n_a)
        all_p_scores.append(doc_score)
        all_a_scores.append(ans_score)

        f_np = aggregate(all_n_p)
        f_na = aggregate(all_n_a)
        f_sp = aggregate(all_p_scores)
        f_sa = aggregate_ans(all_a_scores)

        record = OrderedDict()
        # sp, nq, np, na, ha
        record['sp'] = f_sp
        record['nq'] = list(map(float, n_q))
        record['np'] = f_np
        record['na'] = f_na
        record['sa'] = f_sa
        record['a_zscore'] = a_zscore
        record['corr_doc_score'] = corr_doc_score
        record['i'] = i
        record['prob_avg'] = sum(all_probs) / len(all_probs)
        record['prob'] = prob
        record['repeats'] = repeats
        record['ans_avg'] = corr_ans_mean_score

        if i + 1 == correct_rank:
            # if i + 1 >= correct_rank:
            record['stop'] = 1
            stop_count_ += 1
            write_record = True
            # if i % neg_gap_ == 0:
            #     write_record = True
            # else:
            #     write_record = False
            should_return = True
            # if i + 1 - correct_rank > 30:
            #     should_return = True
            # else:
            #     should_return = False
        else:
            should_return = False
            if i % neg_gap_ == 0:
                record['stop'] = 0
                write_record = True
            else:
                write_record = False

        if write_record:
            record_path = os.path.join(record_dir_, '%s_%s.pkl' % (q_id, doc_id))
            with open(record_path, 'wb') as f:
                pk.dump(record, f)
            total_count_ += 1
        if should_return:
            return missing_count_, total_count_, stop_count_

    return missing_count_, total_count_, stop_count_
def run(id):
    # ----------------- #
    # SETUP             #
    # ----------------- #
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'],
                                   private_config['DB']['password'],
                                   private_config['DB']['host'],
                                   private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {} ".format(len(data)))
    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')
                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # normalize the features
                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)
        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights
    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #  NDBI,NDVI,NDWI  #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # ------------- #
    # save features #
    # ------------- #
    # features to be used in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Standardize Features (0 mean and 1 std)
    # data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()
    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)

    # if set in the config, take the log of the indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list,
                        spatial_features=["gpsLatitude", "gpsLongitude"],
                        scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #
    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0], Ensemble_R2_mean, Ensemble_R2_std, kNN_R2_mean, kNN_R2_std, Ridge_R2_mean, Ridge_R2_std, 0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ', Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ', Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    from sklearn.externals import joblib
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
def main(id, aggregate_factor, min_pop, bbox, shapefile):
    """Make predictions in areas where we have no survey.

    Args:
        id (int): the config id.
        aggregate_factor (int): aggregate pixels to a lower resolution by this factor.
        min_pop: minimum population in a pixel for it to be scored.
        bbox: bounding box <minlat> <minlon> <maxlat> <maxlon>; if omitted, the
            boundaries of the dataset are used.
        shapefile: aggregate predictions within this shapefile's geometries.

    Example:
        id, aggregate_factor, min_pop = 3075, 15, 500
    """
    # read the configs for id
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'],
        private_config['DB']['password'],
        private_config['DB']['host'],
        private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)

    dataset = config.get("dataset_filename")[0]
    raster = config["base_raster"][0]
    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), \
        config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    ISO = config["iso3"][0]

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        print('INFO: satellite images from Google and Sentinel-2')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'G':
        print('INFO: only Google satellite images.')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'N':
        print('INFO: no satellite images')

    # ---------------------------------------------------------- #
    # WorldPop Raster too granular (lots of images), aggregate it #
    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    # dataset_df = pd.read_csv(dataset)
    # data_cols = dataset_df.columns.values
    if sum(bbox) != 0:  # dummy bbox
        print("INFO: using AOI from bbox")
        print(sum(bbox))
        # define AOI with manually defined bbox
        minlat, minlon, maxlat, maxlon = bbox[0], bbox[1], bbox[2], bbox[3]
        area = points_to_polygon(minlat=minlat, minlon=minlon, maxlat=maxlat, maxlon=maxlon)
    else:
        print("INFO: using AOI from dataset.")
        # use the dataset's extent
        dataset_df = pd.read_csv(dataset)
        minlat, maxlat, minlon, maxlon = boundaries(dataset_df['gpsLatitude'], dataset_df['gpsLongitude'])
        area = points_to_polygon(minlat=minlat, minlon=minlon, maxlat=maxlat, maxlon=maxlon)
        del dataset_df

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({
        "driver": "GTiff",
        "height": out_image.shape[1],
        "width": out_image.shape[2],
        "transform": out_transform
    })

    final_raster = "../final_raster.tif"
    print('INFO: Removing tiles with population under {}'.format(min_pop))
    # only score tiles whose (aggregated) population is at least min_pop
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = BaseLayer(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    ix = pd.MultiIndex.from_arrays([list_i, list_j, coords_y, coords_x],
                                   names=('i', 'j', "gpsLatitude", "gpsLongitude"))

    print("Number of clusters: {} ".format(len(ix)))

    pipeline = 'scoring'

    # ------------------------------------------------ #
    # download images from Google and Extract Features #
    # ------------------------------------------------ #
    if config['satellite_config'][0].get('satellite_images') in ['Y', 'G']:
        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(id, pipeline)
        data_path = "../Data/Satellite/"

        gimages = GoogleImages(data_path)
        # download the images from the relevant API
        gimages.download(coords_x, coords_y, step=step)
        # extract the features
        features = pd.DataFrame(gimages.featurize(coords_x, coords_y, step=step), index=ix)
        features.columns = [str(col) + '_Google' for col in features.columns]
        features.to_csv(features_path)
        print('INFO: features extracted.')
        data = features.copy()

    # --------------------------------------------- #
    # download Sentinel images and Extract Features #
    # --------------------------------------------- #
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        from sentinel_images import SentinelImages

        simages = SentinelImages(data_path)
        # download the images from the relevant API
        simages.download(coords_x, coords_y, start_date, end_date)
        print('INFO: scoring ...')
        # extract the features
        print('INFO: extractor instantiated.')
        features = pd.DataFrame(simages.featurize(coords_x, coords_y, start_date, end_date), index=ix)
        features.columns = [str(col) + '_Sentinel' for col in features.columns]
        features.to_csv(features_path)

        if data is not None:
            data = data.join(features)
        else:
            data = features.copy()
        print('INFO: features extracted')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, nightlights_date_start, nightlights_date_end)
    features = pd.DataFrame(nlights.featurize(coords_x, coords_y), columns=['nightlights'], index=ix)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'], 5, labels=False, duplicates='drop')
    data = data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(coords_y, coords_x, osm_gdf["value"])
            data['distance_{}'.format(value)] = [np.log(0.0001 + x) for x in dist]

    # ---------------- #
    #  NDBI,NDVI,NDWI  #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(coords_x, coords_y)

    # --------- #
    # add ACLED #
    # --------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(ISO, nightlights_date_start, nightlights_date_end)
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(coords_x, coords_y, property=property, function='density', buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(coords_x, coords_y, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(coords_x, coords_y, function='distance')

    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # ------------- #
    # save features #
    # ------------- #
    print('INFO: {} columns.'.format(len(data.columns)))
    # features to be used in the linear model
    features_list = list(sorted(data.columns))
    print(features_list)
    data.to_csv("../Data/Features/features_all_id_{}_{}_nonscaled.csv".format(id, pipeline))

    # Scale Features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / (data[features_list].max() + 0.001)
    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline))

    # ------- #
    # predict #
    # ------- #
    ensemble_pipeline = joblib.load('../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data.reset_index(level=[2, 3])
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    results = pd.DataFrame({
        'i': list_i,
        'j': list_j,
        'lat': coords_y,
        'lon': coords_x,
        'yhat': ensemble_predictions
    })
    results.to_csv('../Data/Results/config_{}.csv'.format(id))

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"
        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)

        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)