Example #1
def expression_summary(dbs, confs):
    """Return the expression summary."""
    chart = {}

    stats, failed = aggregate(dbs, confs['configurations'],
                              _expression_summary, lambda x, y: x + y)

    average_by = len(confs['configurations']) - failed

    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one %s' % confs['resolution']['id']
    else:
        label = 'Average over %s %ss' % (average_by, confs['resolution']['id'])

    description = [
        (label, 'string'),
        ('Total', 'number'),
        ('Detected', 'number'),
        ('Percent', 'number'),
    ]
    chart['table_description'] = description

    chart['table_data'] = _percentage_expression_summary(stats, average_by)
    return chart
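The summary builders in this and the related examples below all rely on the same aggregate(dbs, confs, method, strategy) contract: the per-configuration method is run for every configuration, the partial results (dictionaries, judging from how callers index stats) appear to be merged key by key with strategy, and the number of configurations that produced nothing is returned so callers can divide by the right denominator. The real helper lives in the project's own utilities; the following is only a minimal sketch of that contract, and the failure handling (catching any exception from method) is an assumption.

def aggregate(dbs, confs, method, strategy, **kwargs):
    """Illustrative sketch: merge per-configuration results and count failures."""
    stats = None
    failed = 0
    for conf in confs:
        try:
            partial = method(dbs, conf, **kwargs)
        except Exception:  # assumption: a broken configuration simply counts as failed
            partial = None
        if partial is None:
            failed += 1
            continue
        if stats is None:
            stats = dict(partial)
        else:
            # merge key by key, e.g. counts keyed by (replicate, biotype, reliability)
            for key, value in partial.items():
                stats[key] = strategy(stats[key], value) if key in stats else value
    return stats, failed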
Example #2
def splicing_summary(dbs, confs):
    """Fetch splicing summary chart"""
    chart = {}

    def adding(x, y):
        """Add the values. Ignoring the absence of a value for the total."""
        z = {'detected': x['detected'] + y['detected']}
        if x['total'] is None:
            z['total'] = None
        else:
            z['total'] = x['total'] + y['total']
        return z

    stats, failed = aggregate(dbs, confs['configurations'], _splicing_summary,
                              adding)

    average_by = len(confs['configurations']) - failed

    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s %ss' % (average_by, confs['resolution']['id'])

    chart['table_description'] = [
        (label, 'string'),
        ('Total', 'number'),
        ('Percent', 'number'),
    ]

    chart['table_data'] = _percentage_splicing_summary(stats, average_by)
    return chart
Example #3
def mapping_summary(dbs, confs):
    """Return an overview of the results after mapping"""
    chart = {}

    stats, failed = aggregate(dbs,
                              confs['configurations'],
                              _mapping_summary,
                              lambda x, y: x + y)

    average_by = len(confs['configurations']) - failed

    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by,
                                                 confs['resolution']['id'])

    description = [(label, 'string'),
                   ('Total', 'number'),
                   ('Percent', 'number'),
                   ]
    chart['table_description'] = description
    chart['table_data'] = _percentage_mapping_summary(stats, average_by)
    return chart
Example #4
def mapping_summary(dbs, confs):
    """Return an overview of the results after mapping"""
    chart = {}

    stats, failed = aggregate(dbs, confs['configurations'], _mapping_summary,
                              lambda x, y: x + y)

    average_by = len(confs['configurations']) - failed

    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by,
                                                 confs['resolution']['id'])

    description = [
        (label, 'string'),
        ('Total', 'number'),
        ('Percent', 'number'),
    ]
    chart['table_description'] = description
    chart['table_data'] = _percentage_mapping_summary(stats, average_by)
    return chart
Example #5
def _p_reads_containing_only_unambiguous_nucleotides(dbs, confs, partition_id):
    """Return reads containing only unambiguous nucleotides of the partition"""
    method = _reads_containing_only_unambiguous_nucleotides
    stats, failed = aggregate(dbs, confs, method, lambda x, y: x + y)
    if len(confs) - failed == 0:
        percent = None
    else:
        only = float(stats['reads_containing_only_unambiguous_nucleotides'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = only / total_number_of_reads * 100.0
    return [partition_id, percent]
Example #6
def _partition_reads_containing_ambiguous_nucleotides(dbs, confs, partition_id):
    """Return reads containing ambiguous nucleotides for the partition"""
    method = _reads_containing_ambiguous_nucleotides
    stats, failed = aggregate(dbs, confs, method, lambda x, y: x + y)
    if len(confs) - failed == 0:
        percent = None
    else:
        containing = float(stats['reads_containing_ambiguous_nucleotides'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = containing / total_number_of_reads * 100.0
    return [partition_id, percent]
Example #7
def _partition_average_and_average_unique_reads(dbs, confs, partition_id):
    """Return the average and average unique reads for the partition"""
    stats, failed = aggregate(dbs, confs, _average_and_average_unique_reads,
                              lambda x, y: x + y)
    average_by = len(confs) - failed
    if average_by == 0:
        unique = None
        total = None
    else:
        unique = float(stats['unique']) / average_by
        total = float(stats['total']) / average_by
    return [partition_id, total, unique]
Example #8
def _partition_total_ambiguous_and_unambiguous_reads(dbs, confs, partition_id):
    """Return the total ambiguous and unambiguous reads for the partition"""
    stats, failed = aggregate(dbs, confs,
                              _total_ambiguous_and_unambiguous_reads,
                              lambda x, y: x + y)
    if failed:
        unambiguous = None
        ambiguous = None
        total = None
    else:
        unambiguous = float(stats['unambiguous'])
        ambiguous = float(stats['ambiguous'])
        total = float(stats['total'])
    return [partition_id, total, unambiguous, ambiguous]
Example #9
def _partition_average_and_average_unique_reads(dbs, confs, partition_id):
    """Return the average and average unique reads for the partition"""
    stats, failed = aggregate(dbs,
                              confs,
                              _average_and_average_unique_reads,
                              lambda x, y: x + y)
    average_by = len(confs) - failed
    if average_by == 0:
        unique = None
        total = None
    else:
        unique = float(stats['unique']) / average_by
        total = float(stats['total']) / average_by
    return [partition_id, total, unique]
Example #10
def _partition_average_percentage_of_unique_reads(dbs, confs, partition_id):
    """Return the average percentage of unique reads for the partition"""
    stats, failed = aggregate(dbs, confs, _average_percentage_of_unique_reads,
                              lambda x, y: x + y)

    average_by = len(confs) - failed

    if average_by == 0:
        percent = None
    else:
        unique_reads = float(stats['unique_reads'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = unique_reads / total_number_of_reads * 100.0

    return [partition_id, percent]
Example #11
def _partition_total_ambiguous_and_unambiguous_reads(dbs, confs, partition_id):
    """Return the total ambiguous and unambiguous reads for the partition"""
    stats, failed = aggregate(dbs,
                              confs,
                              _total_ambiguous_and_unambiguous_reads,
                              lambda x, y: x + y)
    if failed:
        unambiguous = None
        ambiguous = None
        total = None
    else:
        unambiguous = float(stats['unambiguous'])
        ambiguous = float(stats['ambiguous'])
        total = float(stats['total'])
    return [partition_id, total, unambiguous, ambiguous]
Example #12
def _partition_average_percentage_of_unique_reads(dbs, confs, partition_id):
    """Return the average percentage of unique reads for the partition"""
    stats, failed = aggregate(dbs,
                              confs,
                              _average_percentage_of_unique_reads,
                              lambda x, y: x + y)

    average_by = len(confs) - failed

    if average_by == 0:
        percent = None
    else:
        unique_reads = float(stats['unique_reads'])
        total_number_of_reads = float(stats['total_number_of_reads'])
        percent = unique_reads / total_number_of_reads * 100.0

    return [partition_id, percent]
Example #13
def _mapped_reads(dbs, confs, partition, tableid):
    """Calculate read mappings using different SQL tables"""
    stats, failed = aggregate(dbs,
                              confs,
                              _raw_mapped_reads,
                              lambda x, y: x + y,
                              tableid=tableid)

    average_by = len(confs) - failed

    if average_by == 0:
        return [partition, None, None, None, None]

    total = float(stats['totalReads']) / average_by
    mapped = float(stats['mappedReads']) / average_by
    unique = float(stats['uniqueReads']) / average_by
    onezerozero = float(stats['100uniqueReads']) / average_by
    return [partition, total, mapped, unique, onezerozero]
Example #15
 def __init__(self, length=5, train=True):
     super(my_dataset, self).__init__()
     self.database = []
     self.length = length
     self.train = train
     if self.train:
         self.size = (60 - self.length) * 2
     else:
         self.size = (15 - self.length) * 2
     s = ['j', 'k', 's', 'y']
     for s1 in range(4):
         for s2 in range(4):
             data = loadcsv('szy_double/%s%s.csv' % (s[s1], s[s2]),
                            [2, 3, 8, 13, 17, 18],
                            begining=10,
                            ending=85)
             _, _, data = to_DFS(data, 100, 100, cut=[-6, 6])
             data = aggregate(data, [2, 3, 8, 13, 17, 18])
             data = torch.tensor(data, dtype=torch.float32)
             self.database.append((data, s1 * 10 + s2))
Example #16
def read_summary(dbs, confs):
    """Return the read summary table"""
    chart = {}
    method = _read_summary
    configurations = confs['configurations']
    stats, failed = aggregate(dbs, configurations, method, lambda x, y: x + y)
    average_by = len(configurations) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by,
                                                 confs['resolution']['id'])
    chart['table_description'] = [(label, 'string'),
                                  ('Total', 'number'),
                                  ('Percent', 'number'),
                                  ]
    chart['table_data'] = _percentage_read_summary(stats, average_by)
    return chart
Example #17
def plot_loss(dpath, list_dname, output_path, low=.05, high=.95, com=10):
    """ Plot loss and accuracy from tensorboard file
    Args:
        dpath (str): path to folder contain (eg: saved/logs)
        list_dname (list(str)): list of run_id to plot.
        output_path (str): path to save csv file after concat logs from different times
    Return:
    """
    ax = plt.gca()
    dict_data_frame = aggregate(dpath, list_dname, output_path, False)
    color = ['red', 'green']
    index = 0
    for key in dict_data_frame.keys():
        df = dict_data_frame[key]
        quant_df = df.quantile([low, high])
        df = df[(df['Value'] > quant_df.loc[low, 'Value'])
                & (df['Value'] < quant_df.loc[high, 'Value'])]
        df['Value'] = df['Value'].ewm(com=com).mean()
        df.plot.line(x='Step', y='Value', label=key, color=color[index], ax=ax)
        index += 1
    plt.show()
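The aggregate used by plot_loss is a different helper: it gathers the scalar logs of every run_id into one pandas DataFrame with Step and Value columns. A hypothetical sketch, assuming the logs have already been exported as CSV files under dpath/<run_id>/ and that the fourth argument only toggles a small progress message:

import os

import pandas as pd


def aggregate(dpath, list_dname, output_path, show=False):
    """Concatenate per-run CSV logs (columns: Step, Value) into one frame per run."""
    frames = {}
    for dname in list_dname:
        run_dir = os.path.join(dpath, dname)
        parts = [pd.read_csv(os.path.join(run_dir, name))
                 for name in sorted(os.listdir(run_dir)) if name.endswith('.csv')]
        merged = pd.concat(parts, ignore_index=True).sort_values('Step')
        merged.to_csv(os.path.join(output_path, '%s.csv' % dname), index=False)
        if show:
            print('aggregated %d log files for %s' % (len(parts), dname))
        frames[dname] = merged
    return frames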
Example #18
def read_summary(dbs, confs):
    """Return the read summary table"""
    chart = {}
    method = _read_summary
    configurations = confs['configurations']
    stats, failed = aggregate(dbs, configurations, method, lambda x, y: x + y)
    average_by = len(configurations) - failed
    if average_by == 0:
        label = ''
    elif average_by == 1:
        label = 'For one set of %ss' % confs['resolution']['id']
    else:
        label = 'Average over %s sets of %ss' % (average_by,
                                                 confs['resolution']['id'])
    chart['table_description'] = [
        (label, 'string'),
        ('Total', 'number'),
        ('Percent', 'number'),
    ]
    chart['table_data'] = _percentage_read_summary(stats, average_by)
    return chart
Example #19
def interpolate():
    traindataset = Train_dataset(1)
    iterations = math.ceil((len(traindataset.subject_list) * 0.2))
    # 817 subjects total: 0-654 for training, 654-817 for testing.
    totalpsnr = 0
    totalssim = 0
    array_psnr = np.empty(iterations)
    array_ssim = np.empty(iterations)
    batch_size = 1
    div_patches = 4
    num_patches = traindataset.num_patches
    img_width = 32  # 64
    img_height = 32  # 64
    img_depth = 23  # 46

    for i in range(0, iterations):
        XT_total = traindataset.data_true(654 + i)
        XT_mask = traindataset.mask(654 + i)
        volume_real = XT_total[0][:, :, :, np.newaxis]
        #volume_real_down = zoom(gaussian_filter(volume_real, sigma=1), [0.5, 0.5, 0.5, 1], prefilter=False, order=1)
        volume_real_down = zoom(volume_real, [0.5, 0.5, 0.5, 1])
        volume_generated = zoom(volume_real_down, [2, 2, 2, 1])
        #volume_generated = volume_generated[:, :, :, np.newaxis]
        #volume_real = XT_total[0][:, :, :, np.newaxis]
        volume_mask = aggregate(XT_mask)
        # compute metrics
        max_gen = np.amax(volume_generated)
        max_real = np.amax(volume_real)
        if max_gen > max_real:
            max = max_gen
        else:
            max = max_real
        min_gen = np.amin(volume_generated)
        min_real = np.amin(volume_real)
        if min_gen < min_real:
            min = min_gen
        else:
            min = min_real
        val_psnr = psnr(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=max - min)
        # val_psnr = psnr(volume_real, volume_generated,
        #                 dynamic_range=max - min)
        array_psnr[i] = val_psnr

        totalpsnr += val_psnr
        val_ssim = ssim(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=max - min,
                        multichannel=True)
        array_ssim[i] = val_ssim
        totalssim += val_ssim
        print(val_psnr)
        print(val_ssim)
        #save volumes
        filename_gen = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS,
                                    str(i) + 'gen.nii.gz')
        img_volume_gen = nib.Nifti1Image(volume_generated, np.eye(4))
        img_volume_gen.to_filename(filename_gen)
        filename_real = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS,
                                     str(i) + 'real.nii.gz')
        img_volume_real = nib.Nifti1Image(volume_real, np.eye(4))
        img_volume_real.to_filename(filename_real)
        filename_down = os.path.join(DEFAULT_SAVE_PATH_PREDICTIONS,
                                     str(i) + 'down.nii.gz')
        img_volume_down = nib.Nifti1Image(volume_real_down, np.eye(4))
        img_volume_down.to_filename(filename_down)
    return array_psnr, array_ssim
Example #20
                return (obs["_starter_K"] / obs["_starter_BB"])

    @staticmethod
    def calc_AVGIP(obs):

        return (obs._starter_IP / obs.n_starts)


if __name__ == "__main__":

    print("Beginning current season stat aggregation...")

    current_season = pd.read_csv("./all_data/current_season.csv")

    season_totals = aggregate(current_season)

    print("Beginning starting pitcher stat aggregation...")

    career_data = pd.read_csv("./all_data/past_raw.csv.gz", compression="gzip")

    career_starter = aggregate_starter_career(current_season, career_data)

    season_starter = aggregate_starter_season(current_season, career_data)

    print("Beginning metric calculation...")

    calc = Calculator(season_totals, season_starter, career_starter)

    calc.create_metrics()
Example #21
def detected_genes(dbs, confs):
    """Return a list of detected genes."""
    chart = {}

    def adding(x, y):
        """Add detected and keep biotyle and reliability."""
        return {
            'detected': x['detected'] + y['detected'],
            'biotype': x['biotype'],
            'reliability': x['reliability'],
        }

    stats, failed = aggregate(dbs,
                              confs['configurations'],
                              _detected_genes,
                              strategy=adding)

    if stats is None:
        chart['table_description'] = ['Type']
        chart['table_data'] = [[None]]
    else:
        biotypes = set()
        reliabilities = set()
        replicateids = set()
        for expid, biotype, reliability in stats.keys():
            replicateids.add(expid)
            reliabilities.add(reliability)
            biotypes.add(biotype)
        replicateids = list(replicateids)
        replicateids.sort()
        reliabilities = list(reliabilities)
        reliabilities.sort()
        biotypes = list(biotypes)
        biotypes.sort()

        description = [
            ('Type', 'string'),
        ]

        #c:[{v:'miRNA'},{v:'NOVEL'},{v:27}]

        for reliability in reliabilities:
            description.append((reliability, 'number'))

        description.append((confs['resolution']['title'], 'string'))

        chart['table_description'] = description

        results = []
        for expid in replicateids:
            for biotype in biotypes:
                row = [biotype]
                for reliability in reliabilities:
                    detected = stats.get((expid, biotype, reliability), None)
                    if detected is None:
                        row.append(None)
                    else:
                        row.append(int(detected['detected']))
                if row[1] or row[2]:
                    results.append(row + [expid])
        results.sort()
        chart['table_data'] = results
    return chart
Example #22
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["aggregation"][0]

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster ...')
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")

    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # to use the centroid from the tile instead
    # coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)
    #data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # --------------------------- #
    # GROUP CLUSTERS IN SAME TILE #
    # --------------------------- #
    # TODO: looks like shit
    cluster_N = 'n'
    print("Number of clusters: {} ".format(len(data)))

    def wavg(g, df, weight_series):
        w = df.ix[g.index][weight_series]
        return (g * w).sum() / w.sum()

    fnc = functools.partial(wavg, df=data, weight_series=cluster_N)

    try:
        data = data.groupby(["i", "j"]).agg({indicator: fnc, 'gpsLatitude': fnc, 'gpsLongitude': fnc}).reset_index()
    except KeyError:
        print("No weights, taking the average per i and j")
        data = data[['i', 'j', 'n', 'gpsLatitude', 'gpsLongitude', indicator]].groupby(["i", "j"]).mean().reset_index()

    print("Number of unique tiles: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # normalize the features

                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            #density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            #data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            #osm_features.append('density_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    # TODO: Use efficiently maxNDBImaxNDVImaxNDWI_sum_todf
    print('INFO: getting NDBI, NDVI, NDWI ...')

    start_date = "2017-01-01"  # TODO: Add to config, be careful no image before 2015
    end_date = "2018-01-01"
    for i in date_range(start_date, end_date, 3):
        print('INFO: getting max NDVI between dates: {}'.format(i))
        gee_ndvi_max_raster = gee_sentinel_raster(i[0], i[1], area, ind="NDVI")
        data["max_NDVI_{}_{}".format(i[0], i[1])] = data.apply(gee_raster_mean, args=(gee_ndvi_max_raster, "gpsLatitude", "gpsLongitude", "NDVI"), axis=1)

    print('INFO: getting max NDBI')
    gee_ndbi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDBI")
    data["max_NDBI"] = data.apply(gee_raster_mean, args=(gee_ndbi_max_raster, "gpsLatitude", "gpsLongitude", "NDBI"), axis=1)

    print('INFO: getting max NDWI')
    gee_ndwi_max_raster = gee_sentinel_raster(start_date, end_date, area, ind="NDWI")
    data["max_NDWI"] = data.apply(gee_raster_mean, args=(gee_ndwi_max_raster, "gpsLatitude", "gpsLongitude", "NDWI"), axis=1)

    # --------------- #
    # save features   #
    # --------------- #

    features_list = list(set(data.columns) - set(data_cols) - set(['i', 'j']))

    # Standardize Features (0 mean and 1 std)
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)  # shuffle data

    data_features = data[features_list]

    # if take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])
    from modeller import Modeller
    md = Modeller(['kNN', 'Kriging', 'RmSense', 'Ensamble'], data_features)
    cv_loops = 20
    md.compute(data[['i', 'j']], data[indicator].values, cv_loops)

    # save model for production
    md.save_models(id)
    print(str(np.datetime64('now')), 'INFO: model saved.')

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    r2, r2_var = np.mean(md.scores['Ensamble']), np.var(md.scores['Ensamble'])
    r2_knn, r2_var_knn = np.mean(md.scores['kNN']), np.var(md.scores['kNN'])
    r2_rmsense, r2_var_rmsense = np.mean(md.scores['RmSense']), np.var(md.scores['RmSense'])
    y_duplicated = np.repeat(data[indicator], cv_loops)
    mape_rmsense = np.mean(np.abs([item for sublist in md.results['RmSense'] for item in sublist] - y_duplicated) / y_duplicated)
    if mape_rmsense == float("inf") or mape_rmsense == float("-inf"):
        mape_rmsense = 0

    query = """
    insert into results_new (run_date, config_id, r2, r2_var, r2_knn, r2_var_knn, r2_features, r2_var_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        r2, r2_var, r2_knn, r2_var_knn, r2_rmsense, r2_var_rmsense, mape_rmsense)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #
    print('INFO: writing predictions to disk ...')
    results = pd.DataFrame({
        #'yhat': [item for sublist in md.results['kNN'] for item in sublist],
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)
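The aggregate(raster, base_raster, aggregate_factor) imported from utils here (and used the same way in the scoring scripts below) coarsens the WorldPop raster before it is turned into a grid. A hypothetical stand-in based on a plain rasterio resampled read; the real helper may well sum population counts per block instead of averaging them:

import rasterio
from rasterio.enums import Resampling


def aggregate(input_rst, output_rst, factor):
    """Write a copy of input_rst whose resolution is coarsened by an integer factor."""
    with rasterio.open(input_rst) as src:
        data = src.read(
            out_shape=(src.count, src.height // factor, src.width // factor),
            resampling=Resampling.average)
        # stretch the affine transform so the coarser pixels keep the same footprint
        transform = src.transform * src.transform.scale(
            src.width / data.shape[-1], src.height / data.shape[-2])
        profile = src.profile.copy()
    profile.update(height=data.shape[-2], width=data.shape[-1], transform=transform)
    with rasterio.open(output_rst, 'w', **profile) as dst:
        dst.write(data)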
Example #23
def main(id, aggregate_factor, min_pop, minlat, maxlat, minlon, maxlon, shapefile):

    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["satellite_grid"][0]

    scope = config["scope"][0]
    nightlights_date_start, nightlights_date_end = config["nightlights_date"][0].get("start"), config["nightlights_date"][0].get("end")
    s2_date_start, s2_date_end = config["NDs_date"][0].get("start"), config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    if aggregate_factor is None:
        aggregate_factor = config["base_raster_aggregation"][0]

    if aggregate_factor > 1:
        print('INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    dataset_df = pd.read_csv(dataset)
    data_cols = dataset_df.columns.values

    # create geometry
    if (minlat is None) and (maxlat is None) and (minlon is None) and (maxlon is None):
        minlat, maxlat, minlon, maxlon = df_boundaries(dataset_df, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")

    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform
                     })

    final_raster = "../tmp/final_raster.tif"
    print('INFO: Removing tiles with population under {}'.format(min_pop))  # only score tiles where at least min_pop people live
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = RasterGrid(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    data = pd.DataFrame({"i": list_i, "j": list_j})
    data["gpsLatitude"] = coords_y
    data["gpsLongitude"] = coords_x

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'scoring'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')

            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
            # normalize the features

            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop('index', 1)
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')

    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)

    # --------------- #
    # save features   #
    # --------------- #

    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Standardize Features (0 mean and 1 std)
    # TODO: use mean and max from training
    print("INFO: Normalizing by the max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(id, pipeline), index=False)

    # Open model
    ensemble_pipeline = joblib.load('../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data[features_list + ["gpsLatitude", "gpsLongitude"]]
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    # if take log of indicator
    if config['log'][0]:
        ensemble_predictions = np.exp(ensemble_predictions)

    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': ensemble_predictions})

    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results, value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = "../tmp/final_raster.tif"

        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)
        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
Example #24
def main(top_left, bottom_left, bottom_right, top_right, config_id):

    # ------#
    # SETUP #
    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.load(cfgfile)

    # connect to db and read config table
    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(config_id), engine)

    raster = config["satellite_grid"][0]
    nightlights_date = config.get("nightlights_date")[0]
    base_raster = "../tmp/local_raster.tif"
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop Raster too fine, aggregate #
    aggregate(raster, base_raster, 1)

    # -------------------  #
    # CLIP RASTER TO SCOPE #
    geoms = [{'type': 'Polygon', 'coordinates': [[top_left, bottom_left, bottom_right, top_right]]}]

    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, geoms, crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform
                     })

    with rasterio.open(base_raster, "w", **out_meta) as dest:
        dest.write(out_image)

    # load the new clipped raster to the img_lib
    GRID = RasterGrid(base_raster)
    with rasterio.open(base_raster) as src:
        list_j, list_i = np.where(src.read()[0] != src.nodata)
    print("INFO: downloading images in scope ...")
    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date)
            print('INFO: images downloaded.')

            print('INFO: scoring ...')
            # extract the features
            network = NNExtractor(config_id, sat, GRID.image_dir, sat, step, GRID)
            print('INFO: extractor instantiated.')
            features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline='scoring')
            # normalize the features
            features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, config_id, 'scoring'), index=False)

        g_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Google", config_id, 'scoring'))
        s_features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format("Sentinel", config_id, 'scoring'))

        data = pd.merge(g_features, s_features, on=['i', 'j', 'index'])
        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(config_id), index=False)

        print('INFO: features extracted.')

    else:
        data = pd.DataFrame({'gpsLongitude': coords_x, 'gpsLatitude': coords_y, 'j': list_j, 'i': list_i})
    # --------------- #
    # add nightlights #
    # --------------- #
    from geojson import Polygon
    from nightlights import Nightlights

    area = Polygon([[top_left, bottom_left, bottom_right, top_right]])

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date)
    data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(data)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            # density = data.apply(OSM.density, args=(osm_gdf["value"],), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))
            # data['density_{}'.format(value)] = density.apply(lambda x: np.log(0.0001 + x))
            # osm_features.append('density_{}'.format(value))

    # ---------------------- #
    # LOAD MODEL AND PREDICT #
    print("INFO: load model and predict ...")
    try:
        X = data.drop(['index', 'i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)
    except ValueError:
        X = data.drop(['i', 'j', 'gpsLongitude', 'gpsLatitude'], axis=1)
    # load model and predict
    try:
        RmSense = joblib.load('../Models/RmSense_model_config_id_{}.pkl'.format(config_id))
        kNN = joblib.load('../Models/kNN_model_config_id_{}.pkl'.format(config_id))
    except FileNotFoundError:
        print('ERROR: model not found')

    yhat = (RmSense.predict(X) + kNN.predict(data[['i','j']])) / 2.
    results = pd.DataFrame({'i': list_i, 'j': list_j, 'lat': coords_y, 'lon': coords_x, 'yhat': yhat})

    outfile = "../Data/Results/scalerout_{}.tif".format(config_id)
    tifgenerator(outfile=outfile,
                 raster_path=base_raster,
                 df=results)
Example #25
 def AggregateSlopes(self):
     """Remove sequential entries which have the same slope."""
     self.entries['slope'] = utils.aggregate(self.entries['slope'])
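Here utils.aggregate only collapses runs of identical slope values, as the docstring says. A minimal sketch, assuming nothing more than dropping consecutive duplicates:

from itertools import groupby


def aggregate(values):
    """Drop consecutive duplicates, e.g. [1, 1, 2, 2, 1] -> [1, 2, 1]."""
    return [key for key, _ in groupby(values)]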
Example #26
def evaluate(img_width, img_height, img_depth, upsampling_factor):

    # dataset & variables
    traindataset = Train_dataset(1)
    iterations = math.ceil(
        (len(traindataset.subject_list) *
         0.2))  # 817 subjects total: 0-654 for training, 654-817 for testing.
    print(len(traindataset.subject_list))
    print(iterations)
    totalpsnr = 0
    totalssim = 0
    array_psnr = np.empty(iterations)
    array_ssim = np.empty(iterations)
    batch_size = 1
    div_patches = 4
    num_patches = traindataset.num_patches
    img_width = img_width  # 224
    img_height = img_height  # 224
    img_depth = img_depth  # 152

    # define model
    t_input_gen = tf.placeholder('float32', [1, None, None, None, 1],
                                 name='t_image_input_to_SRGAN_generator')
    srgan_network = generator(t_input_gen,
                              kernel=3,
                              nb=6,
                              upscaling_factor=upsampling_factor,
                              is_train=False,
                              reuse=False,
                              img_width=img_width,
                              img_height=img_height,
                              img_depth=img_depth)

    # restore g
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False))

    saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="SRGAN_g"))
    saver.restore(
        sess,
        tf.train.latest_checkpoint(
            '/work/isanchez/g/ds4-gdl-lrdecay/subpixel'))

    for i in range(0, iterations):
        # extract volumes
        xt_total = traindataset.data_true(
            654 + i)  # [[self.batch_size, 224, 224, 152]]
        xt_mask = traindataset.mask(654 + i)
        xg_generated = np.empty([1, 224, 224, 152, 1])
        normfactor = (np.amax(xt_total[0])) / 2
        x_generator = ((xt_total[0] - normfactor) / normfactor)
        res = 1 / upsampling_factor
        x_generator = x_generator[:, :, :, np.newaxis]
        x_generator = zoom(x_generator, [res, res, res, 1])
        # x_generator = gaussian_filter(x_generator, sigma=1)
        xg_generated[0] = sess.run(srgan_network.outputs,
                                   {t_input_gen: x_generator[np.newaxis, :]})
        xg_generated[0] = ((xg_generated[0] + 1) * normfactor)
        volume_real = xt_total[0]
        volume_real = volume_real[:, :, :, np.newaxis]
        volume_generated = xg_generated[0]
        volume_mask = aggregate(xt_mask)
        # compute metrics
        max_gen = np.amax(volume_generated)
        max_real = np.amax(volume_real)
        if max_gen > max_real:
            val_max = max_gen
        else:
            val_max = max_real
        min_gen = np.amin(volume_generated)
        min_real = np.amin(volume_real)
        if min_gen < min_real:
            val_min = min_gen
        else:
            val_min = min_real
        val_psnr = psnr(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=val_max - val_min)
        array_psnr[i] = val_psnr

        totalpsnr += val_psnr
        val_ssim = ssim(np.multiply(volume_real, volume_mask),
                        np.multiply(volume_generated, volume_mask),
                        dynamic_range=val_max - val_min,
                        multichannel=True)
        array_ssim[i] = val_ssim
        totalssim += val_ssim
        print(val_psnr)
        print(val_ssim)
        # save volumes
        filename_gen = os.path.join(args.path_volumes, str(i) + 'gen.nii.gz')
        img_volume_gen = nib.Nifti1Image(volume_generated, np.eye(4))
        img_volume_gen.to_filename(filename_gen)
        filename_real = os.path.join(args.path_volumes, str(i) + 'real.nii.gz')
        img_volume_real = nib.Nifti1Image(volume_real, np.eye(4))
        img_volume_real.to_filename(filename_real)

    print('{}{}'.format('Mean PSNR: ', array_psnr.mean()))
    print('{}{}'.format('Mean SSIM: ', array_ssim.mean()))
    print('{}{}'.format('Variance PSNR: ', array_psnr.var()))
    print('{}{}'.format('Variance SSIM: ', array_ssim.var()))
    print('{}{}'.format('Max PSNR: ', array_psnr.max()))
    print('{}{}'.format('Min PSNR: ', array_psnr.min()))
    print('{}{}'.format('Max SSIM: ', array_ssim.max()))
    print('{}{}'.format('Min SSIM: ', array_ssim.min()))
Example #27
        domains = utils.read_json(utils.data_location / subfolder / 'domains.json')
        all_domains.extend(domains)
    if config['rebuttals']:
        rebuttals = utils.read_json(utils.data_location / subfolder / 'rebuttals.json')
        for source_url, rebuttal_l in rebuttals.items():
            for rebuttal_url, source in rebuttal_l.items():
                all_rebuttals[source_url][rebuttal_url].append(source)

urls_cnt = len(all_urls)
domains_cnt = len(all_domains)
fake_urls_cnt = len([el for el in all_urls if el['label'] == 'fake'])
fake_domains_cnt = len([el for el in all_domains if el['label'] == 'fake'])
print('#urls', urls_cnt, ': fake', fake_urls_cnt, 'true', urls_cnt - fake_urls_cnt)
print('#domains', domains_cnt, ': fake', fake_domains_cnt, 'true', domains_cnt - fake_domains_cnt)

aggregated_urls = utils.aggregate(all_urls)
aggregated_domains = utils.aggregate(all_domains, 'domain')

utils.write_json_with_path(aggregated_urls, utils.data_location, 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, utils.data_location, 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, utils.data_location, 'aggregated_rebuttals.json')

# copy to backend
utils.write_json_with_path(aggregated_urls, Path('../backend'), 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, Path('../backend'), 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, Path('../backend'), 'aggregated_rebuttals.json')

utils.print_stats(aggregated_urls)
utils.print_stats(aggregated_domains)

print('updating mappings, it may take a while')
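In this script utils.aggregate groups the labelled URLs (or domains, when 'domain' is passed as the key) and reduces every group to a single record. A hypothetical sketch with an assumed majority-vote rule; the key argument and the field names other than label are illustrative:

from collections import defaultdict


def aggregate(items, key='url'):
    """Group labelled items by key and keep a majority label plus vote counts."""
    grouped = defaultdict(list)
    for item in items:
        grouped[item[key]].append(item['label'])
    result = {}
    for value, labels in grouped.items():
        fake_votes = sum(1 for label in labels if label == 'fake')
        result[value] = {
            'label': 'fake' if fake_votes * 2 >= len(labels) else 'true',
            'fake_count': fake_votes,
            'true_count': len(labels) - fake_votes,
        }
    return result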
Example #28
def callback(ch, method, properties, body):
    data_dict = json.loads(body)
    fickle_file = os.path.join(MRB_TOP, 'MC_genie_numu_CC_seed-service.fcl')
    print data_dict

    job_id = str(data_dict.get('job_id'))
    batch_id = str(data_dict.get('batch_id'))
    batch_object = Batch.objects.get(job_id=job_id, batch_id=batch_id)

    if not job_id or not batch_id or not batch_object:
        print('No id!')
        ch.basic_ack(delivery_tag=method.delivery_tag)
        return

    print("<received_job>\n <job_id>   {0}\n "
          "<batch_id> {1}\n".format(job_id, batch_id))
    # Create log and error file
    if data_dict.get('log_dir'):
        log_path = os.path.join(LOG_TOP, data_dict.get('log_dir'))
    else:
        log_path = os.path.join(LOG_TOP, job_id)
    mkdir_p(log_path)
    log_file_path = os.path.join(log_path, str(batch_id) + '.log')
    err_file_path = os.path.join(log_path, 'err_' + str(batch_id) + '.log')
    log_file = open(log_file_path, 'w')
    err_file = open(err_file_path, 'w')
    batch_object.log_path = log_file_path
    batch_object.err_path = err_file_path
    batch_object.start_time = datetime.datetime.now()

    # if new file, copy new file to MRB_TOP DATA_TOP + new_file
    try:
        if data_dict.get('new_file'):
            new_file = str(data_dict.get('new_file')).replace(" ", "")
            new_file_path = os.path.join(DATA_TOP, new_file)
            shutil.copy(new_file_path, MRB_TOP)
            print >> log_file, "<new_file> '{0}' copied to $MRB_TOP".format(new_file)
            fickle_file = os.path.join(MRB_TOP, new_file.split('/')[-1])

        # check out_path exists, if exist cd, else make and cd, if no exist then make id dir
        # $DATA_TOP/out_dir/id
        if data_dict.get('out_dir'):
            out_path = os.path.join(DATA_TOP, data_dict.get('out_dir'), job_id)
            if not os.path.exists(out_path):
                mkdir_p(out_path)
            os.chdir(out_path)
        else:
            out_path = os.path.join(DATA_TOP, job_id)
            mkdir_p(out_path)
            os.chdir(out_path)
        batch_object.out_path = out_path
        batch_object.save()

    except Exception as e:
        update_error(batch_id, job_id, e)
        print >> err_file, '<error> {0}'.format(e)
        print(e)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        return

    exec_list = [
        'lar -c {0} -n {1} -o single_gen_{2}.root'.format(fickle_file, data_dict.get('events'),
                                                          batch_id),
        'lar -c /products/dev/WireDump_numu_NC-1.fcl -s single_gen_{0}.root -T wire_dump_out_{0}.root'.format(
            batch_id),
        'python /products/dev/ProcessRootFile.py ./wire_dump_out_{0}.root out_{0}'.format(batch_id)]

    print >> log_file, '<working dir> {0}'.format(out_path)

    for cmd in exec_list:
        print >> log_file, '<cmd> {0}'.format(cmd)
        try:
            update_status(batch_id, job_id, cmd)
            sp.call(cmd, shell=True, stdout=log_file, stderr=err_file)
        except Exception as e:
            print(e)
            print >> err_file, '<error> {0}'.format(e)
            update_error(batch_id, job_id, e)

    # Change permissions
    for r, d, f in os.walk(os.getcwd()):
        os.chmod(r, 0777)

    # Aggregate output
    aggregate(out_path, LINK_PATH)

    print "<job end>"
    print >> log_file, "<job end>"
    log_file.close()
    err_file.close()
    update_complete(batch_id, job_id)
    ch.basic_ack(delivery_tag=method.delivery_tag)
    return
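The final aggregate(out_path, LINK_PATH) call gathers the batch output under a shared directory. A hypothetical sketch, assuming it simply links the produced files into LINK_PATH (it runs in the same Python 2 environment as the worker above, but is valid in Python 3 too):

import os


def aggregate(out_path, link_path):
    """Expose every file produced in out_path under the shared link_path."""
    if not os.path.isdir(link_path):
        os.makedirs(link_path)
    for name in os.listdir(out_path):
        target = os.path.join(link_path, name)
        if not os.path.exists(target):
            os.symlink(os.path.join(out_path, name), target)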
Example #29
def process_record(data_line_, prediction_line_, neg_gap_, feature_dir_,
                   record_dir_, match_fn, all_doc_scores, all_ans_scores,
                   z_scores):
    missing_count_ = 0
    total_count_ = 0
    stop_count_ = 0
    data = json.loads(data_line_)
    question = data['question']
    q_id = slugify(question)
    q_path = os.path.join(feature_dir_, '%s.json' % q_id)
    n_q = [0 for _ in Tokenizer.FEAT]
    if os.path.exists(q_path):
        q_data = open(q_path, encoding=ENCODING).read()
        record = json.loads(q_data)
        q_ner = record['ner']
        q_pos = record['pos']
        for feat in q_ner + q_pos:
            n_q[Tokenizer.FEAT_DICT[feat]] += 1
    else:
        print('question feature file %s not exist!' % q_path)
        sys.stdout.flush()
        missing_count_ += 1
        return missing_count_, total_count_, stop_count_

    answer = [normalize(a) for a in data['answer']]
    prediction = json.loads(prediction_line_)
    # MAKE SURE REVERSE IS TRUE
    ranked_prediction = sorted(prediction,
                               key=lambda k: k['doc_score'],
                               reverse=True)
    correct_rank = get_rank(prediction, answer, match_fn)
    if correct_rank > 150:
        #  if correct_rank < 50 or correct_rank > 150:
        return missing_count_, total_count_, stop_count_

    all_corr_rank.append(correct_rank - 1)

    all_n_p = []
    all_n_a = []
    all_p_scores = []
    all_a_scores = []
    all_probs = []
    all_spans = []
    repeats = 0
    for i, entry in enumerate(ranked_prediction):
        doc_id = entry['doc_id']
        start = int(entry['start'])
        end = int(entry['end'])
        doc_score = entry['doc_score']
        ans_score = entry['span_score']
        prob = entry['prob']
        span = entry['span']

        #        RESTRICT TO MAX 1000000000
        #        print("Threshold 1000000")
        #        ans_score=min(ans_score, 1000000) #restrict to max of million

        if span in all_spans:
            repeats += 1

        all_spans.append(span)

        ################Calculate sample z score (t statistic) for answer score
        if len(all_a_scores) <= 1:  # don't use the a_zscore feature at the beginning or when we only have one score
            a_zscore = 0
        else:  # Take the sample mean of the previous ones, take zscore of the current with respect to that
            #            sample_mean = np.mean(all_a_scores + [ans_score])
            sample_mean = np.mean(all_a_scores)
            #            sample_std = np.std(all_a_scores + [ans_score])
            sample_std = np.std(all_a_scores)
            #            if sample_std != 0:
            a_zscore = (ans_score - sample_mean) / sample_std
            #            else:
            #                a_zscore = 0
            z_scores.append(a_zscore)

        # THESE ARE FOR STATISTICS OVER THE ENTIRE DATA SET, IGNORE
        all_doc_scores.append(doc_score)
        all_ans_scores.append(ans_score)

        corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD
        corr_ans_mean_score = (np.mean(all_a_scores + [ans_score]) -
                               ANS_MEAN) / ANS_STD

        all_probs.append(prob)
        ###############

        p_pos = dict()
        p_ner = dict()
        feat_file = os.path.join(feature_dir_, '%s.json' % doc_id)
        if os.path.exists(feat_file):
            record = json.load(open(feat_file))
            p_ner[doc_id] = record['ner']
            p_pos[doc_id] = record['pos']
        n_p = [0 for _ in Tokenizer.FEAT]
        n_a = [0 for _ in Tokenizer.FEAT]
        for feat in p_ner[doc_id] + p_pos[doc_id]:
            n_p[Tokenizer.FEAT_DICT[feat]] += 1

        for feat in p_ner[doc_id][start:end + 1] + p_pos[doc_id][start:end +
                                                                 1]:
            n_a[Tokenizer.FEAT_DICT[feat]] += 1

        all_n_p.append(n_p)
        all_n_a.append(n_a)

        all_p_scores.append(doc_score)
        all_a_scores.append(ans_score)

        f_np = aggregate(all_n_p)
        f_na = aggregate(all_n_a)
        f_sp = aggregate(all_p_scores)
        f_sa = aggregate_ans(all_a_scores)

        record = OrderedDict()

        # sp, nq, np, na, ha
        record['sp'] = f_sp
        record['nq'] = list(map(float, n_q))
        record['np'] = f_np
        record['na'] = f_na
        record['sa'] = f_sa
        record['a_zscore'] = a_zscore
        record['corr_doc_score'] = corr_doc_score
        record['i'] = i
        record['prob_avg'] = sum(all_probs) / len(all_probs)
        record['prob'] = prob
        record['repeats'] = repeats
        record['ans_avg'] = corr_ans_mean_score

        if i + 1 == correct_rank:
            record['stop'] = 1
            stop_count_ += 1
            write_record = True
            should_return = True
            # (Variants were tried and left disabled: stopping at
            # i + 1 >= correct_rank, subsampling positives with neg_gap_,
            # and only returning once i + 1 - correct_rank > 30.)
        else:
            should_return = False
            if i % neg_gap_ == 0:
                record['stop'] = 0
                write_record = True
            else:
                write_record = False
        if write_record:
            record_path = os.path.join(record_dir_,
                                       '%s_%s.pkl' % (q_id, doc_id))
            with open(record_path, 'wb') as f:
                pk.dump(record, f)
            total_count_ += 1
        if should_return:
            return missing_count_, total_count_, stop_count_
    return missing_count_, total_count_, stop_count_
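
The running z-score used as the a_zscore feature above can be isolated into a small helper. The sketch below is an illustration only, not code from the original project; it follows the same convention of returning 0 until at least two previous scores are available, and additionally guards against a zero standard deviation.

import numpy as np

def running_zscore(previous_scores, current_score):
    """Z-score of current_score relative to the scores seen so far."""
    if len(previous_scores) < 2:
        # Not enough history for the feature to be meaningful.
        return 0.0
    mean = np.mean(previous_scores)
    std = np.std(previous_scores)
    if std == 0:
        # All previous scores are identical; avoid division by zero.
        return 0.0
    return float((current_score - mean) / std)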
Example #30
0
def run(id):
    # ----------------- #
    # SETUP #############
    # ----------------- #

    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}"""
                           .format(private_config['DB']['user'], private_config['DB']['password'],
                                   private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query("select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    indicator = config["indicator"][0]
    raster = config["satellite_grid"][0]
    aggregate_factor = config["base_raster_aggregation"][0]
    scope = config["scope"][0]
    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")
    s2_date_start = config["NDs_date"][0].get("start")
    s2_date_end = config["NDs_date"][0].get("end")
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        step = config['satellite_config'][0].get("satellite_step")

    # ----------------------------------- #
    # WorldPop raster too fine: aggregate #
    # ----------------------------------- #
    from utils import aggregate
    if aggregate_factor > 1:
        print('INFO: aggregating raster {}'.format(raster))
        base_raster = "../tmp/local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # -------- #
    # DATAPREP #
    # -------- #
    data = pd.read_csv(dataset)
    data_cols = data.columns.values

    # grid
    GRID = RasterGrid(base_raster)
    list_i, list_j = GRID.get_gridcoordinates(data)

    # OPTIONAL: REPLACING THE CLUSTER COORDINATES BY THE CORRESPONDING GRID CENTER COORDINATES
    # data['gpsLongitude'], data['gpsLatitude'] = coords_x, coords_y

    data["i"], data["j"] = list_i, list_j

    # Get Polygon Geojson of the boundaries
    minlat, maxlat, minlon, maxlon = df_boundaries(data, buffer=0.05, lat_col="gpsLatitude", lon_col="gpsLongitude")
    area = points_to_polygon(minlon, minlat, maxlon, maxlat)

    print("Number of clusters: {} ".format(len(data)))

    list_i, list_j, pipeline = data["i"], data["j"], 'evaluation'

    # ------------------------------------------------------------- #
    # download images from Google and Sentinel and Extract Features #
    # ------------------------------------------------------------- #
    if config["satellite_config"][0]["satellite_images"] != 'N':

        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        for sat in ['Google', 'Sentinel']:
            print('INFO: routine for provider: ', sat)
            # download the images from the relevant API
            GRID.download_images(list_i, list_j, step, sat, start_date, end_date, zoom_vhr=16, img_size_sentinel=5000)
            print('INFO: images downloaded.')

            if os.path.exists("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline)):
                print('INFO: already scored.')
                features = pd.read_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline))
            else:
                print('INFO: scoring ...')
                # extract the features
                network = NNExtractor(id, sat, GRID.image_dir, sat, step, GRID)
                print('INFO: extractor instantiated.')

                features = network.extract_features(list_i, list_j, sat, start_date, end_date, pipeline)
                # (features are normalized later, after all providers are merged)

                features.to_csv("../Data/Features/features_{}_id_{}_{}.csv".format(sat, id, pipeline), index=False)

            features = features.drop(columns='index')
            data = data.merge(features, on=["i", "j"])

        data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

        print('INFO: features extracted.')

    # --------------- #
    # add nightlights #
    # --------------- #

    from nightlights import Nightlights

    NGT = Nightlights(area, '../Data/Geofiles/nightlights/', nightlights_date_start, nightlights_date_end)
    data['nightlights'] = NGT.nightlights_values(data)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}
    osm_features = []

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            osm_tree = OSM.gpd_to_tree(osm_gdf["value"])
            dist = data.apply(OSM.distance_to_nearest, args=(osm_tree,), axis=1)
            data['distance_{}'.format(value)] = dist.apply(lambda x: np.log(0.0001 + x))
            osm_features.append('distance_{}'.format(value))

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')

    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end, scope)
    S2.download()
    data[['max_NDVI', 'max_NDBI', 'max_NDWI']] = S2.rms_values(data).apply(pd.Series)
    # --------------- #
    # save features   #
    # --------------- #
    # features to be use in the linear model
    features_list = list(sorted(set(data.columns) - set(data_cols) - set(['i', 'j'])))

    # Scale features: mean-centre, then divide by the column maximum
    # (standard 0-mean/1-std scaling was tried and left commented out):
    # data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].std()
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()) / data[features_list].max()

    data.to_csv("../Data/Features/features_all_id_{}_evaluation.csv".format(id), index=False)

    # --------------- #
    # model indicator #
    # --------------- #
    # shuffle the dataset
    data = data.sample(frac=1, random_state=1783).reset_index(drop=True)

    # if set in the config, take log of indicator
    if config['log'][0]:
        data[indicator] = np.log(data[indicator])

    from modeller import Modeller
    X, y = data[features_list + ["gpsLatitude", "gpsLongitude"]], data[indicator]
    modeller = Modeller(X, rs_features=features_list, spatial_features=["gpsLatitude", "gpsLongitude"], scoring='r2', cv_loops=20)

    kNN_pipeline = modeller.make_model_pipeline('kNN')
    kNN_scores = modeller.compute_scores(kNN_pipeline, y)
    kNN_R2_mean = kNN_scores.mean()
    kNN_R2_std = kNN_scores.std()
    print("kNN_R2_mean: ", kNN_R2_mean, "kNN_R2_std: ", kNN_R2_std)

    Ridge_pipeline = modeller.make_model_pipeline('Ridge')
    Ridge_scores = modeller.compute_scores(Ridge_pipeline, y)
    Ridge_R2_mean = Ridge_scores.mean()
    Ridge_R2_std = Ridge_scores.std()
    print("Ridge_R2_mean: ", Ridge_R2_mean, "Ridge_R2_std: ", Ridge_R2_std)

    Ensemble_pipeline = modeller.make_ensemble_pipeline([kNN_pipeline, Ridge_pipeline])
    Ensemble_scores = modeller.compute_scores(Ensemble_pipeline, y)
    Ensemble_R2_mean = Ensemble_scores.mean()
    Ensemble_R2_std = Ensemble_scores.std()
    print("Ensemble_R2_mean: ", Ensemble_R2_mean, "Ensemble_R2_std: ", Ensemble_R2_std)

    # ------------------ #
    # write scores to DB #
    # ------------------ #

    query = """
    insert into results_new (run_date, config_id, r2, r2_sd, r2_knn, r2_sd_knn, r2_features, r2_sd_features, mape_rmsense)
    values (current_date, {}, {}, {}, {}, {}, {}, {}, {}) """.format(
        config['id'][0],
        Ensemble_R2_mean, Ensemble_R2_std, kNN_R2_mean, kNN_R2_std, Ridge_R2_mean, Ridge_R2_std, 0)
    engine.execute(query)

    # ------------------------- #
    # write predictions to file #
    # ------------------------- #

    print('INFO: writing predictions to disk ...')

    from sklearn.model_selection import cross_val_predict
    results = pd.DataFrame({
        'yhat': cross_val_predict(Ensemble_pipeline, X.values, y),
        'y': data[indicator].values,
        'lat': data['gpsLatitude'],
        'lon': data['gpsLongitude']})
    results.to_csv('../Data/Results/config_{}.csv'.format(id), index=False)

    # save model for production
    Ensemble_pipeline.fit(X.values, y)

    # Best n_neighbors (kNN)
    print('INFO: number of neighbours chosen: ', Ensemble_pipeline.regr_[0].named_steps['gridsearchcv'].best_params_)
    # Best alpha (Ridge)
    print('INFO: regularization param chosen: ', Ensemble_pipeline.regr_[1].named_steps['gridsearchcv'].best_params_)

    import joblib  # sklearn.externals.joblib was removed in newer scikit-learn versions
    joblib.dump(Ensemble_pipeline, '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model saved.')
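
The aggregate() helper called near the top of run() (and again in the next example) comes from the project's utils module, whose implementation is not shown here. A minimal sketch of what such a raster-aggregation helper might look like is given below, assuming rasterio is available and that population counts should be summed into coarser cells; treat it as an illustration, not the project's actual code.

import rasterio
from rasterio.enums import Resampling

def aggregate(input_raster, output_raster, factor):
    """Downsample a raster by an integer factor, summing the covered cells."""
    with rasterio.open(input_raster) as src:
        data = src.read(
            out_shape=(src.count, src.height // factor, src.width // factor),
            # Resampling.sum needs a recent rasterio/GDAL; Resampling.average
            # is a safe fallback if mean values are acceptable.
            resampling=Resampling.sum,
        )
        # Rescale the affine transform to the coarser grid.
        transform = src.transform * src.transform.scale(
            src.width / data.shape[-1],
            src.height / data.shape[-2],
        )
        profile = src.profile
        profile.update(height=data.shape[-2], width=data.shape[-1],
                       transform=transform)
    with rasterio.open(output_raster, 'w', **profile) as dst:
        dst.write(data)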
Example #31
0
def main(id, aggregate_factor, min_pop, bbox, shapefile):
    """ makes predictions is areas where we have no survey.
    Args:
        id (int): the config id
        aggregate_factor (int): aggregate pixels to lower resolution by x much
        min_pop: minimium population in pixel to score
        bbox: bounding box <minlat> <minlon> <maxlat> <maxlon>, if omitted will use boundaries from dataset
        shapefile: aggregate within shapefile's geometires

    Example:
        id, aggregate_factor, min_pop = 3075, 15, 500
    """
    # read the configs for id
    print(str(np.datetime64('now')), " INFO: config id =", id)

    with open('../private_config.yml', 'r') as cfgfile:
        private_config = yaml.safe_load(cfgfile)

    engine = create_engine("""postgresql+psycopg2://{}:{}@{}/{}""".format(
        private_config['DB']['user'], private_config['DB']['password'],
        private_config['DB']['host'], private_config['DB']['database']))

    config = pd.read_sql_query(
        "select * from config_new where id = {}".format(id), engine)
    dataset = config.get("dataset_filename")[0]
    raster = config["base_raster"][0]
    scope = config["scope"][0]
    nightlights_date_start = config["nightlights_date"][0].get("start")
    nightlights_date_end = config["nightlights_date"][0].get("end")
    s2_date_start = config["NDs_date"][0].get("start")
    s2_date_end = config["NDs_date"][0].get("end")
    ISO = config["iso3"][0]
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        print('INFO: satellite images from Google and Sentinel-2')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'G':
        print('INFO: only Google satellite images.')
        step = config['satellite_config'][0].get("satellite_step")
    elif config['satellite_config'][0].get('satellite_images') == 'N':
        print('INFO: no satellite images')

    # -------------------------------------------------------- #
    # WorldPop raster too granular (too many cells): aggregate  #
    # -------------------------------------------------------- #
    if aggregate_factor > 1:
        print(
            'INFO: aggregating raster with factor {}'.format(aggregate_factor))
        base_raster = "../local_raster.tif"
        aggregate(raster, base_raster, aggregate_factor)
    else:
        base_raster = raster

    # ---------------- #
    # AREA OF INTEREST #
    # ---------------- #
    # dataset_df = pd.read_csv(dataset)
    # data_cols = dataset_df.columns.values

    if sum(bbox) != 0:  # an all-zero bbox means "no bbox provided"
        print("INFO: using AOI from bbox")
        # define AOI with manually defined bbox
        minlat, minlon, maxlat, maxlon = bbox[0], bbox[1], bbox[2], bbox[3]
        area = points_to_polygon(minlat=minlat,
                                 minlon=minlon,
                                 maxlat=maxlat,
                                 maxlon=maxlon)
    else:
        print("INFO: using AOI from dataset.")
        # use dataset's extent
        dataset_df = pd.read_csv(dataset)
        minlat, maxlat, minlon, maxlon = boundaries(dataset_df['gpsLatitude'],
                                                    dataset_df['gpsLongitude'])
        area = points_to_polygon(minlat=minlat,
                                 minlon=minlon,
                                 maxlat=maxlat,
                                 maxlon=maxlon)
        del dataset_df

    # crop raster
    with rasterio.open(base_raster) as src:
        out_image, out_transform = mask(src, [area], crop=True)
        out_meta = src.meta.copy()

    # save the resulting raster
    out_meta.update({
        "driver": "GTiff",
        "height": out_image.shape[1],
        "width": out_image.shape[2],
        "transform": out_transform
    })

    final_raster = "../final_raster.tif"
    # only score cells whose (aggregated) population is at least min_pop
    print('INFO: Removing tiles with population under {}'.format(min_pop))
    with rasterio.open(final_raster, "w", **out_meta) as dest:
        out_image[out_image < min_pop] = dest.nodata
        dest.write(out_image)
        list_j, list_i = np.where(out_image[0] != dest.nodata)

    # instantiate GRID
    GRID = BaseLayer(final_raster)

    coords_x, coords_y = np.round(GRID.get_gpscoordinates(list_i, list_j), 5)

    ix = pd.MultiIndex.from_arrays([list_i, list_j, coords_y, coords_x],
                                   names=('i', 'j', "gpsLatitude",
                                          "gpsLongitude"))

    print("Number of clusters: {} ".format(len(ix)))

    pipeline = 'scoring'

    # ------------------------------------------------ #
    # download images from Google and Extract Features #
    # ------------------------------------------------ #
    data = None  # filled in by the feature blocks below
    if config['satellite_config'][0].get('satellite_images') in ['Y', 'G']:
        features_path = "../Data/Features/features_Google_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"

        gimages = GoogleImages(data_path)
        # download the images from the relevant API
        gimages.download(coords_x, coords_y, step=step)
        # extract the features
        features = pd.DataFrame(gimages.featurize(coords_x,
                                                  coords_y,
                                                  step=step),
                                index=ix)
        features.columns = [str(col) + '_Google' for col in features.columns]
        features.to_csv(features_path)
        print('INFO: features extracted.')
        data = features.copy()
    # ------------------------------------------------------------- #
    # download Sentinel images and Extract Features #
    # ------------------------------------------------------------- #
    if config['satellite_config'][0].get('satellite_images') == 'Y':
        features_path = "../Data/Features/features_Sentinel_id_{}_{}.csv".format(
            id, pipeline)
        data_path = "../Data/Satellite/"
        start_date = config["satellite_config"][0]["start_date"]
        end_date = config["satellite_config"][0]["end_date"]

        from sentinel_images import SentinelImages

        simages = SentinelImages(data_path)
        # download the images from the relevant API
        simages.download(coords_x, coords_y, start_date, end_date)
        print('INFO: scoring ...')
        # extract the features
        print('INFO: extractor instantiated.')
        features = pd.DataFrame(simages.featurize(coords_x, coords_y,
                                                  start_date, end_date),
                                index=ix)

        features.columns = [str(col) + '_Sentinel' for col in features.columns]
        features.to_csv(features_path)

        if data is not None:
            data = data.join(features)
        else:
            data = features.copy()
        print('INFO: features extracted')

    # --------------- #
    # add nightlights #
    # --------------- #
    from nightlights import Nightlights

    nlights = Nightlights('../Data/Geofiles/')
    nlights.download(area, nightlights_date_start, nightlights_date_end)
    features = pd.DataFrame(nlights.featurize(coords_x, coords_y),
                            columns=['nightlights'],
                            index=ix)
    # quantize nightlights
    features['nightlights'] = pd.qcut(features['nightlights'],
                                      5,
                                      labels=False,
                                      duplicates='drop')
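    # pd.qcut above bins nightlights into at most 5 quantile classes;
    # duplicates='drop' avoids errors when the skewed distribution yields
    # repeated bin edges.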

    data = features if data is None else data.join(features)

    # ---------------- #
    # add OSM features #
    # ---------------- #
    OSM = OSM_extractor(minlon, minlat, maxlon, maxlat)
    tags = {"amenity": ["school", "hospital"], "natural": ["tree"]}
    osm_gdf = {}

    for key, values in tags.items():
        for value in values:
            osm_gdf["value"] = OSM.download(key, value)
            dist = OSM.distance_to_nearest(coords_y, coords_x,
                                           osm_gdf["value"])
            data['distance_{}'.format(value)] = [
                np.log(0.0001 + x) for x in dist
            ]

    # ---------------- #
    #   NDBI,NDVI,NDWI #
    # ---------------- #
    print('INFO: getting NDBI, NDVI, NDWI ...')
    from rms_indexes import S2indexes

    S2 = S2indexes(area, '../Data/Geofiles/NDs/', s2_date_start, s2_date_end,
                   scope)
    S2.download()
    data['max_NDVI'], data['max_NDBI'], data['max_NDWI'] = S2.rms_values(
        coords_x, coords_y)

    # --------------- #
    # add ACLED #
    # --------------- #
    from acled import ACLED

    acled = ACLED("../Data/Geofiles/ACLED/")
    acled.download(ISO, nightlights_date_start, nightlights_date_end)
    d = {}
    for property in ["fatalities", "n_events", "violence_civ"]:
        for k in [10000, 100000]:
            d[property + "_" + str(k)] = acled.featurize(coords_x,
                                                         coords_y,
                                                         property=property,
                                                         function='density',
                                                         buffer=k)

    d["weighted_sum_fatalities_by_dist"] = acled.featurize(
        coords_x, coords_y, property="fatalities", function='weighted_kNN')
    d["distance_to_acled_event"] = acled.featurize(coords_x,
                                                   coords_y,
                                                   function='distance')
    # quantize ACLED
    for c in d.keys():
        d[c] = np.nan_to_num(pd.qcut(d[c], 5, labels=False, duplicates='drop'))

    features = pd.DataFrame(d, index=data.index)
    data = data.join(features)

    # --------------- #
    # save features   #
    # --------------- #
    print('INFO: {} columns.'.format(len(data.columns)))
    # features to be use in the linear model
    features_list = list(sorted(data.columns))
    print(features_list)
    data.to_csv("../Data/Features/features_all_id_{}_{}_nonscaled.csv".format(
        id, pipeline))
    # Scale Features
    print("Normalizing : max")
    data[features_list] = (data[features_list] - data[features_list].mean()
                           ) / (data[features_list].max() + 0.001)
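    # The scaling above is mean-centring followed by division by the column
    # maximum; the + 0.001 guards against division by zero for all-zero columns.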

    data.to_csv("../Data/Features/features_all_id_{}_{}.csv".format(
        id, pipeline))

    # ------- #
    # predict #
    # ------- #
    ensemble_pipeline = joblib.load(
        '../Models/Ensemble_model_config_id_{}.pkl'.format(id))
    print(str(np.datetime64('now')), 'INFO: model loaded.')

    X = data.reset_index(level=[2, 3])
    ensemble_predictions = ensemble_pipeline.predict(X.values)

    results = pd.DataFrame({
        'i': list_i,
        'j': list_j,
        'lat': coords_y,
        'lon': coords_x,
        'yhat': ensemble_predictions
    })
    results.to_csv('../Data/Results/config_{}.csv'.format(id))
    outfile = "../Data/Results/scalerout_{}.tif".format(id)
    tifgenerator(outfile=outfile, raster_path=final_raster, df=results)

    outfile = "../Data/Results/scalerout_{}_kNN.tif".format(id)
    results['yhat_kNN'] = ensemble_pipeline.regr_[0].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_kNN')

    outfile = "../Data/Results/scalerout_{}_Ridge.tif".format(id)
    results['yhat_Ridge'] = ensemble_pipeline.regr_[1].predict(X.values)
    tifgenerator(outfile=outfile,
                 raster_path=final_raster,
                 df=results,
                 value='yhat_Ridge')

    if shapefile is not None:
        input_rst = "../Data/Results/scalerout_{}.tif".format(id)
        weight_rst = final_raster  # population raster written above, used as weights

        output_shp = "../Data/Results/scalerout_{}_aggregated.shp".format(id)
        from utils import weighted_sum_by_polygon
        weighted_sum_by_polygon(shapefile, input_rst, weight_rst, output_shp)
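
tifgenerator() above writes the per-cell predictions back onto the raster grid; its implementation is project-specific and not shown. The sketch below is only a hypothetical illustration of that step (the name tifgenerator_sketch, its arguments and the nodata handling are assumptions), using the row index j and column index i produced by np.where earlier.

import numpy as np
import rasterio

def tifgenerator_sketch(outfile, raster_path, df, value='yhat', nodata=-9999.0):
    """Burn df[value] into a single-band GeoTIFF aligned with raster_path."""
    with rasterio.open(raster_path) as src:
        meta = src.meta.copy()
        height, width = src.height, src.width

    grid = np.full((height, width), nodata, dtype='float32')
    # 'j' indexes raster rows and 'i' raster columns, matching np.where above.
    grid[df['j'].astype(int).values, df['i'].astype(int).values] = \
        df[value].astype('float32').values

    meta.update(driver='GTiff', count=1, dtype='float32', nodata=nodata)
    with rasterio.open(outfile, 'w', **meta) as dst:
        dst.write(grid, 1)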