Example #1
import datetime
import logging
import os
import pickle
from typing import Dict, Optional

# BUILDING_RESULTS_DIR and the parmap helper are assumed to be provided
# elsewhere in the project.


def load_separate_model_results(
        results_dir: str = BUILDING_RESULTS_DIR,
        exp_datetime: Optional[datetime.datetime] = None) -> Dict:
    """Load the k-fold results of an experiment under results_dir: the most
    recently created one, or the one named after exp_datetime."""
    all_experiments = os.listdir(results_dir)
    if exp_datetime is None:
        # Default to the most recently created experiment directory.
        paths = [
            os.path.join(results_dir, experiment)
            for experiment in all_experiments
        ]
        exp_dir = max(paths, key=os.path.getctime)
    else:
        date_str = exp_datetime.isoformat(' ', 'seconds')
        exp_dir = os.path.join(results_dir, date_str)

    if not os.path.exists(exp_dir):
        logging.warning(f"Experiment path {exp_dir} doesn't exist")
        return {'model_results': []}

    print(f"loading from {exp_dir}")

    all_files = os.listdir(exp_dir)
    chunk_size = 10  # only used by the load_chunk alternative below

    # TODO: ugly because the pickles are large (1800 models take ~2 hours to
    # load)

    def load_chunk(i):
        # Unused alternative to load_file that loads pickles in chunks of
        # chunk_size. Python slices clamp out-of-range bounds, so no
        # explicit min(len(all_files), ...) guards are needed.
        all_results = []
        for f in all_files[i * chunk_size:(i + 1) * chunk_size]:
            with open(os.path.join(exp_dir, f, 'results.pickle'),
                      'rb') as file:
                res = pickle.load(file)['kfold_results']
                # TODO: we might want it in the future
                # del res['X_df']
                # del res['y']
                all_results.append(res)
        return all_results

    def load_file(fpath):
        with open(os.path.join(exp_dir, fpath, 'results.pickle'),
                  'rb') as file:
            res = pickle.load(file)['kfold_results']
            # TODO: we might want it in the future
            # del res['X_df']
            # del res['y']
        return res

    print(f"number of files is {len(all_files)}")

    all_results = parmap(load_file,
                         all_files,
                         chunk_size=5,
                         use_tqdm=True,
                         desc="Opening all model results",
                         unit="model")
    return {'model_results': all_results}
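
A hypothetical call, assuming experiment directories under BUILDING_RESULTS_DIR are named by their start timestamp (the datetime below is made up):

results = load_separate_model_results()  # most recent experiment
# or pick a specific experiment by its timestamp-named directory:
results = load_separate_model_results(
    exp_datetime=datetime.datetime(2021, 5, 1, 12, 30))
print(len(results['model_results']), "models loaded")
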
Example #2
    def transform(self,
                  input_gs: GeoSeries,
                  use_cache: bool = True) -> pd.DataFrame:
        """
        extract the desired features on desired geometries
        Args:
            input_gs: a GeoSeries with the desired geometries
            use_cache: if set and self.cache_table is filled will load/save the features to the cache

        Returns:
            a pandas dataframe, with columns as features, and rows as the geometries in input_gs

        """
        assert len(input_gs.apply(lambda p: p.wkt).unique()) == len(
            input_gs), "input_gs must not contain duplicate geometries"
        required_feats, loaded_feats_dfs = self.features, []

        if use_cache:
            logging.debug(
                f"Starting load from cache for {len(input_gs)} objects")
            required_feats, loaded_feats_dfs = self.load_from_cache(
                self.features, input_gs)

            if len(required_feats) == 0:
                logging.debug("loaded all from cache!")
                return pd.concat(loaded_feats_dfs, axis=1)  # append by column
            else:
                logging.debug(
                    f"loaded from cache {len(loaded_feats_dfs)}/{len(self.features)}"
                )
        else:
            logging.debug("Not loading from cache")

        feature_factory = PostgresFeatureFactory(required_feats,
                                                 input_gs=input_gs)
        with feature_factory:
            features_gs_list = parmap(
                lambda feature: feature.extract(input_gs),
                feature_factory.features,
                use_tqdm=True,
                desc=f"Calculating Features for {len(input_gs)} geoms",
                unit='feature',
                leave=False)
            # TODO: if want, extract_object_set

        all_features_df = pd.concat(features_gs_list + loaded_feats_dfs,
                                    axis=1)[self.all_feat_names]

        if self.cache_table and use_cache:
            calculated_features_df = pd.concat(features_gs_list, axis=1)
            save_features_to_db(input_gs, calculated_features_df,
                                self.cache_table)

        return all_features_df
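
A minimal usage sketch, assuming a features-builder instance named builder (hypothetical) with its features already configured:

import geopandas as gpd
from shapely.geometry import Point

gs = gpd.GeoSeries([Point(34.78, 32.08), Point(34.80, 32.09)])
features_df = builder.transform(gs, use_cache=True)
print(features_df.shape)  # (len(gs), number of features)
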
Example #3
    def fit_all_models(self, x: pd.DataFrame, y_true, cv=None):
        # trained_models_and_scores = parmap(lambda model: model.fit(x, y_true, cv=cv), self.models_dict.items(),
        #                                    use_tqdm=True, desc="Fitting Models", unit="model")
        # y_true_soft = self.get_soft_labels(gpd.GeoSeries(data=x.index.values), radius=TRUE_POSITIVE_RADIUS,
        #                                    cache_dir=DISTANCE_CACHE_DIR)

        # y_true_soft = y_true  # TODO delete this and uncomment last row

        def fit_model(model):
            if cv is not None:  # TODO is this really needed?
                model.fit(x, y_true, cv=cv)
            else:
                model.fit(x, y_true)
            return model

        models = parmap(fit_model,
                        self.models_dict.values(),
                        use_tqdm=True,
                        desc="Fitting Models",
                        unit="model",
                        nprocs=32)
        # for name, model in tqdm(self.models_dict.items(), desc="Fitting Models", unit="model"):
        return models
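
Note that parmap fits copies of the models in worker processes, so the instances still referenced by self.models_dict remain unfitted; the caller must keep the returned list. A hedged sketch of re-attaching them (ensemble is a hypothetical instance; dict order is stable in Python 3.7+):

fitted = ensemble.fit_all_models(x, y_true)
ensemble.models_dict = dict(zip(ensemble.models_dict.keys(), fitted))
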
Example #4
    def test_building_to_CLSTR_multiple_pipe_runs_and_returns_same(self):
        # TODO: weird: if this test runs second, errors like "pyproj database
        # disk image is malformed" arise; it also sometimes fails when run in
        # parallel under pytest.
        neighb_init_states, poly_feat_builder = self.get_setup_variables()

        res = parmap(
            lambda b: building_to_CLSTR(
                b.hull,
                poly_feat_builder,
                _ClosestToSpecificArea(),
                # partial(hill_climbing, iterations_limit=2)),
                partial(beam, beam_size=50, iterations_limit=3)),
            neighb_init_states)  # , nprocs=1
        print(*[r[1] for r in res])

        self.assertSetEqual({r[1] for r in res}, {res[0][1]})
        for init_s, (bc, v) in zip(neighb_init_states, res):
            self.assertIsInstance(bc, BuildingCluster)
            self.assertIsInstance(v, float)
Example #5
    def test_parmap_does_calculation_correctly_with_chunks(self):
        res = parmap(f, range(500), chunk_size=5)
        self.assertListEqual(res, [x + 1 for x in range(500)])
Example #6
    def test_parmap_does_calculation_correctly(self):
        res = parmap(f, range(500))
        self.assertListEqual(res, [x + 1 for x in range(500)])
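
These two tests pin down the contract the other examples rely on: parmap is an order-preserving parallel map with optional chunking, tqdm progress options, and a process count, and it must accept lambdas (several call sites pass them), so it cannot rely on pickling the mapped function. A minimal fork-based sketch under those assumptions; the project's real implementation may well differ:

import multiprocessing
from typing import Callable, Iterable, List, Optional

from tqdm import tqdm


def _worker(fn, q_in, q_out):
    # Consume (index, item) jobs until the stop sentinel; emit indexed
    # results so the parent can restore input order.
    while True:
        job = q_in.get()
        if job is None:
            break
        i, item = job
        q_out.put((i, fn(item)))


def parmap(fn: Callable, items: Iterable, nprocs: Optional[int] = None,
           use_tqdm: bool = False, desc: Optional[str] = None,
           unit: str = 'it', leave: bool = True, chunk_size: int = 1,
           keep_child_tqdm: bool = False) -> List:
    # chunk_size and keep_child_tqdm are accepted for interface parity with
    # the call sites above, but this sketch ignores them.
    items = list(items)
    nprocs = nprocs or multiprocessing.cpu_count()
    q_in, q_out = multiprocessing.Queue(), multiprocessing.Queue()
    # Workers are forked after fn is defined, so fn itself is never pickled
    # and lambdas/closures work (on fork-based platforms, e.g. Linux).
    procs = [multiprocessing.Process(target=_worker, args=(fn, q_in, q_out))
             for _ in range(nprocs)]
    for p in procs:
        p.daemon = True
        p.start()
    for job in enumerate(items):
        q_in.put(job)
    for _ in procs:
        q_in.put(None)  # one stop sentinel per worker
    out = (q_out.get() for _ in items)
    if use_tqdm:
        out = tqdm(out, total=len(items), desc=desc, unit=unit, leave=leave)
    results = sorted(out, key=lambda t: t[0])
    for p in procs:
        p.join()
    return [r for _, r in results]
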
Example #7
    def train_predict_on_split(task, building_gs, buildings_y, source_indices,
                               geos, y, source_train_indices,
                               source_test_indices):
        building_train_indices = np.isin(source_indices, source_train_indices)
        building_test_indices = np.isin(source_indices, source_test_indices)

        # fetch train-set and fit
        buildings_train_gs = building_gs.iloc[
            building_train_indices].reset_index(drop=True)
        y_train_buildings = buildings_y[building_train_indices]

        buildings_test_gs = building_gs.iloc[
            building_test_indices].reset_index(drop=True)
        y_test_buildings = buildings_y[building_test_indices]

        train_true_geos = geos[np.isin(range(len(geos)), source_train_indices)
                               & y]  # true CLSTR geometries in the train split
        test_true_geos = geos[np.isin(range(len(geos)), source_test_indices)
                              & y]  # true CLSTR geometries in the test split

        fpb = task.embedder  # feature extractor for polygons
        # add the building scores feature
        train_hash = hash_geoseries(geos[source_train_indices])
        fpb.features += [
            BuildingScores(
                SCORES_TABLE,
                BUILDING_EXPERIMENT_NAME,
                'BalancedRF1000',
                # TODO: doesn't match current MetaModel naming
                train_geom_hash=train_hash,
                radius=radius) for radius in [0, 25]
        ]

        heuristic_guiding_model = BaselineModel()
        heuristic_guiding_model.fit(task.transform(train_true_geos))
        # for i in trange(5, desc="Training CLSTR heuristic"):
        #     potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model),
        #                                 random.sample(buildings_train_gs[y_train]), use_tqdm=True, desc="Calculating potential CLSTRs")
        #
        #     heuristic_guiding_model = OneClassSVM()
        #     heuristic_guiding_model.fit(task.transform(train_true_geos))

        # TODO: make a smarter choice of which buildings to start from?
        score_extractor = FeaturesBuilder([
            BuildingScores(SCORES_TABLE,
                           BUILDING_EXPERIMENT_NAME,
                           'BalancedRF1000',
                           radius=0,
                           train_geom_hash=train_hash)
        ])
        building_scores_sorted = score_extractor.transform(
            buildings_test_gs)['building_scores_avg_0m'].sort_values(
                ascending=False)

        building_scores = pd.Series(
            index=buildings_test_gs.iloc[building_scores_sorted.index],
            data=building_scores_sorted.values)

        # building_scores = gpd.GeoDataFrame(
        #     zip(buildings_test_gs, np.random.random(len(buildings_test_gs))),
        #     columns=['geometry', 'score'], geometry='geometry').set_index('geometry')

        # TODO: make a smarter choice of which buildings to start from; for
        # now, 500 random picks out of the 1000 top-scoring ones.
        best_test_buildings_with_scores = building_scores.iloc[random.sample(
            range(1000), 500)]
        potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(
            b, fpb, heuristic_guiding_model,
            partial(beam, beam_size=15, iterations_limit=15)),
                                       best_test_buildings_with_scores.index,
                                       use_tqdm=True,
                                       desc="Calculating potential CLSTRs",
                                       keep_child_tqdm=True,
                                       nprocs=16)

        # TODO: postprocessing to decide which CLSTRs to return; related to
        # how they fit together.
        print([p[1] for p in potential_CLSTRs_test])
        print([len(p[0].buildings) for p in potential_CLSTRs_test])
        sorted_potential_CLSTRs_test = list(
            sorted(potential_CLSTRs_test, key=lambda p: p[1], reverse=True))
        # TODO: choose more intelligently, depending on pluga, etc.
        best_potential_CLSTRs_test = pd.Series(
            index=[p[0].hull for p in sorted_potential_CLSTRs_test],
            data=MinMaxScaler().fit_transform([
                [p[1]] for p in sorted_potential_CLSTRs_test
            ])[:, 0])  # normalize scores, IMPORTANT
        print(best_potential_CLSTRs_test)

        return (building_scores, geos.iloc[source_train_indices],
                y_train_buildings, geos.iloc[source_test_indices],
                test_true_geos, y_test_buildings,
                best_potential_CLSTRs_test)

    def _create_intersection_table(
            self, geom_table_name: str,
            eng: sa.engine.Engine) -> Dict[float, Dict[str, str]]:
        """
        Create a temporary intersection table, to be later used for all the sub-features.
        The table is created using self.table_filter_dict, and the features radii

        Args:
            geom_table_name: name of the geometries table to intersect on.
            eng: sql Alchemy engine.

        Returns:
            The names of the temporary intersection table for each radius, then original table
        """
        radius_table_to_tmp_table_names = {}
        all_tpls = []
        for table, filters_dict in self.table_filter_dict.items():
            all_radii = self.table_radii[table]
            for radius in all_radii:
                all_tpls.append((table, filters_dict[radius], radius))

        def calc_intersect(tpl_idx, table, filters_dict, radius):
            # Each parmap worker opens (and later disposes) its own
            # connection: SQLAlchemy engines must not be shared across
            # forked processes.
            eng = get_connection('POSTGRES')
            filters_columns_sql = ',\n'.join([
                f"CASE WHEN {filter_sql} THEN 1 ELSE 0 END AS {filter_name}"
                for filter_name, filter_sql in filters_dict.items()
            ])

            filters_sql = ' OR '.join(filters_dict.values())
            tbl_name = f"{get_temp_table_name()}{tpl_idx}"

            # Add height columns if the source table has them.
            height_exists = column_exists('height', table, eng)
            inner_height_sql = "height, absolute_ground_surface_height," if height_exists else ""
            outer_height_sql = """t.height AS height,
                        t.absolute_ground_surface_height AS ground_height,
                        t.absolute_ground_surface_height + t.height AS absolute_height,""" if height_exists else ""

            query = f"""
                    CREATE UNLOGGED TABLE {tbl_name}
                    AS
                        SELECT
                        1.0 AS coverage,
                        {outer_height_sql}
                        q.geom_id AS geom_id,
                        q.geom AS q_geom,
                        t.geom AS t_geom,
                        Geography(t.geom) AS t_geog,
                        {', '.join(filters_dict.keys())}

                        FROM {geom_table_name} q
                        JOIN (SELECT way AS geom,
                                    {inner_height_sql}
                                    {filters_columns_sql}
                                    FROM {table}
                                    WHERE {filters_sql}) t
                        ON ST_DWithin(t.geom, q.geom, {radius}, true)

                    """

            eng.execute(query)
            add_postgis_index(eng, tbl_name, 'q_geom')
            add_postgis_index(eng, tbl_name, 't_geom')
            add_postgis_index(eng, tbl_name, 't_geog')
            eng.dispose()
            return radius, table, tbl_name

        res = parmap(lambda p: calc_intersect(p[0], *p[1]),
                     list(enumerate(all_tpls)),
                     use_tqdm=True,
                     desc="Calculating intersection",
                     unit="table",
                     leave=False)
        for radius, table, tbl_name in res:
            radius_table_to_tmp_table_names.setdefault(radius, {}).update(
                {table: tbl_name})

        return radius_table_to_tmp_table_names
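
The UNLOGGED tables created above are temporary, so presumably something drops them once the sub-features have been computed. A hedged sketch of such a cleanup step; this helper is hypothetical, and the project may handle cleanup elsewhere:

    def _drop_intersection_tables(
            self, radius_table_to_tmp_table_names: Dict[float, Dict[str, str]],
            eng: sa.engine.Engine) -> None:
        # Hypothetical counterpart to _create_intersection_table.
        for table_to_tmp in radius_table_to_tmp_table_names.values():
            for tmp_table_name in table_to_tmp.values():
                eng.execute(f"DROP TABLE IF EXISTS {tmp_table_name}")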