Example #1
def _create_task():
    features = create_building_features()
    feature_builder = FeaturesBuilder(features,
                                      cache_table=BUILDINGS_FEATURES_TABLE)

    models = {
        f'CatBoost_depth{depth}_lr{lr}_l2reg{l2reg}':
        CatBoostClassifier(depth=depth,
                           learning_rate=lr,
                           l2_leaf_reg=l2reg,
                           iterations=300,
                           verbose=False,
                           thread_count=8,
                           od_pval=1e-5)
        for depth, lr, l2reg in product([4, 7, 10], [0.03, 0.1, 0.15],
                                        [1, 4, 9])
        # 'SVM C=0.1': SVC(C=0.1, probability=True, gamma='auto'),
        # 'SVM C=0.01': SVC(C=0.01, probability=True, gamma='auto'),
        # 'SVM C=1': SVC(C=1, probability=True, gamma='auto'),
        # 'CatBoost': CatBoostClassifier(loss_function = 'CrossEntropy', iterations=300, depth=3, learning_rate=0.15, l2_leaf_reg=4, verbose=False),
        # 'Logistic Regression': LogisticRegression(),
        # 'BalancedRF1000': BalancedRandomForestClassifier(n_estimators=1000),
        # 'BalancedRF1000_depth4': BalancedRandomForestClassifier(n_estimators=1000, max_depth=4),
        # 'BalancedRF100_depth3': BalancedRandomForestClassifier(n_estimators=100),
        # 'BalancedRF100_depth5': BalancedRandomForestClassifier(n_estimators=100, max_depth=5),
        # 'XGBoost': XGBClassifier(n_estimators=50, early_stopping_rounds=10)
    }

    task = TaskHandler(feature_builder, models=models)
    return task
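
A minimal, self-contained sketch of the same hyperparameter-grid idea is shown below. The synthetic data, the split, and the fit/score loop are assumptions for illustration only; in the project, model selection is driven by TaskHandler.

# Hedged sketch: evaluate a small CatBoost grid on synthetic data (illustration only).
from itertools import product

from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

models = {
    f'CatBoost_depth{depth}_lr{lr}_l2reg{l2reg}':
    CatBoostClassifier(depth=depth, learning_rate=lr, l2_leaf_reg=l2reg,
                       iterations=50, verbose=False)
    for depth, lr, l2reg in product([4, 7], [0.03, 0.1], [1, 4])
}

for name, model in models.items():
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f'{name}: AUC={auc:.3f}')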
Example #2
    def __init__(self, bundle: list = None, importance=10):
        if bundle is not None:
            # imports are deferred until a bundle is actually supplied
            from coord2vec.feature_extraction.features_builders import FeaturesBuilder
            from coord2vec.feature_extraction.feature_bundles import create_building_features
            all_feats = create_building_features(bundle, importance)
            builder = FeaturesBuilder(all_feats)
            self.feat_names = builder.all_feat_names
Example #3
def _create_task():
    features = create_building_features(elements=None)
    feature_builder = FeaturesBuilder(features,
                                      cache_table=BUILDINGS_FEATURES_TABLE)

    # hyperparameters for the model are defined in MetaModel
    task = TaskHandler(feature_builder, models=None)
    return task
Example #4
    def add_features_to_clusters(clusters: List[BuildingCluster], builder: FeaturesBuilder) -> List[BuildingCluster]:
        hulls_feat_df = builder.transform(GeoSeries([s.hull for s in clusters]))  # cache can slow us down

        feats_df = hulls_feat_df  # kept as a DataFrame (no .to_numpy()) so the labelled row slice below works
        for i, bc in enumerate(clusters):
            bc._feature = feats_df.iloc[i:i + 1, :]  # '_feature' attribute is only used in CLSTRSearchProblem

        return clusters
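
The slice feats_df.iloc[i:i + 1, :] deliberately keeps each row as a one-row DataFrame (column labels preserved), whereas iloc[i] would return a Series. A small plain-pandas illustration of that difference, unrelated to FeaturesBuilder:

# Hedged illustration: iloc[i:i+1, :] -> one-row DataFrame, iloc[i] -> Series.
import pandas as pd

df = pd.DataFrame({'area': [10.0, 20.0], 'height': [3.0, 6.0]})
row_df = df.iloc[0:1, :]   # DataFrame, shape (1, 2), column names kept
row_series = df.iloc[0]    # Series indexed by the column names
print(type(row_df).__name__, row_df.shape)          # DataFrame (1, 2)
print(type(row_series).__name__, row_series.shape)  # Series (2,)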
Example #5
def _polys_to_sparse_objects(polys: gpd.GeoSeries, features: List[Feature]) -> pd.DataFrame:
    """
    find the number of objects inside each polygon, for each feature
    Args:
        polys: geometries to count object inside of them
        features: features that consist objects

    Returns:
        A DataFrame of shape [n_polys, n_features]
    """
    number_of_features = [NumberOf(feature.table_filter_dict[feature.table][feature.object_name],
                                   feature.table,
                                   feature.object_name,
                                   radius=0) for feature in features]
    for i, feature in enumerate(number_of_features):  # give each feature a unique name to avoid column-name collisions
        feature.feature_names = [str(i)]
        feature.set_default_value(0)
    feature_builder = FeaturesBuilder(number_of_features, cache_table=BUILDINGS_FEATURES_TABLE)
    object_count = feature_builder.transform(polys)
    return object_count
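
For comparison, per-polygon object counts can also be computed with a plain GeoPandas spatial join. The sketch below is an assumption-level alternative on synthetic geometries; it does not use the project's NumberOf features or the cache table.

# Hedged sketch: count points inside each polygon with a spatial join (geopandas >= 0.10 for predicate=).
import geopandas as gpd
from shapely.geometry import Point, box

polys = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)])
points = gpd.GeoDataFrame(geometry=[Point(0.5, 0.5), Point(0.7, 0.2), Point(1.5, 0.5)])

joined = gpd.sjoin(points, polys, predicate='within')  # one row per (point, containing polygon)
counts = joined.groupby('index_right').size().reindex(polys.index, fill_value=0)
print(counts)  # 2 points fall in the first square, 1 in the second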
Example #6
    def get_setup_variables(self):
        init_buildings = get_buildings_from_polygon(POLYGON,
                                                    is_intersects=True)[:100]
        radius = 10
        poly_feat_builder = FeaturesBuilder([
            AreaOfSelf(),
        ])

        neighb_init_states = [BuildingCluster(init_buildings[1])] + [
            BuildingCluster(b) for b in get_buildings_in_radius(
                init_buildings[1], radius, init_buildings[1])
        ]
        return neighb_init_states, poly_feat_builder
Example #7
def _create_specific_task():
    features = create_CLSTR_features()
    poly_feature_builder = FeaturesBuilder(features,
                                           cache_table="LOC_CLSTR_features")

    # TODO: do we want one-class models? our problem is not one-class
    models = {
        'One Class SVM': OneClassSVM(),
        'isolation forest': IsolationForest(),
        'Gaussians': EllipticEnvelope(),
        'baseline': BaselineModel()
    }
    specific_task = HandlerBuildCLSTRs(poly_feature_builder,
                                       models=models)  # normal
    return specific_task
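
The three scikit-learn models above share the outlier-detection interface: fit on (mostly) normal samples, then predict returns +1 for inliers and -1 for outliers. A minimal sketch on synthetic data; BaselineModel is project-specific and omitted here.

# Hedged sketch: the shared fit/predict API of the scikit-learn outlier detectors.
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_normal = rng.normal(0, 1, size=(200, 2))
X_test = np.vstack([rng.normal(0, 1, size=(5, 2)), [[6.0, 6.0]]])  # last row is a clear outlier

for name, model in {'One Class SVM': OneClassSVM(gamma='auto'),
                    'isolation forest': IsolationForest(random_state=0),
                    'Gaussians': EllipticEnvelope()}.items():
    model.fit(X_normal)
    print(name, model.predict(X_test))  # +1 = inlier, -1 = outlier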
Example #8
    @classmethod
    def setUpClass(cls):
        feats = create_building_features(karka_bundle_features)
        cls.builder = FeaturesBuilder(feats, cache_table=BUILDINGS_FEATURES_TABLE)
        near_levinshtein_house = wkt.loads('POINT (34.8576548 32.1869038)')
        hatlalim_rd_raanana = wkt.loads('POINT (34.8583825 32.1874658)')
        cls.gdf = GeoDataFrame(pd.DataFrame({'geom': [near_levinshtein_house, hatlalim_rd_raanana]}),
                               geometry='geom')
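
The fixture builds a two-point GeoDataFrame from WKT strings. Below is a standalone version with the standard imports it presumably relies on (shapely, geopandas, pandas); the coord2vec-specific builder is omitted.

# Hedged sketch: the GeoDataFrame construction from the fixture, made self-contained.
import pandas as pd
from geopandas import GeoDataFrame
from shapely import wkt

near_levinshtein_house = wkt.loads('POINT (34.8576548 32.1869038)')
hatlalim_rd_raanana = wkt.loads('POINT (34.8583825 32.1874658)')
gdf = GeoDataFrame(pd.DataFrame({'geom': [near_levinshtein_house, hatlalim_rd_raanana]}),
                   geometry='geom')
print(gdf.geometry.x.tolist())  # longitudes of the two points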
Example #9
    def train_predict_on_split(task, building_gs, buildings_y, source_indices,
                               geos, y, source_train_indices,
                               source_test_indices):
        building_train_indices = np.isin(source_indices, source_train_indices)
        building_test_indices = np.isin(source_indices, source_test_indices)

        # fetch the train and test building sets
        buildings_train_gs = building_gs.iloc[
            building_train_indices].reset_index(drop=True)
        y_train_buildings = buildings_y[building_train_indices]

        buildings_test_gs = building_gs.iloc[
            building_test_indices].reset_index(drop=True)
        y_test_buildings = buildings_y[building_test_indices]

        train_true_geos = geos[np.isin(range(len(geos)), source_train_indices)
                               & y]  # train-test in CLSTRs
        test_true_geos = geos[np.isin(range(len(geos)), source_test_indices)
                              & y]  # train-test in CLSTRs

        fpb = task.embedder  # feature extractor for polygons
        # add the building scores feature
        train_hash = hash_geoseries(geos[source_train_indices])
        fpb.features += [
            BuildingScores(
                SCORES_TABLE,
                BUILDING_EXPERIMENT_NAME,
                'BalancedRF1000',
                # TODO: doesn't match current MetaModel naming
                train_geom_hash=train_hash,
                radius=radius) for radius in [0, 25]
        ]

        heuristic_guiding_model = BaselineModel()
        heuristic_guiding_model.fit(task.transform(train_true_geos))
        # for i in trange(5, desc="Training CLSTR heuristic"):
        #     potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model),
        #                                 random.sample(buildings_train_gs[y_train]), use_tqdm=True, desc="Calculating potential CLSTRs")
        #
        #     heuristic_guiding_model = OneClassSVM()
        #     heuristic_guiding_model.fit(task.transform(train_true_geos))

        # TODO: make a smarter choice of which buildings to start from?
        score_extractor = FeaturesBuilder([
            BuildingScores(SCORES_TABLE,
                           BUILDING_EXPERIMENT_NAME,
                           'BalancedRF1000',
                           radius=0,
                           train_geom_hash=train_hash)
        ])
        building_scores_sorted = score_extractor.transform(
            buildings_test_gs)['building_scores_avg_0m'].sort_values(
                ascending=False)

        building_scores = pd.Series(
            index=buildings_test_gs.iloc[building_scores_sorted.index],
            data=building_scores_sorted.values)

        # building_scores = gpd.GeoDataFrame(
        #     zip(buildings_test_gs, np.random.random(len(buildings_test_gs))),
        #     columns=['geometry', 'score'], geometry='geometry').set_index('geometry')

        # TODO: make a smarter choice of which buildings to start from; currently a random 500 of the top-scoring 1000
        best_test_buildings_with_scores = building_scores.iloc[random.sample(
            range(1000), 500)]
        potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(
            b, fpb, heuristic_guiding_model,
            partial(beam, beam_size=15, iterations_limit=15)),
                                       best_test_buildings_with_scores.index,
                                       use_tqdm=True,
                                       desc="Calculating potential CLSTRs",
                                       keep_child_tqdm=True,
                                       nprocs=16)

        # TODO: post-processing to decide which CLSTRs to return; related to how they fit together.
        print([p[1] for p in potential_CLSTRs_test])
        print([len(p[0].buildings) for p in potential_CLSTRs_test])
        sorted_potential_CLSTRs_test = list(
            sorted(potential_CLSTRs_test, key=lambda p: p[1], reverse=True))
        # TODO: choose with intel, depending on pluga, etc.
        best_potential_CLSTRs_test = pd.Series(
            index=[p[0].hull for p in sorted_potential_CLSTRs_test],
            data=MinMaxScaler().fit_transform([
                [p[1]] for p in sorted_potential_CLSTRs_test
            ])[:, 0])  # normalize scores, IMPORTANT
        print(best_potential_CLSTRs_test)

        return building_scores, geos.iloc[
            source_train_indices], y_train_buildings, geos.iloc[
                source_test_indices], test_true_geos, y_test_buildings, best_potential_CLSTRs_test
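
Two of the mechanical steps above are easy to verify in isolation: np.isin maps building-level rows onto the source-level train/test split, and MinMaxScaler rescales the CLSTR scores to [0, 1] before they are returned. A toy-input sketch of both (the inputs are made up for illustration):

# Hedged sketch of two helper steps from the function above, on toy inputs.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# np.isin: boolean mask marking buildings whose source index is in the train split
source_indices = np.array([0, 0, 1, 2, 2, 2])          # one source per building
source_train_indices = np.array([0, 2])
print(np.isin(source_indices, source_train_indices))   # [ True  True False  True  True  True]

# MinMaxScaler: normalize a single column of scores to [0, 1], as done for the CLSTR scores
scores = [[3.0], [1.0], [2.0]]
print(MinMaxScaler().fit_transform(scores)[:, 0])       # [1.  0.  0.5]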