def _create_task():
    features = create_building_features()
    feature_builder = FeaturesBuilder(features, cache_table=BUILDINGS_FEATURES_TABLE)
    # grid search over CatBoost hyper-parameters: 3 depths x 3 learning rates x 3 l2 regularizations
    models = {
        f'CatBoost_depth{depth}_lr{lr}_l2reg{l2reg}':
            CatBoostClassifier(depth=depth, learning_rate=lr, l2_leaf_reg=l2reg,
                               iterations=300, verbose=False, thread_count=8, od_pval=1e-5)
        for depth, lr, l2reg in product([4, 7, 10], [0.03, 0.1, 0.15], [1, 4, 9])
        # previously tried models:
        # 'SVM C=0.1': SVC(C=0.1, probability=True, gamma='auto'),
        # 'SVM C=0.01': SVC(C=0.01, probability=True, gamma='auto'),
        # 'SVM C=1': SVC(C=1, probability=True, gamma='auto'),
        # 'CatBoost': CatBoostClassifier(loss_function='CrossEntropy', iterations=300, depth=3,
        #                                learning_rate=0.15, l2_leaf_reg=4, verbose=False),
        # 'Logistic Regression': LogisticRegression(),
        # 'BalancedRF1000': BalancedRandomForestClassifier(n_estimators=1000),
        # 'BalancedRF1000_depth4': BalancedRandomForestClassifier(n_estimators=1000, max_depth=4),
        # 'BalancedRF100_depth3': BalancedRandomForestClassifier(n_estimators=100),
        # 'BalancedRF100_depth5': BalancedRandomForestClassifier(n_estimators=100, max_depth=5),
        # 'XGBoost': XGBClassifier(n_estimators=50, early_stopping_rounds=10)
    }
    task = TaskHandler(feature_builder, models=models)
    return task
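# A standalone sketch (not from the repo) of what the grid above expands to:
# product([4, 7, 10], [0.03, 0.1, 0.15], [1, 4, 9]) yields 3 * 3 * 3 = 27
# (depth, lr, l2reg) combinations, so `models` maps 27 distinct names to
# independently configured CatBoostClassifier instances.
from itertools import product

grid = list(product([4, 7, 10], [0.03, 0.1, 0.15], [1, 4, 9]))
assert len(grid) == 27
print([f'CatBoost_depth{d}_lr{lr}_l2reg{l2}' for d, lr, l2 in grid[:3]])
# ['CatBoost_depth4_lr0.03_l2reg1', 'CatBoost_depth4_lr0.03_l2reg4', 'CatBoost_depth4_lr0.03_l2reg9']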
def __init__(self, bundle: list = None, importance=10):
    if bundle is not None:
        # imported at call time rather than at module level
        from coord2vec.feature_extraction.features_builders import FeaturesBuilder
        from coord2vec.feature_extraction.feature_bundles import create_building_features
        all_feats = create_building_features(bundle, importance)
        builder = FeaturesBuilder(all_feats)
        self.feat_names = builder.all_feat_names
    # NOTE: when bundle is None, self.feat_names is never set
def _create_task():
    features = create_building_features(elements=None)
    feature_builder = FeaturesBuilder(features, cache_table=BUILDINGS_FEATURES_TABLE)
    # the model hyper-parameters are defined in MetaModel, so no models are passed here
    task = TaskHandler(feature_builder, models=None)
    return task
def add_features_to_clusters(clusters: List[BuildingCluster],
                             builder: FeaturesBuilder) -> List[BuildingCluster]:
    # transform all hulls in a single call; the cache lookup can slow us down
    hulls_feat_df = builder.transform(GeoSeries([c.hull for c in clusters]))
    for i, bc in enumerate(clusters):
        # keep a one-row DataFrame per cluster; '_feature' is only used in CLSTRSearchProblem
        bc._feature = hulls_feat_df.iloc[i:i + 1, :]
    return clusters
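# Note on the `iloc[i:i + 1, :]` idiom above: slicing with a range keeps each
# cluster's features as a one-row DataFrame (column names preserved) rather
# than collapsing the row to a Series. A toy illustration with made-up data:
import pandas as pd

df = pd.DataFrame({'area': [10.0, 20.0], 'perimeter': [12.0, 18.0]})
row_as_series = df.iloc[0]      # Series; the column names become the index
row_as_frame = df.iloc[0:1, :]  # one-row DataFrame; keeps the tabular shape
print(type(row_as_series).__name__, type(row_as_frame).__name__)  # Series DataFrame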
def _polys_to_sparse_objects(polys: gpd.GeoSeries, features: List[Feature]) -> pd.DataFrame:
    """
    Count the number of objects inside each polygon, for each feature.

    Args:
        polys: geometries to count objects inside of
        features: features whose objects are counted

    Returns:
        A DataFrame of shape [n_polys, n_features]
    """
    number_of_features = [NumberOf(feature.table_filter_dict[feature.table][feature.object_name],
                                   feature.table, feature.object_name, radius=0)
                          for feature in features]
    for i, feature in enumerate(number_of_features):
        feature.feature_names = [str(i)]  # rename to prevent duplicate feature names
        feature.set_default_value(0)
    feature_builder = FeaturesBuilder(number_of_features, cache_table=BUILDINGS_FEATURES_TABLE)
    object_count = feature_builder.transform(polys)
    return object_count
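# Why the rename to str(i) above matters: two features sharing an object_name
# would otherwise emit identically named columns, and selecting a duplicated
# column name in pandas silently returns all matching columns. A standalone
# illustration with a hypothetical column name:
import pandas as pd

df = pd.DataFrame([[1, 2]], columns=['number_of_building', 'number_of_building'])
print(df['number_of_building'].shape)  # (1, 2): selection returns BOTH columns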
def get_setup_variables(self):
    init_buildings = get_buildings_from_polygon(POLYGON, is_intersects=True)[:100]
    radius = 10
    poly_feat_builder = FeaturesBuilder([
        AreaOfSelf(),
    ])
    # initial states: a seed building plus a cluster for each building within `radius` of it
    neighb_init_states = [BuildingCluster(init_buildings[1])] + [
        BuildingCluster(b)
        for b in get_buildings_in_radius(init_buildings[1], radius, init_buildings[1])
    ]
    return neighb_init_states, poly_feat_builder
def _create_specific_task():
    features = create_CLSTR_features()
    poly_feature_builder = FeaturesBuilder(features, cache_table="LOC_CLSTR_features")
    # TODO: do we want one-class? our problem is not one-class
    models = {
        'One Class SVM': OneClassSVM(),
        'isolation forest': IsolationForest(),
        'Gaussians': EllipticEnvelope(),
        'baseline': BaselineModel()
    }
    specific_task = HandlerBuildCLSTRs(poly_feature_builder, models=models)  # normal
    return specific_task
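# Context for the TODO above: the sklearn one-class estimators used here are
# fit on unlabeled data and predict +1 (inlier) / -1 (outlier), unlike the
# binary classifiers used elsewhere in the project. A minimal standalone
# sketch of that API contract, on random data:
import numpy as np
from sklearn.svm import OneClassSVM

X = np.random.RandomState(0).normal(size=(100, 2))
clf = OneClassSVM(gamma='auto').fit(X)
print(set(clf.predict(X)))  # a subset of {1, -1}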
def setUpClass(cls):
    feats = create_building_features(karka_bundle_features)
    cls.builder = FeaturesBuilder(feats, cache_table=BUILDINGS_FEATURES_TABLE)
    # two nearby test points (lon, lat) in Raanana
    near_levinshtein_house = wkt.loads('POINT (34.8576548 32.1869038)')
    hatlalim_rd_raanana = wkt.loads('POINT (34.8583825 32.1874658)')
    cls.gdf = GeoDataFrame(pd.DataFrame({'geom': [near_levinshtein_house, hatlalim_rd_raanana]}),
                           geometry='geom')
def train_predict_on_split(task, building_gs, buildings_y, source_indices, geos, y,
                           source_train_indices, source_test_indices):
    # map the source-level train/test split onto the individual buildings
    building_train_indices = np.isin(source_indices, source_train_indices)
    building_test_indices = np.isin(source_indices, source_test_indices)

    # fetch train-set and fit
    buildings_train_gs = building_gs.iloc[building_train_indices].reset_index(drop=True)
    y_train_buildings = buildings_y[building_train_indices]
    buildings_test_gs = building_gs.iloc[building_test_indices].reset_index(drop=True)
    y_test_buildings = buildings_y[building_test_indices]

    train_true_geos = geos[np.isin(range(len(geos)), source_train_indices) & y]  # train-test in CLSTRs
    test_true_geos = geos[np.isin(range(len(geos)), source_test_indices) & y]  # train-test in CLSTRs

    fpb = task.embedder  # feature extractor for polygons

    # add the building-scores feature
    train_hash = hash_geoseries(geos[source_train_indices])
    fpb.features += [
        BuildingScores(SCORES_TABLE, BUILDING_EXPERIMENT_NAME,
                       'BalancedRF1000',  # TODO: doesn't match current MetaModel naming
                       train_geom_hash=train_hash, radius=radius)
        for radius in [0, 25]
    ]

    heuristic_guiding_model = BaselineModel()
    heuristic_guiding_model.fit(task.transform(train_true_geos))
    # for i in trange(5, desc="Training CLSTR heuristic"):
    #     potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model),
    #                                    random.sample(buildings_train_gs[y_train]),
    #                                    use_tqdm=True, desc="Calculating potential CLSTRs")
    #     heuristic_guiding_model = OneClassSVM()
    #     heuristic_guiding_model.fit(task.transform(train_true_geos))

    # TODO: do a smarter choice of what buildings to start from?
    score_extractor = FeaturesBuilder([
        BuildingScores(SCORES_TABLE, BUILDING_EXPERIMENT_NAME, 'BalancedRF1000',
                       radius=0, train_geom_hash=train_hash)
    ])
    building_scores_sorted = (score_extractor.transform(buildings_test_gs)
                              ['building_scores_avg_0m']
                              .sort_values(ascending=False))
    building_scores = pd.Series(index=buildings_test_gs.iloc[building_scores_sorted.index],
                                data=building_scores_sorted.values)
    # building_scores = gpd.GeoDataFrame(
    #     zip(buildings_test_gs, np.random.random(len(buildings_test_gs))),
    #     columns=['geometry', 'score'], geometry='geometry').set_index('geometry')

    # TODO: do a smarter choice of what buildings to start from;
    # currently 500 sampled at random from the 1000 top-scoring
    best_test_buildings_with_scores = building_scores.iloc[random.sample(range(1000), 500)]
    potential_CLSTRs_test = parmap(
        lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model,
                                    partial(beam, beam_size=15, iterations_limit=15)),
        best_test_buildings_with_scores.index,
        use_tqdm=True, desc="Calculating potential CLSTRs",
        keep_child_tqdm=True, nprocs=16)

    # TODO: post-processing - which CLSTRs to return, related to how they fit together
    print([p[1] for p in potential_CLSTRs_test])
    print([len(p[0].buildings) for p in potential_CLSTRs_test])
    sorted_potential_CLSTRs_test = sorted(potential_CLSTRs_test, key=lambda p: p[1], reverse=True)

    # TODO: choose with intel, depending on pluga, etc.
    best_potential_CLSTRs_test = pd.Series(
        index=[p[0].hull for p in sorted_potential_CLSTRs_test],
        data=MinMaxScaler().fit_transform(
            [[p[1]] for p in sorted_potential_CLSTRs_test])[:, 0])  # normalize scores, IMPORTANT
    print(best_potential_CLSTRs_test)

    return (building_scores,
            geos.iloc[source_train_indices], y_train_buildings,
            geos.iloc[source_test_indices], test_true_geos, y_test_buildings,
            best_potential_CLSTRs_test)
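# A standalone sketch of the np.isin() split mapping used at the top of
# train_predict_on_split(): each building inherits the train/test assignment
# of its source polygon, so all buildings of one source land in one split
# and no source leaks across the boundary. Toy data:
import numpy as np

source_indices = np.array([0, 0, 1, 2, 2, 2])  # source id per building
source_train_indices = [0, 2]                  # sources chosen for training
mask = np.isin(source_indices, source_train_indices)
print(mask)  # [ True  True False  True  True  True]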