Exemplo n.º 1
0
    def _validate_k_fold_top(self, model, x_train, y_train, x_test, y_test):
        """Score ``model`` for every combination of topological parameters.

        Every ``(k_min, k_max, dist_percentage)`` triple drawn from
        ``self.k_mins``, ``self.k_maxs`` and ``self.dist_percentages`` is
        tried in turn.  For each triple a two-step pipeline (subspace
        extraction followed by Vietoris-Rips persistence) is built, the
        topological features derived from its diagrams are appended
        column-wise to ``x_train``, ``model`` is refit on the augmented
        matrix and then scored on the augmented test features.

        Args:
            model: Estimator exposing ``fit`` and ``score``; it is refit in
                place on every iteration.
            x_train: Training feature matrix.
            y_train: Training targets.
            x_test: Feature matrix used for scoring.
            y_test: Targets used for scoring.

        Returns:
            A list with one dict per combination, in iteration order, with
            the keys ``"k_min"``, ``"k_max"``, ``"dist_percentage"`` and
            ``"score"``.
        """
        results = []

        for k_min in self.k_mins:
            for k_max in self.k_maxs:
                for dist_percentage in self.dist_percentages:
                    print(
                        f"k_min, k_max, dist_percentage: {k_min}, {k_max}, {dist_percentage}"
                    )

                    # Assemble the topological pipeline for this parameter triple.
                    subspace_step = SubSpaceExtraction(
                        dist_percentage=dist_percentage,
                        k_min=k_min,
                        k_max=k_max,
                        metric="euclidean",
                        n_jobs=-1,
                    )
                    persistence_step = VietorisRipsPersistence(n_jobs=-1)
                    top_pipeline = Pipeline([
                        ('extract_subspaces', subspace_step),
                        ('compute_diagrams', persistence_step),
                    ])

                    # Only the diagrams are needed; the resampled targets
                    # returned alongside them are discarded.
                    fitted = top_pipeline.fit_transform_resample(
                        x_train, y_train)
                    train_diagrams = fitted[0]

                    # Augment the raw training features and refit the model.
                    train_topo = extract_topological_features(train_diagrams)
                    augmented_train = np.concatenate(
                        [x_train, train_topo], axis=1)
                    model.fit(augmented_train, y_train)

                    # Augment the scoring set through the helper and score.
                    augmented_test = extract_features_for_prediction(
                        x_train, y_train, x_test, y_test, top_pipeline)

                    results.append({
                        "k_min": k_min,
                        "k_max": k_max,
                        "dist_percentage": dist_percentage,
                        "score": model.score(augmented_test, y_test),
                    })

        return results
Exemplo n.º 2
0
    def cross_validate(self, full_x, full_y, splitting_dates):
        """Select model and topology parameters and score them on a test set.

        The data is split by the ``date`` column of ``full_x``:

        * validation: ``splitting_dates[0] <= date < splitting_dates[1]``
        * test: ``splitting_dates[1] <= date < splitting_dates[2]``
        * train: everything before ``splitting_dates[0]`` or on/after
          ``splitting_dates[2]``

        Three stages follow, each reporting progress with ``print``:

        1. Pick random-forest parameters on the raw features and score the
           resulting model on the test set.
        2. Pick the topological parameters with that model, refit it on
           train + validation data augmented with topological features and
           score it on the augmented test set.
        3. Pick random-forest parameters again on the augmented features
           and score the resulting model on the augmented test set.

        Args:
            full_x: Feature table with a ``date`` column (accessed as
                ``full_x.date``); the column is removed before modelling.
            full_y: Targets, indexable by boolean masks built from
                ``full_x.date``.
            splitting_dates: Sequence whose first three items are the
                train/validation boundary, the validation/test boundary
                and the end of the test window.

        Returns:
            A tuple ``(best_model_params, best_topo,
            best_model_config_with_topo, score, score_top,
            score_best_topo_and_model)``.  The three parameter dicts have
            their ``"score"`` entry removed.
        """
        train_split_date = splitting_dates[0]
        val_split_date = splitting_dates[1]
        end_date = splitting_dates[2]

        # Build each boolean mask once and reuse it for features and targets.
        dates = full_x.date
        train_mask = (dates < train_split_date) | (dates >= end_date)
        val_mask = (dates >= train_split_date) & (dates < val_split_date)
        test_mask = (dates >= val_split_date) & (dates < end_date)

        # ``drop`` returns a new frame, so the filtered selections are never
        # mutated in place.
        train_x = full_x[train_mask].drop(columns="date").values
        val_x = full_x[val_mask].drop(columns="date").values
        test_x = full_x[test_mask].drop(columns="date").values
        train_y = full_y[train_mask].values
        val_y = full_y[val_mask].values
        test_y = full_y[test_mask].values

        # Stage 1: model selection on raw features.
        print("START VALIDATING MODEL")
        models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
        best_model_params = best_combination(models_cv)
        best_model_params.pop("score")
        best_model = RandomForestClassifier(**best_model_params)

        best_model.fit(train_x, train_y)

        score = best_model.score(test_x, test_y)
        print(f'score no_top {score}')
        print(f'best model parameters no_top {best_model_params}')

        # Stage 2: topology selection with the model chosen above.
        print("START VALIDATING PARAMS")
        topo_cv = self._validate_k_fold_top(best_model, train_x, train_y,
                                            val_x, val_y)
        best_topo = best_combination(topo_cv)
        best_topo.pop("score")
        # Use the same metric / parallelism as the pipelines that were
        # validated, so the final pipeline matches what was scored.
        subspace_kwargs = {"metric": "euclidean", "n_jobs": -1, **best_topo}
        best_topo_pipeline = Pipeline([
            ('extract_subspaces', SubSpaceExtraction(**subspace_kwargs)),
            ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1)),
        ])

        # For the final test evaluation, train on train + validation data.
        train_x_for_test = np.concatenate([train_x, val_x], axis=0)
        train_y_for_test = np.concatenate([train_y, val_y], axis=0)

        diagrams_train, _ = best_topo_pipeline.fit_transform_resample(
            train_x_for_test, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
        top_features_train = extract_topological_features(diagrams_train)

        x_train_model = np.concatenate([train_x_for_test, top_features_train],
                                       axis=1)
        best_model.fit(x_train_model, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TEST")
        # Pass the raw (un-augmented) training features, as the other call
        # sites do: ``test_x`` has no topological columns, so the reference
        # set must live in the same feature space.
        x_test_model = extract_features_for_prediction(train_x_for_test,
                                                       train_y_for_test,
                                                       test_x, test_y,
                                                       best_topo_pipeline)

        score_top = best_model.score(x_test_model, test_y)

        # Stage 3: model selection on augmented features.  The training
        # matrix must contain the train rows only so that it lines up with
        # ``train_y`` and the validation rows stay unseen.
        diagrams_train_only, _ = best_topo_pipeline.fit_transform_resample(
            train_x, train_y)
        train_x_with_topo = np.concatenate(
            [train_x, extract_topological_features(diagrams_train_only)],
            axis=1)

        val_x_with_topo = extract_features_for_prediction(
            train_x, train_y, val_x, val_y, best_topo_pipeline)

        print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
        model_config_with_topo = self._validate_k_fold_model(
            train_x_with_topo, train_y, val_x_with_topo, val_y)
        best_model_config_with_topo = best_combination(model_config_with_topo)
        best_model_config_with_topo.pop('score')

        best_model_with_topo = RandomForestClassifier(
            **best_model_config_with_topo)
        best_model_with_topo.fit(x_train_model, train_y_for_test)

        score_best_topo_and_model = best_model_with_topo.score(
            x_test_model, test_y)
        print(f'score best model and topo_feat {score_best_topo_and_model}')

        return (best_model_params, best_topo, best_model_config_with_topo,
                score, score_top, score_best_topo_and_model)