def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath,
                     test_filepath):
    logger.info('FEATURE SELECTION...')

    if (config.params.clean_experiment_directory_before_training
            and os.path.isdir(config.params.experiment_dir)):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)

    train_set = data['train']

    # Split the target column out as a flat label vector
    y = train_set[config.TARGET_COL].values.reshape(-1)
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG,
                                        suffix=tag)

    # Backward elimination: start from all columns and prune, keeping the
    # best-scoring subset of anywhere from 10 features up to all of them
    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    # Plot mean CV score against subset size
    fig = plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
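A minimal, self-contained sketch of the same pattern (not from the repo above; the toy dataset and estimator are stand-ins): passing a (min, max) tuple as k_features makes mlxtend's selector try every subset size in that range and keep the best-scoring one.

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
sfs = SequentialFeatureSelector(
    estimator=LogisticRegression(max_iter=1000),
    k_features=(10, X.shape[1]),  # search subset sizes from 10 up to all 20
    forward=False,                # backward elimination, as in the example above
    cv=5,
    scoring='roc_auc',
)
sfs.fit(X, y)
print(sfs.k_feature_idx_, sfs.k_score_)  # best subset found and its mean CV score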
Example #2
def test_model(model: sk.base.BaseEstimator, x_train: pd.DataFrame,
               y_train: pd.DataFrame, x_test: pd.DataFrame,
               y_test: pd.DataFrame, title: str, emotion: str, width: int,
               location: str, out: str = 'models', n_jobs: int = 1,
               cv: Optional[int] = None) \
        -> Dict[str, Union[str, float, int]]:
    # For a single emotion, select its column; 'all' keeps the full frame
    y_train_target = y_train[f"middle.emotions.{emotion}"] if emotion != 'all' \
        else y_train
    y_test_target = y_test[f"middle.emotions.{emotion}"] if emotion != 'all' \
        else y_test

    out_path = pathlib.Path(out) \
               / f"w{width}/{location}" / f"{title.lower().replace(' ', '-')}"
    out_path.mkdir(parents=True, exist_ok=True)
    report = {}
    logger.info("Analyzing %s on %s", title, emotion)
    report['model'] = title
    report['target'] = emotion
    report['width'] = width
    report['location'] = location
    # Keep an untrained clone for the final fit after feature selection
    backup_model = sk.base.clone(model)

    try:
        features = analyze_model(model, x_train, y_train_target, n_jobs=n_jobs)
    except Exception as e:
        import textwrap
        logger.error("There was an error of type %s", type(e).__name__)
        logger.error("Error message: %s", str(e))
        with open(out_path / f"no-{emotion}.txt", 'w',
                  encoding='utf-8') as file:
            file.write("Error in generating the model.\n\n")
            file.write("Error message\n")
            file.write("-------------\n\n")
            file.write(textwrap.fill(str(e), 80))
        return report

    plot_sequential_feature_selection(features.get_metric_dict(),
                                      kind='std_dev')
    logger.info("Saving feature selection diagram")
    plt.title(
        f'{title} on {emotion} (w/StdDev, width: {width}, location: {location})'
    )
    plt.grid()
    plt.savefig(out_path / f"{emotion}.svg")

    if cv is not None:
        logger.info("Cross validating model")
        scores = cross_validate(
            model,
            features.transform(x_train),
            y_train_target,
            cv=cv,
            n_jobs=n_jobs,
        )
        logger.info("Saving cross validation results to a CSV")
        pd.DataFrame(scores).to_csv(out_path / f'{emotion}-cv.csv',
                                    encoding='utf-8')

    logger.info("Training final model")
    start_time = time.time()
    backup_model.fit(features.transform(x_train), y_train_target)
    end_time = time.time()
    report['training_time'] = end_time - start_time
    logger.info("Training completed in %.3f seconds", report['training_time'])

    try:
        logger.info("Testing final model")
        y_pred = backup_model.predict(features.transform(x_test))
        report['test_accuracy'] = sk.metrics.accuracy_score(
            y_test_target, y_pred)
        report_text = sk.metrics.classification_report(
            y_test_target,
            y_pred,
            # target_names=(y_test.columns if emotion == 'all' else range(7))
        )
        with open(out_path / f"{emotion}-report.txt", 'w',
                  encoding='utf-8') as file:
            logger.info("Saving report to file")
            file.write(report_text)
    except Exception as e:
        logger.error(
            "There was a %s in the testing phase. Testing phase skipped."
            "\nError message: %s", type(e).__name__, str(e))

    logger.info("Saving final model to file...")
    joblib.dump(backup_model, out_path / f"{emotion}.joblib")

    report['n_features'] = len(features.k_feature_names_)
    report['features'] = features.k_feature_names_
    report['score'] = features.k_score_

    return report
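analyze_model is internal to this repo, but the features object it returns is used like a fitted mlxtend SequentialFeatureSelector: it exposes transform, get_metric_dict, k_feature_names_, and k_score_. A sketch of that contract, with a plain SFS as a stand-in for whatever analyze_model actually fits:

import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=150, n_features=8, random_state=1)
features = SequentialFeatureSelector(
    DecisionTreeClassifier(random_state=1), k_features=4, cv=3
).fit(X, y)

X_selected = features.transform(X)  # the reduced matrix passed to fit/predict above
metrics = pd.DataFrame(features.get_metric_dict()).T  # one row per subset size tried
print(features.k_feature_names_, features.k_score_)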
Example #3
def step_forward(X, y, name):
    print(name)
    # Inspiration: https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html

    # Set up training/testing standardized data
    X_train_std, X_test_std, y_train, y_test = tts_std(X, y)

    # Build logistic regression classifier to use in feature selection:
    # liblinear is the recommended solver for high-dimensional datasets, but
    # once you standardize your data, the accuracy of all solvers is pretty
    # much the same. max_iter caps the number of solver iterations; raise it
    # above the default of 100 if you see a no-convergence warning.
    clf = skllm.LogisticRegression(penalty='l1',
                                   C=0.1,
                                   solver='liblinear',
                                   max_iter=100)

    # Build step forward feature selection: cv=10 runs 10-fold cross-validation
    # on each candidate subset, k_features=5 means we are selecting the 5 best
    # attributes to describe our target, and verbose is just used for logging
    # the progress of the feature selector
    sfs1 = mlx.SequentialFeatureSelector(clf,
                                         k_features=5,
                                         forward=True,
                                         floating=False,
                                         verbose=0,
                                         scoring='accuracy',
                                         cv=10)

    # Perform SFS
    sfs1 = sfs1.fit(X_train_std, y_train, custom_feature_names=X.columns)

    # Which features?
    print('\t' + 'Top 5 features: ' + str(sfs1.k_feature_names_))
    feat_cols1 = list(sfs1.k_feature_idx_)

    # Build full model with selected features: sfs has no predict function
    clf = skllm.LogisticRegression(penalty='l1',
                                   C=0.1,
                                   solver='liblinear',
                                   max_iter=100)
    # Now that we have the relevant features according to SFS, we can use logistic regression
    # on JUST those features and see how accurately they can predict the classification of
    # single loaded, clear, straight, etc.
    clf.fit(X_train_std[:, feat_cols1], y_train)

    # 'kind' represents the kind of error bar you get in your plot {'std_dev', 'std_err', 'ci',
    # None}. This error bar is the error of the cv scores.
    fig1 = mlxp.plot_sequential_feature_selection(sfs1.get_metric_dict(),
                                                  kind='std_dev')
    plt.title('Sequential Forward Feature Selection CV Scores: ' + name +
              ' (std dev)')
    plt.ylabel('Mean CV Score')
    plt.grid()
    plt.savefig('feature_selection/sfs_' + name + ".png")
    plt.close()

    # Accuracy
    y_train_pred = clf.predict(X_train_std[:, feat_cols1])
    print('\tTraining accuracy on selected features: %.3f' %
          sklm.accuracy_score(y_train, y_train_pred))
    print('\tTraining mean absolute error on selected features: %.3f' %
          mean_abs_error(y_train, y_train_pred))
    y_test_pred = clf.predict(X_test_std[:, feat_cols1])
    print('\tTesting accuracy on selected features: %.3f' %
          sklm.accuracy_score(y_test, y_test_pred))
    print('\tTesting mean absolute error on selected features: %.3f' %
          mean_abs_error(y_test, y_test_pred))

    # Confusion matrix generation
    confusion_matrix(y_train, y_train_pred, name + "_sfs_Training_Data_")
    confusion_matrix(y_test, y_test_pred, name + "_sfs_Testing_Data_")
    my_auc(y_train, X_train_std[:, feat_cols1], name + '_sfs_training',
           sfs1.k_feature_names_)

    # CV scores, computed on the raw full-feature data (cross_val_score refits
    # clones of clf, so this evaluates the unselected, unstandardized features)
    scores = sklms.cross_val_score(clf, X, y, cv=4)
    print('\t' + name + ' CVs: ' + str(scores))

    return sfs1, clf, pd.DataFrame.from_dict(sfs1.get_metric_dict()).T
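This function relies on module-level aliases that are not shown in the excerpt. Judging from how they are used, the import header is presumably along these lines (a reconstruction, not the repo's actual code); tts_std, confusion_matrix, my_auc, and mean_abs_error are project-local helpers and are left as-is:

# Probable imports for step_forward, reconstructed from usage
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model as skllm       # skllm.LogisticRegression
import sklearn.metrics as sklm             # sklm.accuracy_score
import sklearn.model_selection as sklms    # sklms.cross_val_score
import mlxtend.feature_selection as mlx    # mlx.SequentialFeatureSelector
import mlxtend.plotting as mlxp            # mlxp.plot_sequential_feature_selection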