Example #1
import pickle

import numpy as np
import pandas as pd
from sklearn import feature_selection as f_selection
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC, SVR

FS_PICKLE = 'feature_selection.pkl'  # assumed output path for the pickled results


def select_features(x, y):
    """Combine several feature-selection strategies in a FeatureUnion and
    grid-search their parameters.

    :param x: dataframe of features
    :param y: dataframe of target property
    :return: feature matrix transformed by the best feature union found
    """
    x = pd.DataFrame(x)

    # Removing features with low variance
    var_threshold = f_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))

    # Kbest-based and Percentile-based feature selection using regression;
    # score_func takes the scoring callable itself, not precomputed scores
    kbest = f_selection.SelectKBest(score_func=f_selection.f_regression, k=2)
    percent = f_selection.SelectPercentile(score_func=f_selection.f_regression,
                                           percentile=10)

    # Tree-based feature selection using a number of randomized decision
    # trees; the estimator is passed unfitted and is fitted inside the union
    trees = f_selection.SelectFromModel(ExtraTreesRegressor())

    # "False positive rate"-based feature selection using regression
    fpr = f_selection.SelectFpr(score_func=f_selection.f_regression, alpha=0.05)

    # PCA-component evaluation
    pca = PCA(n_components=2)

    # Recursive feature elimination and cross-validated feature selection
    estimator = SVR(kernel="linear")
    selector = f_selection.RFECV(estimator, step=1, cv=5)

    # Build a single estimator from the PCA and all selection strategies above:
    combined_features = FeatureUnion([("pca_based", pca),
                                      ("univ_kbest", kbest),
                                      ("false_positive_rate", fpr),
                                      ("percentile_based", percent),
                                      ("RFECV_selector", selector),
                                      ("variance_threshold", var_threshold),
                                      ("trees_based", trees)])
    # Fit the union once to materialize the combined feature matrix; the
    # grid search below refits it inside the pipeline
    x_union_features = combined_features.fit_transform(x, y)

    svm = SVC(kernel="linear")

    # Do grid search over all parameters; the pipeline takes the transformer
    # itself, not the already-transformed matrix
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    # Nested parameters use the <step>__<param> double-underscore syntax;
    # float-valued grids need np.arange, since range() is integer-only.
    # NB: this exhaustive grid is enormous; in practice a smaller grid or
    # RandomizedSearchCV is advisable.
    grid = dict(features__pca_based__n_components=range(1, 101),
                features__univ_kbest__k=range(1, 101),
                features__false_positive_rate__alpha=np.arange(0.01, 1.0, 0.01),
                features__percentile_based__percentile=range(1, 20, 1),
                features__RFECV_selector__cv=range(2, 6),
                features__variance_threshold__threshold=np.arange(0.0, 1.0, 0.01),
                svm__C=[0.01, 0.1, 1.0, 10.0])

    grid_search = GridSearchCV(pipeline, param_grid=grid, verbose=0)
    # GridSearchCV exposes no fit_transform when the final step is a
    # classifier; fit it, then transform with the best feature union found
    grid_search.fit(x, y)
    x_features = grid_search.best_estimator_.named_steps["features"].transform(x)

    # Pickling feature reduction outputs
    with open(FS_PICKLE, 'wb') as result:
        pickle.dump(grid_search.best_score_, result, pickle.HIGHEST_PROTOCOL)
        pickle.dump(grid_search.best_estimator_, result,
                    pickle.HIGHEST_PROTOCOL)

    print(grid_search.best_estimator_)

    return x_features
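
# A minimal standalone sketch (not part of the original listing): the names
# X_demo/y_demo and the dataset sizes are illustrative assumptions. It shows
# how one of the selectors built above reports which columns it keeps.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=20, random_state=0)
kbest_demo = f_selection.SelectKBest(score_func=f_selection.f_regression, k=2)
kbest_demo.fit(X_demo, y_demo)
print(kbest_demo.get_support(indices=True))  # indices of the two kept columns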

# PCA-component visualization: 3D scatters of principal-component triples,
# colored by the target values. `df` is assumed to hold the dataset with the
# target in an `sval` column (pIC50 values), as in the t-SNE block below; the
# four-component PCA and the triple combinations are reconstructed here.
from itertools import combinations

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3d projection

pca_result = PCA(n_components=4).fit_transform(df.iloc[:, 1:])
combos = list(combinations(range(4), 3))

fig = plt.figure(figsize=(12, 8))
for idx, combo in enumerate(combos):
    ax = fig.add_subplot(len(combos) // 2, 2, idx + 1, projection='3d')
    ax.scatter(
        pca_result[:, combo[0]],
        pca_result[:, combo[1]],
        pca_result[:, combo[2]],
        c=df.sval,
        s=20,
        cmap='YlOrRd'  # red are the compounds with higher values of pIC50
    )
    ax.view_init(elev=30, azim=45)
    ax.set_xlabel('PC%s' % (combo[0] + 1))
    ax.set_ylabel('PC%s' % (combo[1] + 1))
    ax.set_zlabel('PC%s' % (combo[2] + 1))

plt.show()
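
# A short optional check (not in the original): how much variance the four
# components explain, to justify the number of components chosen above
pca_check = PCA(n_components=4).fit(df.iloc[:, 1:])
print(pca_check.explained_variance_ratio_)        # one ratio per component
print(pca_check.explained_variance_ratio_.sum())  # total variance captured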

# t-SNE embedding of the first 2,000 samples (t-SNE scales poorly with the
# number of samples, hence the subset), colored by the same target values
from sklearn.manifold import TSNE

model = TSNE(n_components=2)
TSNEdata = model.fit_transform(df.iloc[:2000, 1:])

# Wrap the embedding in a frame and attach the matching target values
TSNEdf = pd.DataFrame(TSNEdata, columns=('x', 'y'))
TSNEdf['c'] = pd.Series(df.sval.values[:2000], index=TSNEdf.index)

plot = TSNEdf.plot.scatter(x='x', y='y', c='c', cmap='plasma')

plt.show()
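
# A common speed-up variation (a sketch, not from the original): reduce the
# features with PCA before t-SNE; the 50 components and the perplexity value
# are assumptions to tune for the data at hand.
reduced = PCA(n_components=50).fit_transform(df.iloc[:2000, 1:])
model_fast = TSNE(n_components=2, perplexity=30, random_state=0)
TSNEdata_fast = model_fast.fit_transform(reduced)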