def select_features(x, y):
    """Run several feature-selection strategies, grid-search over them, and
    return the selected feature matrix.

    :param x: dataframe (or array-like) of features
    :param y: dataframe/array of the target property
    :return: feature matrix produced by the best fitted FeatureUnion
    """
    x = pd.DataFrame(x)

    # Removing features with low variance (Bernoulli variance at p=0.8).
    var_threshold = f_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))

    # K-best and percentile-based selection using regression scoring.
    # NOTE: score_func must be the *callable* f_regression, not the result
    # of calling it — the selector calls it internally during fit.
    kbest = f_selection.SelectKBest(score_func=f_selection.f_regression, k=2)
    percent = f_selection.SelectPercentile(
        score_func=f_selection.f_regression, percentile=10)

    # Tree-based selection with randomized decision trees. The estimator
    # must be an *instance*; it is fitted by the pipeline, so prefit stays
    # at its default (False).
    trees = f_selection.SelectFromModel(ExtraTreesRegressor())

    # "False positive rate"-based selection using regression scoring.
    fpr = f_selection.SelectFpr(score_func=f_selection.f_regression, alpha=0.05)

    # PCA-component evaluation.
    pca = PCA(n_components=2)

    # Recursive feature elimination with cross-validation.
    estimator = SVR(kernel="linear")
    selector = f_selection.RFECV(estimator, step=1, cv=5)

    # Build a union of all the selectors above.
    combined_features = FeatureUnion([("pca_based", pca),
                                      ("univ_kbest", kbest),
                                      ("false_positive_rate", fpr),
                                      ("percentile_based", percent),
                                      ("RFECV_selector", selector),
                                      ("variance_threshold", var_threshold),
                                      ("trees_based", trees)])

    svm = SVC(kernel="linear")

    # The pipeline takes the *transformer object*, not pre-transformed data,
    # so GridSearchCV can refit it per parameter combination.
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    # Grid keys use the step__param double-underscore convention.
    # range() cannot take float steps, so float grids are built explicitly;
    # cv must be >= 2, so the cv grid starts at 2.
    grid = dict(
        features__pca_based__n_components=range(1, 101),
        features__univ_kbest__k=range(1, 101),
        features__false_positive_rate__alpha=[i / 100.0 for i in range(100)],
        features__percentile_based__percentile=range(1, 20),
        features__RFECV_selector__cv=range(2, 5),
        features__variance_threshold__threshold=[i / 100.0 for i in range(100)],
        svm__C=[0.01, 0.1, 1.0, 10.0],
    )
    grid_search = GridSearchCV(pipeline, param_grid=grid, verbose=0)
    grid_search.fit(x, y)

    # The fitted pipeline ends in a classifier, which has no transform();
    # pull the fitted FeatureUnion step out and transform with it directly.
    x_features = (grid_search.best_estimator_
                  .named_steps["features"].transform(x))

    # Pickle the feature-reduction output (the best fitted estimator).
    with open(FS_PICKLE, 'wb') as result:
        pickle.dump(grid_search.best_estimator_, result,
                    pickle.HIGHEST_PROTOCOL)

    print(grid_search.best_estimator_)
    return x_features
ax = fig.add_subplot(len(combos) / 2, 2, idx + 1, projection='3d') ax.scatter( pca_result[:,combo[0]] , pca_result[:,combo[1]] , pca_result[:,combo[2]] , c=df.sval , s=20 , cmap='YlOrRd' # red are the compounds with higher values of pIC50 ) ax.view_init(elev=30, azim=45) ax.set_xlabel('PC%s' % (combo[0] + 1)) ax.set_ylabel('PC%s' % (combo[1] + 1)) ax.set_zlabel('PC%s' % (combo[2] + 1)) plt.show() ''' from sklearn.manifold import TSNE model = TSNE(n_components=2) TSNEdata = model.fit_transform(df.iloc[:2000, 1:]) TSNEdf = pd.DataFrame(TSNEdata, columns=('x', 'y')) TSNEdf['c'] = pd.Series(df.sval.values[:2000], index=TSNEdf.index) plot = TSNEdf.plot.scatter(x='x', y='y', c='c', cmap='plasma') plt.show()