def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    # ``max_iter=10`` with ``tol=None`` disables early stopping, so exactly 10
    # epochs are run -- the same schedule the old ``n_iter=10`` argument gave.
    clf = SGDClassifier(
        alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None
    )
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert X_transform.shape[1] > model.transform(data).shape[1]
def test_threshold_without_refitting():
    """Check that a new ``threshold`` takes effect on an already-fitted selector."""
    estimator = SGDClassifier(
        alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None
    )
    selector = SelectFromModel(estimator, threshold="0.1 * mean")
    selector.fit(data, y)
    n_kept_before = selector.transform(data).shape[1]

    # Raise the threshold on the fitted selector; no second fit() call is made,
    # yet fewer features must survive the next transform.
    selector.threshold = "1.0 * mean"
    n_kept_after = selector.transform(data).shape[1]
    assert n_kept_before > n_kept_after
# Load the expression data and look up the label of every sample, keeping the
# labels in the same order as ``samples_names``.
gene_features, samples_names, training_samples = data_loader.loadExpressionData()
pam50_by_sample_name = data_loader.load_labels_data(samples_names)
labels = [pam50_by_sample_name[sample_name] for sample_name in samples_names]

# Fit a tree ensemble; its ``feature_importances_`` drive the selection below.
clf = ExtraTreesClassifier()
clf.fit(training_samples, labels)
# ``format`` keeps the output identical under both Python 2 and Python 3.
print("feature importancies {}".format(clf.feature_importances_))

# ``prefit=True`` reuses the classifier fitted above instead of refitting it.
sfm = SelectFromModel(clf, prefit=True, threshold=0.001)
n_features = sfm.transform(training_samples).shape[1]

# Raise the threshold until at most 100 features remain.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 100:
    sfm.threshold = sfm.threshold * 1.5
    X_transform = sfm.transform(training_samples)
    n_features = X_transform.shape[1]

# Persist the support mask (``indices=False``) of the selected features.
selected_features = sfm.get_support(False)
# The context manager guarantees the pickle file is flushed and closed.
with open('selected_features_array.pkl', 'wb') as selected_features_file:
    pickle.dump(selected_features, selected_features_file)
print("Finished the model selection to {} genes.".format(n_features))
X, y = exp_matrix, tol

# Lasso-based feature selection, run only when the user opts in.
use_lasso = input("Use lasso? ")
if "yes" in use_lasso:
    clf = LassoCV()
    # No explicit threshold is given, so SelectFromModel chooses its default
    # when fitted.
    sfm = SelectFromModel(clf)
    sfm.fit(X, y)
    # ``threshold`` is still None at this point; copy the fitted numeric value
    # (``threshold_``) onto it so the loop below can scale it without a
    # TypeError. The selection itself is unchanged by this assignment.
    sfm.threshold = sfm.threshold_
    # Transform once up front so ``X_transform`` exists even when the loop
    # body never runs.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Lower the threshold until at least two features are selected; the plot
    # below needs two feature columns.
    while n_features < 2:
        sfm.threshold = sfm.threshold / 1.5
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Report the selected columns by name.
    feats = sfm.get_support(indices=True)
    for feat in feats:
        print(gene_cols[feat])

    # 3D scatter: first two selected features against the target values.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    feat1 = X_transform[:, 0]
    feat2 = X_transform[:, 1]
    ax.scatter(feat1, feat2, tol)
    plt.xlabel("feat1")
    plt.ylabel("feat2")
    plt.show()