# Example 1
def test_threshold_without_refitting():
    """Check that raising ``threshold`` after fitting prunes more features."""
    estimator = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    selector = SelectFromModel(estimator, threshold=0.1)
    selector.fit(data, y)
    n_kept_low = selector.transform(data).shape[1]

    # A higher threshold must take effect on the next transform()
    # without any additional call to fit().
    selector.threshold = 1.0
    assert_greater(n_kept_low, selector.transform(data).shape[1])
# Example 2
def test_threshold_without_refitting():
    """Verify a fitted SelectFromModel honours a new ``threshold`` value."""
    base = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    sfm = SelectFromModel(base, threshold=0.1)
    sfm.fit(data, y)
    before = sfm.transform(data)

    # Tighten the threshold; fewer columns should survive, with no refit.
    sfm.threshold = 1.0
    after = sfm.transform(data)
    assert_greater(before.shape[1], after.shape[1])
# Example 3
def test_threshold_without_refitting():
    # Changing ``threshold`` on an already-fitted SelectFromModel must take
    # effect on the next transform() call without refitting the estimator.
    estimator = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                              random_state=0, tol=None)
    selector = SelectFromModel(estimator, threshold="0.1 * mean")
    selector.fit(data, y)
    n_kept_low = selector.transform(data).shape[1]

    # A stricter threshold should leave strictly fewer features.
    selector.threshold = "1.0 * mean"
    assert n_kept_low > selector.transform(data).shape[1]
gene_features, samples_names, training_samples = data_loader.loadExpressionData(
)
pam50_by_sample_name = data_loader.load_labels_data(samples_names)

# Build the label vector in the same order as the samples.
labels = [pam50_by_sample_name[sample_name] for sample_name in samples_names]

# Start with every gene selected; the loop below refines this mask.
selected_features = [True] * len(gene_features)

# An ExtraTreesClassifier provides impurity-based feature importances for
# SelectFromModel.  NOTE(review): the original comment claimed LassoCV was
# the base estimator — the code actually fits ExtraTreesClassifier.
clf = ExtraTreesClassifier()
clf.fit(training_samples, labels)
print("feature importances", clf.feature_importances_)

sfm = SelectFromModel(clf, prefit=True, threshold=0.001)
n_features = sfm.transform(training_samples).shape[1]

# Raise the threshold until at most 100 features remain.  The threshold
# attribute can be set directly instead of repeatedly refitting the
# meta-transformer.  (Original comment said "equals two", which did not
# match the `> 100` loop condition.)
while n_features > 100:
    sfm.threshold = sfm.threshold * 1.5
    X_transform = sfm.transform(training_samples)
    n_features = X_transform.shape[1]
    selected_features = sfm.get_support(False)

# Persist the boolean support mask; a context manager guarantees the file
# handle is closed even if pickling fails (original leaked the handle).
with open('selected_features_array.pkl', 'wb') as f:
    pickle.dump(selected_features, f)
print("Finished the model selection to {} genes.".format(n_features))
# Example 5
X, y = [exp_matrix, tol]

# Lasso regression: the L1 penalty drives coefficients to zero, so
# SelectFromModel can prune uninformative genes.
use_lasso = input("Use lasso? ")
if "yes" in use_lasso:
    clf = LassoCV()
    # Start from an arbitrary numeric threshold.  BUG FIX: the original
    # passed no threshold, so ``sfm.threshold`` stayed ``None`` and the
    # division in the loop below raised ``TypeError: None / 1.5``.
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)
    # BUG FIX: bind X_transform before the loop so it exists even when the
    # loop body never runs (the original hit a NameError at the plot below).
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Lower the threshold until at least two features survive.  (Original
    # comment said "top 5 features", which did not match ``< 2``.)
    while n_features < 2:
        sfm.threshold = sfm.threshold / 1.5
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    feats = sfm.get_support(indices=True)
    for feat in feats:
        print(gene_cols[feat])

    # 3-D scatter of the first two selected features against the target.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    feat1 = X_transform[:, 0]
    feat2 = X_transform[:, 1]
    ax.scatter(feat1, feat2, tol)
    plt.xlabel("feat1")
    plt.ylabel("feat2")
    plt.show()