def get_best_feature_subset(X, Y):
    """Pick the RFE feature subset that yields the best macro-F1 score.

    The data is split 90/10 into train and test parts.  For every subset size
    from 5 to 32, a logistic regression wrapped in RFE is fitted on the
    training part and scored on the held-out part.  Progress and the final
    classification report are printed.

    Args:
        X: feature table; must expose ``.values`` and ``.columns``.
        Y: target values; must expose ``.values`` after splitting.

    Returns:
        List with the column names selected by the best-scoring model.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import f1_score
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.1)
    # Fit and predict on the same kind of input (plain arrays) so the
    # selector never sees named columns at predict time only.
    test_values = testX.values

    # Start below any achievable score so a model is always retained,
    # even when every candidate scores exactly 0.
    best_f1 = -1.0
    best_model = None
    estimator = LogisticRegression()
    for i in range(5, 33):
        # n_features_to_select is keyword-only in current scikit-learn.
        rfe = RFE(estimator, n_features_to_select=i)
        rfe.fit(trainX.values, trainY.values)

        predictions = rfe.predict(test_values)

        # scikit-learn metrics expect (y_true, y_pred).
        f1 = f1_score(testY, predictions, average='macro')
        print(i)
        print(f1)
        if f1 > best_f1:
            best_f1 = f1
            best_model = rfe
    print("The subset of features for the best performing model are:")
    result = [
        column
        for column, chosen in zip(trainX.columns.values, best_model.support_)
        if chosen
    ]
    print(result)
    print(classification_report(testY, best_model.predict(test_values)))
    return result
Exemplo n.º 2
0
def do_learning(X_training, Y_training, X_test, Y_test, reference_dic, model_class):
    '''
    credit: Juan Arroyo-Miranda & Dani Alcala

    Fit a recursive-feature-elimination selector around ``model_class`` on
    the training data and evaluate it on the test data.

    Returns the tuple ``(expected, predicted, best_features_names, accuracy)``
    where ``expected`` is ``Y_test``, ``predicted`` are the selector's
    predictions for ``X_test``, ``best_features_names`` are the entries of
    ``reference_dic`` looked up by the selected feature indices, and
    ``accuracy`` is the accuracy score of the predictions.
    '''
    # Recursive Feature Elimination around the supplied estimator
    selector = RFE(model_class).fit(X_training, Y_training)

    # Translate the surviving column indices into readable names
    kept_indices = selector.get_support(indices=True)
    best_features_names = [reference_dic[index] for index in kept_indices]

    predicted = selector.predict(X_test)

    return (Y_test, predicted, best_features_names,
            accuracy_score(Y_test, predicted))
Exemplo n.º 3
0
def test_rfe():
    """Check RFE on iris data padded with six noise columns.

    The selector is asked for four features; the assertions verify that it
    keeps exactly the original iris columns, that dense and sparse inputs
    agree, and that predictions/scores match a classifier trained on the
    reduced data.
    """
    rng = check_random_state(0)
    iris = load_iris()
    noise = rng.normal(size=(len(iris.data), 6))
    X = np.c_[iris.data, noise]
    y = iris.target
    X_sparse = sparse.csr_matrix(X)

    # dense model
    dense_clf = SVC(kernel="linear")
    dense_rfe = RFE(estimator=dense_clf, n_features_to_select=4, step=0.1)
    dense_rfe.fit(X, y)
    X_reduced = dense_rfe.transform(X)
    dense_clf.fit(X_reduced, y)
    assert len(dense_rfe.ranking_) == X.shape[1]

    # sparse model
    sparse_clf = SVC(kernel="linear")
    sparse_rfe = RFE(estimator=sparse_clf, n_features_to_select=4, step=0.1)
    sparse_rfe.fit(X_sparse, y)
    X_reduced_sparse = sparse_rfe.transform(X_sparse)

    # the reduced matrix must be the untouched iris features
    assert X_reduced.shape == iris.data.shape
    assert_array_almost_equal(X_reduced[:10], iris.data[:10])

    # selector and the classifier fitted on the reduced data must agree
    assert_array_almost_equal(dense_rfe.predict(X), dense_clf.predict(iris.data))
    assert dense_rfe.score(X, y) == dense_clf.score(iris.data, iris.target)
    assert_array_almost_equal(X_reduced, X_reduced_sparse.toarray())
Exemplo n.º 4
0
    def recursive_feature_elimination(config_learning, config_data, number_features):
        """Rank features with RFE, write the ranking to disk and predict.

        The training/test data and the estimator are loaded from the paths
        and settings in ``config_learning``.  The ranking of every feature
        name is printed and written, tab-separated, to ``feature_ranks.txt``
        inside the directory configured under ``Learner``/``models`` in
        ``config_data``.

        Args:
            config_learning: mapping with ``x_train``, ``y_train``,
                ``x_test`` entries and an optional ``scale`` flag
                (defaults to True).
            config_data: configuration object providing the models
                directory and the feature combinations.
            number_features: how many features RFE should keep.

        Returns:
            Predictions of the fitted selector for the test features.
        """
        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        # n_features_to_select is keyword-only in current scikit-learn.
        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        ranks_path = os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt"
        # The context manager closes the file even if writing fails.
        with open(ranks_path, "w") as output:
            for i, name in enumerate(feature_names):
                line = name + "\t" + str(rfe.ranking_[i])
                output.write(line + "\n")
                print(line)

        return rfe.predict(x_test)
Exemplo n.º 5
0
def feature_selection_LR():
    """Select 30 features with RFE over a random forest and print a report.

    Works entirely on module-level data (``X_train_scaled``, ``y_train``,
    ``X_test_scaled``, ``y_test``, ``X_train``, ``predictors``).  Prints the
    selected feature names followed by accuracy, error, AUC, recall,
    precision, F-measure and the confusion matrix for the test data.
    """
    from sklearn.feature_selection import RFE

    rfe_selector = RFE(
        estimator=RandomForestClassifier(),
        n_features_to_select=30,
        step=5,
        verbose=5,
    )
    rfe_selector.fit(X_train_scaled, y_train)

    y_pred = rfe_selector.predict(X_test_scaled)
    # probability of the second class, used for the AUC
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]

    support_mask = rfe_selector.get_support()
    rfe_feature = X_train[predictors].loc[:, support_mask].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)

    # Model report on the test data
    print("\nModel Report")
    print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    print('Test error: {:.3f}'.format(1 - metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))

    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    first_row = c_matrix[0]
    second_row = c_matrix[1]
    print('========Confusion Matrix==========')
    print("          Rejected    Accepted")
    print('Rejected     {}      {}'.format(first_row[0], first_row[1]))
    print('Accepted     {}      {}'.format(second_row[0], second_row[1]))
Exemplo n.º 6
0
def recursive_feature_elimination():
    """Use RFE on a linear regression to choose 5 of 20 nodes on [0, 1].

    100000 ``VelOscillator`` instances are evaluated at 20 equally spaced
    nodes (the features) and integrated over [0, 1] (the target).  The
    selected nodes and the root-mean-square error of the fitted selector on
    that same data are printed.
    """
    # ---- synthetic data set ----
    num_nodes = 20
    num_nodes_choose = 5
    set_size = 100000
    nodes = np.linspace(0, 1, num_nodes)

    x = np.empty((num_nodes, set_size))  # row k holds node k for all samples
    y = np.empty(set_size)

    for sample in range(set_size):
        oscillator = VelOscillator()
        x[:, sample] = [oscillator(node) for node in nodes]
        y[sample] = oscillator.integral(0, 1)

    # ---- recursive feature elimination on a linear regression ----
    samples = np.transpose(x)  # one row per sample, as the estimator expects
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=num_nodes_choose)
    rfe.fit(samples, y)
    print('selected the nodes: {}'.format(nodes[rfe.support_]))

    # ---- root-mean-square error on the data used for fitting ----
    residuals = rfe.predict(np.transpose(x)) - y
    error = np.mean(residuals**2)**0.5
    print('The chosen nodes and weights yield an error of {}'.format(error))
Exemplo n.º 7
0
def wbc_wrapper():
    """Run wrapper-style feature selection (RFE, top 5) on the WBC data.

    Fits an SGD classifier inside RFE and prints the accuracy and F1 score
    measured on the same data that was used for fitting.
    """
    wbc_data = load_wbc_data()
    wbc_values, wbc_labels = data_preprocessing(wbc_data)
    estimator = SGDClassifier(max_iter=1000)
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(wbc_values, wbc_labels)
    score = selector.score(wbc_values, wbc_labels)
    # scikit-learn metrics expect (y_true, y_pred).
    f1score = f1_score(wbc_labels, selector.predict(wbc_values))
    print('WBC-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
Exemplo n.º 8
0
def sonar_wrapper():
    """Run wrapper-style feature selection (RFE, top 5) on the sonar data.

    Fits an SGD classifier inside RFE and prints the accuracy and F1 score
    measured on the same data that was used for fitting.
    """
    sonar_data = load_sonar_data()
    sonar_values, sonar_labels = data_preprocessing(sonar_data)
    estimator = SGDClassifier(max_iter=1000)
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(sonar_values, sonar_labels)
    score = selector.score(sonar_values, sonar_labels)
    # scikit-learn metrics expect (y_true, y_pred).
    f1score = f1_score(sonar_labels, selector.predict(sonar_values))
    print('Sonar-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
Exemplo n.º 9
0
def train_recursive_feature_elimination(x_train, y_train, x_test, y_test):
    """Fit a 4-feature RFE logistic regression and print its test metrics.

    The selector is fitted on ``x_train``/``y_train``; its predictions for
    ``x_test`` are handed, together with ``y_test``, to
    ``print_metrices_out``.
    """
    print("-------------RFE Model-------------")
    model = LogisticRegression(solver='lbfgs')
    # n_features_to_select is keyword-only in current scikit-learn.
    rfe = RFE(model, n_features_to_select=4)
    # RFE Fit
    rfe.fit(x_train, y_train)
    # RFE Predict
    y_predicted = rfe.predict(x_test)
    print_metrices_out(y_predicted, y_test)
def recursiveFeatureElimination(label, features):
    """Fit a 4-feature RFE linear regression and report ranking and R^2.

    Prints the sorted (ranking, item) pairs obtained by zipping the RFE
    ranking with the items produced by iterating ``features`` (presumably
    the column labels when a DataFrame is passed -- confirm with callers),
    then prints the R^2 score on the data used for fitting.

    Returns:
        The fitted RFE selector.
    """
    model = linear_model.LinearRegression()
    rfe = RFE(model, n_features_to_select=4)
    rfe = rfe.fit(features, label)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    rankings = [round(rank, 4) for rank in rfe.ranking_]
    print(sorted(zip(rankings, features)))
    prediction = rfe.predict(features)
    r2Score = r2_score(label, prediction)
    print(r2Score)
    return rfe
Exemplo n.º 11
0
def elimination_feature():
    """Fit a 3-feature RFE linear regression for the 'Milk' target.

    Returns:
        Tuple ``(score, err, y_predict)``: the selector's score on the test
        split, the mean squared error of its test predictions, and the
        predictions themselves.
    """
    X_train, X_test, y_train, y_test = _train_test(_load_data(), 'Milk')

    selector = RFE(LinearRegression(), n_features_to_select=3)
    selector.fit(X_train, y_train)

    y_predict = selector.predict(X_test)
    score = selector.score(X_test, y_test)
    return score, mean_squared_error(y_test, y_predict), y_predict
Exemplo n.º 12
0
def linrfe():
    """
    Ridge baseline, RFE feature reduction, polynomial expansion and LightGBM.

    Loads ``train.txt`` (svmlight format), standardises the features and
    prints R^2 / MSE of a Ridge model before and after RFE (500 features
    kept).  The reduced features are expanded with degree-2 interaction
    terms, a grid search over the Ridge ``alpha`` is logged, and finally a
    LightGBM regressor is trained on the expanded features and its training
    MSE is logged.

    Original author notes (translated):
    To finish the computation quickly, step=xx has to be set fairly large.

    ridge : 0.28+
    ridge + RFE: 0.28+
    The online score was 0.045 though; this offline check appears to be
    completely unreliable.
    """
    X, y = load_svmlight_file('train.txt')
    X = X.toarray()
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    reg = linear_model.Ridge(alpha=0.5)
    reg.fit(X, y)
    # %-formatting keeps the printed text identical on Python 2 and 3.
    print('r^2= %s' % reg.score(X, y))
    print('train mse =  %s' % mean_squared_error(y, reg.predict(X)))

    rfe = RFE(estimator=reg, n_features_to_select=500, step=1000, verbose=2)
    rfe.fit(X, y)
    print('rfe r^2 =  %s' % rfe.score(X, y))
    print('rfe mse = %s' % mean_squared_error(y, rfe.predict(X)))

    X_rfe = rfe.transform(X)
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    # Expanding the unreduced features directly raises MemoryError.
    X_poly = poly.fit_transform(X_rfe)

    param_grid = {'alpha': [0.5, 1, 10, 100, 1000, 1e4, 3e4]}
    grid = GridSearchCV(reg,
                        param_grid,
                        verbose=2,
                        scoring='neg_mean_squared_error',
                        cv=5)
    grid.fit(X_poly, y)
    logging.info('after rfe poly, best_result = {0}'.format(grid.best_score_))
    logging.info('after rfe poly, best_param= {0}'.format(grid.best_params_))

    params = {
        'objective': 'mse',
        'num_leaves': 8,
        'learning_rate': 0.05,
        'min_child_samples': 60,  # fairly important for this task
        # 'subsample': 0.9,
        'n_estimators': 100,
        'silent': False,
    }
    gbm = lgb.LGBMRegressor(**params)
    gbm.fit(X_poly, y, eval_metric='mse', eval_set=[(X_poly, y)])

    # predict() takes the features only; the target must not be passed.
    logging.info('train lgb of poly = {0}'.format(
        mean_squared_error(y, gbm.predict(X_poly))))
Exemplo n.º 13
0
def runRFE(x, y, x_test, y_test, display=False):
    """Fit a 5-feature RFE around multinomial naive Bayes.

    Args:
        x, y: training features and labels; ``y`` must provide ``unique()``.
        x_test, y_test: evaluation features and labels.
        display: when True, hand the confusion matrix to ``cd.display``.

    Returns:
        Confusion matrix of the test predictions, with rows/columns ordered
        by ``y.unique()``.
    """
    print("Bayes with feature selection")
    bayes = MultinomialNB()
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = RFE(bayes, n_features_to_select=5, step=1)
    selector.fit(x, y)
    y_pred = selector.predict(x_test)
    labels = y.unique()
    # labels is keyword-only in current scikit-learn.
    confusion = confusion_matrix(y_test, y_pred, labels=labels)
    if display:
        cd.display(confusion, labels, 31, "Bayes with feature selection")

    return confusion
Exemplo n.º 14
0
def subtest(model, XL, YL, XT, YT, feature_names):
    """Compare a model before and after removing one feature with RFE.

    Prints the performance of ``model`` fitted on all features, then the
    performance of an RFE selector that keeps all but one feature, and the
    name of the first feature that was not selected.

    Args:
        model: estimator to fit and to wrap in RFE.
        XL, YL: training features and labels.
        XT, YT: test features and labels.
        feature_names: array of names, indexable by a boolean mask.

    Returns:
        Tuple of (reduced training features, reduced test features, names
        of the retained features).
    """
    nfeatures = XL.shape[1]
    # n_features_to_select is keyword-only in current scikit-learn.
    rfe = RFE(model, n_features_to_select=nfeatures - 1)
    print("BEFORE")
    model.fit(XL, YL)
    print_performance(YT, model.predict(XT))
    print("AFTER")
    rfe.fit(XL, YL)
    print_performance(YT, rfe.predict(XT))
    removed_index = np.where(~rfe.support_)[0][0]
    print("REMOVED FEATURE %s" % (feature_names[removed_index],))
    print("")
    return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
    def get_patient_predictions_rfe(self,expression_file,ic50_file,patient_directory,target_features,drug):
        """Predict patient responses with an RFE-wrapped ``self.model``.

        Training and patient data are loaded through ``dfm`` for ``drug``.
        RFE removes roughly 1% of the columns (plus one) per iteration until
        ``target_features`` remain.

        Returns:
            Tuple ``(p_identifiers, predictions, top_features)`` where
            ``top_features`` are the entries of the cell-line/patient gene
            intersection whose position is flagged in the RFE support mask.
        """
        e_data,e_target,p_identifiers,p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(expression_file,ic50_file,patient_directory,1.0,drug)
        # Number of columns in the first row, divided by 100, plus one.
        step_length = len(e_data.tolist()[0]) // 100 + 1

        # n_features_to_select is keyword-only in current scikit-learn.
        model = RFE(self.model, n_features_to_select=target_features, step=step_length)

        model.fit(e_data,e_target)
        predictions = model.predict(p_data)

        all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(dfm.get_cell_line_expression_frame(expression_file),dfm.get_patients_expression_frame(patient_directory))[0]
        top_features = [feature for i, feature in enumerate(all_features) if model.support_[i]]
        return p_identifiers, predictions, top_features
Exemplo n.º 16
0
def lsvm_classifier(authors: array, features: array, feature_max = 1000):
    """Train a linear SVM with RFE feature selection and return its accuracy.

    The data is split 90/10.  RFE removes 50 features per iteration until
    ``feature_max`` remain, and the accuracy of the resulting selector is
    measured on the held-out part.

    NOTE(review): the previous version refitted the plain model on all
    features after running RFE and scored that instead, so the feature
    selection (and ``feature_max``) had no effect on the returned value.

    Args:
        authors: labels, one per sample.
        features: feature matrix.
        feature_max: number of features RFE should keep.

    Returns:
        Accuracy of the RFE-selected model on the held-out split.
    """
    train_labels, test_labels, train_data, test_data = train_test_split(authors, features, test_size=0.10)
    # n_features_to_select and step are keyword-only in current scikit-learn.
    selector = RFE(LinearSVC(), n_features_to_select=feature_max, step=50, verbose=0)
    selector.fit(train_data, train_labels)
    predictions = selector.predict(test_data)
    return accuracy_score(test_labels, predictions)
Exemplo n.º 17
0
 def test(self):
     """Time an RFE logistic-regression fit and compute its test ROC AUC.

     Uses the module-level ``X``, ``Y``, ``X_test`` and ``Y_test`` and keeps
     ``self.feature_num`` features.  The AUC is computed from the hard
     class predictions, not from probabilities.

     Returns:
         Tuple ``(running_time, roc_auc)``; ``running_time`` covers only
         the fit call.
     """
     estimator = LogisticRegression(random_state=0, solver='lbfgs')
     # n_features_to_select is keyword-only in current scikit-learn.
     selector = RFE(estimator, n_features_to_select=self.feature_num, step=1)
     start = timer()
     selector = selector.fit(X, Y.ravel())
     end = timer()
     running_time = end - start
     prediction = selector.predict(X_test)
     fpr, tpr, thresholds = metrics.roc_curve(Y_test,
                                              prediction,
                                              pos_label=1)
     roc_auc = metrics.auc(fpr, tpr)
     print("Train for feature_num=" + str(self.feature_num) + ' done')
     return running_time, roc_auc
Exemplo n.º 18
0
def runRFE(x, y, x_test, y_test, display=False):
    """Fit a 5-feature RFE around the estimator from ``get_best_so_far()``.

    Args:
        x, y: training features and labels; ``y`` must provide ``unique()``.
        x_test, y_test: evaluation features and labels.
        display: when True, hand the confusion matrix to ``cd.display``.

    Returns:
        Confusion matrix of the test predictions, with rows/columns ordered
        by ``y.unique()``.
    """
    print("Decision tree with feature selection")
    dtc = get_best_so_far()
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = RFE(dtc, n_features_to_select=5, step=1)
    selector.fit(x, y)

    y_pred = selector.predict(x_test)
    labels = y.unique()
    # labels is keyword-only in current scikit-learn.
    confusion = confusion_matrix(y_test, y_pred, labels=labels)
    if display:
        cd.display(confusion, labels, 90,
                   "Decision tree with feature selection")

    return confusion
Exemplo n.º 19
0
def model_logistic(X_train, y_train, X_test):
    '''
    Wrap a logistic regression in recursive feature elimination (default
    settings), fit it on the training data and predict the test data.

    Returns a tuple of the predicted values for X_test and the indices of
    the features kept by the selector.
    '''
    selector = RFE(LogisticRegression()).fit(X_train, y_train)
    test_predictions = selector.predict(X_test)
    return test_predictions, selector.get_support(indices=True)
def logisticRegression():
    """Run a 12-feature RFE logistic regression and print a report.

    Loads the data set from ``normalizedRegression_removed.csv`` through
    ``generateDataSet``, prints the RFE support mask and ranking, and prints
    the classification report for predictions on the data used for fitting.
    """
    model = LogisticRegression()
    X, y = generateDataSet("normalizedRegression_removed.csv")
    # create the RFE model and select 12 attributes
    # (n_features_to_select is keyword-only in current scikit-learn)
    rfe = RFE(model, n_features_to_select=12)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)

    expected = y
    predicted = rfe.predict(X)
    # summarize the fit of the model
    print(metrics.classification_report(expected, predicted))
Exemplo n.º 21
0
def model_logistic(training_data, test_data, features, label):
    '''
    Fit a logistic regression inside recursive feature elimination (default
    settings) on the ``features`` columns of the training data, using the
    ``label`` column as target, then predict the test data.

    Returns the predicted values for the test data and the indices of the
    features kept by the selector.
    '''
    train_X = training_data[features]
    train_y = training_data[label]

    selector = RFE(LogisticRegression())
    selector = selector.fit(train_X, train_y)

    test_predictions = selector.predict(test_data[features])
    kept_indices = selector.get_support(indices=True)
    return test_predictions, kept_indices
Exemplo n.º 22
0
def experiment(no_features):
    """Measure training accuracy of RFE for several subset sizes.

    For every size in ``no_features`` an RFE selector around the
    module-level ``logistic_regression`` is fitted on the module-level
    ``x_train``/``y_train``.  The selected indices and the accuracy on that
    same training data are printed.

    Returns:
        List of accuracies, in the order of ``no_features``.
    """
    results = []

    for i in no_features:
        # n_features_to_select is keyword-only in current scikit-learn.
        logistic = RFE(logistic_regression, n_features_to_select=i, step=1)
        logistic = logistic.fit(x_train, y_train)

        print(get_true_indices(logistic.support_))

        y_res = logistic.predict(x_train)
        accuracy = accuracy_score(y_train, y_res.ravel())

        print(accuracy)

        results.append(accuracy)

    return results
Exemplo n.º 23
0
def model_logistic(training_data, test_data, features, label):
    '''
    Fit a logistic regression inside recursive feature elimination (default
    settings) on the ``features`` columns of the training data, using the
    ``label`` column as target, then predict the test data.  The elapsed
    time for fitting and predicting is printed.

    Returns the predicted values for the test data and the indices of the
    features kept by the selector.
    '''
    start = time()
    model = LogisticRegression()
    rfe = RFE(model)
    rfe = rfe.fit(training_data[features], training_data[label])
    predicted = rfe.predict(test_data[features])
    best_features = rfe.get_support(indices=True)
    elapsed_time = time() - start
    # print(...) with one pre-formatted string works on Python 2 and 3.
    print('logistic regression took %s seconds to fit' % elapsed_time)
    return predicted, best_features
Exemplo n.º 24
0
def logistic_model(X_train, X_test, y_train):
    '''
    Select features with RFE (default settings) around a logistic
    regression, fit on the training data and predict the test data.

    Inputs:
        X_train, X_test, y_train (df)

    Output:
        tuple of (predicted values for X_test, indices of the kept features)
    '''
    selector = RFE(LogisticRegression())
    selector = selector.fit(X_train, y_train)

    return selector.predict(X_test), selector.get_support(indices=True)
Exemplo n.º 25
0
def q4():
    """Compare a full linear regression with a 5-feature RFE variant.

    Uses the module-level ``df`` with target column ``Overall`` and an 80/20
    shuffled split.  Prints R^2, MSE, RMSE and the five largest coefficients
    for the full model, then the RFE-selected features with the same metrics
    and the coefficients of a regression refitted on those features.

    Returns:
        List with the names of the five features selected by RFE.
    """

    def show_top_coefficients(names, coefficients):
        # Print the five largest coefficients, labelled by feature name.
        table = pd.DataFrame.from_dict(dict(zip(names, coefficients)),
                                       orient='index',
                                       columns=['coef'])
        print(table.sort_values(by='coef', ascending=False).head(5))

    # drop() already returns a new frame, so no explicit copy is needed.
    X = df.drop(columns=["Overall"])
    y = df["Overall"]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        shuffle=True)

    reg = LinearRegression()
    reg.fit(X_train, y_train)

    print("Model r2 score Linear regression:", reg.score(X_test, y_test))
    y_pred = reg.predict(X_test)
    print('MSE', mse(y_test, y_pred))
    # Square root taken explicitly: the ``squared`` flag is not available
    # in recent scikit-learn releases.
    print('RMSE', mse(y_test, y_pred) ** 0.5)
    show_top_coefficients(X_train.columns, reg.coef_)

    selector = RFE(estimator=reg, n_features_to_select=5, step=1, verbose=0)
    selector = selector.fit(X_train, y_train)
    selected_features5 = list(X_train.columns[selector.get_support()])
    print('\nMost important features RFE', selected_features5)
    print("\nModel r2 score RFE Linear regression selected features:",
          selector.score(X_test, y_test))
    y_pred = selector.predict(X_test)
    print('MSE', mse(y_test, y_pred))
    print('RMSE', mse(y_test, y_pred) ** 0.5)

    # Refit on the reduced features to read their coefficients.
    X_train5 = selector.transform(X_train)
    reg.fit(X_train5, y_train)
    show_top_coefficients(selected_features5, reg.coef_)

    return selected_features5
    def get_predictions_full_CCLE_dataset_rfe(self,expression_file,ic50_file,target_features,drug):
        """Predict every cell line with an RFE-wrapped ``self.model``.

        The selector is trained on the normalized, trimmed data returned by
        ``dfm`` for ``drug`` and removes roughly 1% of the columns (plus
        one) per iteration until ``target_features`` remain.  Predictions
        are then made for the normalized expression frame.

        Returns:
            Tuple ``(cell_lines, predictions, top_features)`` where
            ``cell_lines`` are the columns of the expression frame and
            ``top_features`` are the index entries whose position is flagged
            in the RFE support mask.
        """
        scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        # Number of columns in the first row, divided by 100, plus one.
        step_length = len(scikit_data.tolist()[0]) // 100 + 1

        # n_features_to_select is keyword-only in current scikit-learn.
        model = RFE(self.model, n_features_to_select=target_features, step=step_length)
        model.fit(scikit_data,scikit_target)

        expression_frame = dfm.normalize_expression_frame(dfm.get_cell_line_expression_frame(expression_file))
        cell_lines = expression_frame.columns
        testing_data = dfm.get_scikit_data(expression_frame)

        predictions = model.predict(testing_data)

        top_features = [gene for i, gene in enumerate(expression_frame.index) if model.support_[i]]

        return cell_lines,predictions,top_features
Exemplo n.º 27
0
def train_recursive_feature_elimination(x_train,
                                        y_train,
                                        x_test,
                                        y_test,
                                        feature_num=10):
    """Fit an RFE logistic regression and predict the test data.

    Both classes (0 and 1) get weight 1.  The RFE support mask is printed.

    Args:
        x_train, y_train: training features and labels.
        x_test: features to predict.
        y_test: not used by this function.
        feature_num: number of features RFE should keep.

    Returns:
        Tuple ``(y_predicted, y_prob)`` with the predicted classes and the
        predicted class probabilities for ``x_test``.
    """
    print("-------------RFE Model-------------")
    class_weight = {1: 1, 0: 1}
    model = LogisticRegression(solver='sag', class_weight=class_weight)
    # n_features_to_select is keyword-only in current scikit-learn.
    rfe = RFE(model, n_features_to_select=feature_num)
    # RFE Fit
    rfe.fit(x_train, y_train)
    # RFE Predict
    y_predicted = rfe.predict(x_test)
    y_prob = rfe.predict_proba(x_test)
    print(rfe.support_)
    return y_predicted, y_prob
Exemplo n.º 28
0
def rfe_classifier(method,
                   train_data,
                   train_class,
                   test_data,
                   CV_=3,
                   fraction_feat_to_keep=0.1,
                   LM_params=get_ML_parameters()):
    """Train a classifier, optionally with RFE, and predict the test data.

    On the first call the model settings are written to the log (tracked by
    the module-level ``have_written_params_to_file`` flag).

    When ``CV_ > 1`` the classifier is wrapped in RFE, which keeps
    ``fraction_feat_to_keep`` of the columns (at least one) and removes the
    fraction ``(1 - fraction_feat_to_keep) / CV_`` per iteration.  Otherwise
    the classifier is fitted on all columns.

    NOTE: the ``LM_params`` default is evaluated once, when the function is
    defined.

    Args:
        method: passed to ``set_up_classifier``.
        train_data: training features; must provide ``columns``.
        train_class: training labels.
        test_data: features to predict.
        CV_: see above.
        fraction_feat_to_keep: fraction of columns to keep with RFE.
        LM_params: model parameters passed to ``set_up_classifier``.

    Returns:
        Tuple ``(preds, features_selected, number_of_selected_features)``.
    """
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross validation is used
    if CV_ > 1:
        step_elim = (1 - fraction_feat_to_keep) / CV_
        num_to_keep = int(fraction_feat_to_keep * len(list(train_data)))
        num_to_keep = max(num_to_keep, 1)
        rfecv = RFE(estimator=clf,
                    step=step_elim,
                    n_features_to_select=num_to_keep)

        rfecv.fit(train_data, train_class)
        preds = rfecv.predict(test_data)
        mask = list(rfecv.support_)
        features = train_data.columns
        features_selected = [
            feature for feature, keep in zip(features, mask) if keep
        ]
        num_selected = sum(mask)
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # No elimination took place, so every column counts as selected
        # (this branch previously ended in a NameError).
        features_selected = list(train_data.columns)
        num_selected = len(features_selected)

    return preds, features_selected, num_selected
Exemplo n.º 29
0
def svm(train_set, label_set, test_set, ground_truth):
    """Evaluate a linear SVM restricted to two features chosen by RFE.

    Both feature sets are passed through ``Normalizer`` before use.  The
    selector's score on the test set is printed.

    Args:
        train_set, label_set: training features and labels.
        test_set, ground_truth: evaluation features and true labels.

    Returns:
        Tuple ``(precision, recall, f1)`` of the test predictions.
    """
    train_set = Normalizer().fit_transform(train_set)
    test_set = Normalizer().fit_transform(test_set)
    svm_clf = SVC(C=0.2, kernel='linear')
    rfe = RFE(estimator=svm_clf, n_features_to_select=2, step=1)
    # Recorded results for other subset sizes:
    # n=5,0.6497 n=8,0.66358  n=12,0.66728  n=15,0.66635
    # Recorded support mask / ranking from an earlier run:
    # [True  True False  True False  True  True False  True  True  True  True
    #  True  True False  True]
    # [1 1 5 1 2 1 1 3 1 1 1 1 1 1 4 1]

    rfe.fit(train_set, label_set)

    y = rfe.predict(test_set)
    print(rfe.score(test_set, ground_truth))

    # scikit-learn metrics expect (y_true, y_pred); with the arguments
    # reversed, precision and recall trade places.
    p = precision_score(ground_truth, y)
    r = recall_score(ground_truth, y)
    f = f1_score(ground_truth, y)

    return p, r, f
def automatic_recursive_feature_elimination(df):
    """Select 3 features with RFE for predicting the ``test`` column.

    The ``test`` column is encoded as 1 for "positive" and 0 otherwise, and
    the remaining columns are used as features on a 70/30 split.  Prints
    the feature rankings, the retained features and the test accuracy.
    """
    def encode(value):
        # 1 marks a positive result, 0 everything else
        if value == "positive":
            return 1
        return 0

    X = df.drop("test", axis=1)
    y = df["test"].apply(encode)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Eliminator around a logistic regression, keeping three features
    eliminator = RFE(estimator=LogisticRegression(),
                     n_features_to_select=3,
                     verbose=1)
    eliminator.fit(X_train, y_train)

    # Ranking per feature (a high rank means it was dropped early)
    ranking_by_column = dict(zip(X.columns, eliminator.ranking_))
    print(ranking_by_column)

    # Features that survived the elimination
    print(X.columns[eliminator.support_])

    # Accuracy on the held-out split
    test_predictions = eliminator.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("{0:.1%} accuracy on test set.".format(acc))
Exemplo n.º 31
0
def sc3_multitask(X, Y, Z, feature_list, selection_method, estimator_method, selection_args, estimator_args):
    """Select features with RFE for a multi-task estimator and predict ``Z``.

    Args:
        X: training features.
        Y: training targets; the transpose is used for fitting.
        Z: features to predict.
        feature_list: array of feature names, indexable by a boolean mask.
        selection_method: ``'rfe'`` (case-insensitive) is the only
            supported method.
        estimator_method: key into ``ESTIMATORS``.
        selection_args: must contain ``'n_features'``; all other entries
            are forwarded to ``RFE``.
        estimator_args: keyword arguments for the estimator.

    Returns:
        Tuple ``(W.T, features)`` with the transposed predictions for ``Z``
        and the selected entries of ``feature_list``.

    Raises:
        ValueError: if ``selection_method`` is not ``'rfe'``.
    """
    method = selection_method.lower()
    if method == 'kbest':
        raise ValueError('Cannot use KBest with multi task methods')
    if method != 'rfe':
        raise ValueError('Unsupported selection method: %r' % (selection_method,))

    # Work on copies so the caller's dictionaries are left untouched.
    selection_args = dict(selection_args)
    estimator_args = dict(estimator_args)

    if estimator_method == 'svm':
        # Force the linear kernel for SVMs used with RFE.
        estimator_args['kernel'] = 'linear'

    n_features = min(len(feature_list), selection_args.pop('n_features'))

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
    selector = selector.fit(X, Y.T)
    features = feature_list[selector.support_]
    W = selector.predict(Z)

    return W.T, features
def RFElimination1(X_train1, train1_y, X_test1, test1_y, return_score_p1,
                   name):
    """Run an 18-feature RFE with the module-level ``lm`` estimator.

    Writes the feature ranking to ``Features<name>.csv``, stores the R^2 of
    the training predictions in ``return_score_p1[name]`` and runs
    ``computations`` for the training and the testing data.

    Args:
        X_train1, train1_y: training features (with ``columns``) and target.
        X_test1, test1_y: testing features and target.
        return_score_p1: mutable mapping that receives the training R^2.
        name: label used in the output and as key/file-name suffix.
    """
    print("in " + str(name))
    org = lm
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = RFE(org, n_features_to_select=18, step=1)
    selector = selector.fit(X_train1, train1_y)
    rankingdf = pd.DataFrame(list(zip(X_train1.columns, selector.ranking_)),
                             columns=["features", "ranking"])
    file = "Features" + str(name)
    rankingdf.to_csv(file + ".csv")
    pred = selector.predict(X_train1)
    sc = r2_score(train1_y, pred)
    return_score_p1[name] = sc
    print("Training Dataset")
    computations(selector, X_train1, train1_y)
    print("Testing Dataset")
    computations(selector, X_test1, test1_y)
Exemplo n.º 33
0
def logisticReg():
    """Explore RFE subset sizes 1..12 with a random forest on the churn data.

    Loads ``train.csv`` through ``getTrainingData``, oversamples with SMOTE
    and standardises the features.  For each subset size an RFE selector
    around a 40-tree random forest is fitted, and its accuracy on the
    training data is recorded.  Prints all accuracies and the support mask
    of the 5-feature selector.
    """
    train = getTrainingData('train.csv',
                            visualize=False,
                            discrete=False,
                            encoding=True)
    X_train = train.drop(['Exited'], axis=1)
    print(X_train.columns.values)
    y_train = train.Exited
    #
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)
    scale = StandardScaler().fit(X_train)
    X_train = scale.transform(X_train)
    # ---- RFE -----
    rfes = []
    scores = []

    for n in range(1, 13):
        tree = RandomForestClassifier(n_estimators=40)
        # n_features_to_select and step are keyword-only in current
        # scikit-learn.
        rfe = RFE(tree, n_features_to_select=n, step=1)
        rfe.fit(X_train, y_train)
        rfes.append(rfe)
        yHat = rfe.predict(X_train)
        scores.append(accuracy_score(y_train, yHat))
    print(scores)
    # index 4 is the selector that keeps 5 features
    print(rfes[4].support_)

    # ['CreditScore' 'Age' 'Tenure' 'Balance' 'NumOfProducts' 'HasCrCard'
    #  'EstimatedSalary' 'France' 'Germany' 'Female' 'Active']
    # [False  True False  True False False False False  True  True  True]
    # this is the setting that's reasonable for both accuracy_score and f1_score
    # ['Age' 'NumOfProducts' 'HasCrCard' 'Germany' 'Active']

    return


# logisticReg()
Exemplo n.º 34
0
def RFE_linear_regression(X, y, n_features = 6):
    """Evaluate RFE with Lasso, Ridge and ElasticNet over k-fold splits.

    For each regularised estimator, an RFE selector keeping ``n_features``
    is fitted on every fold from ``get_kfold_train_test``.  The selector
    with the lowest out-of-sample MAD and the one with the lowest
    out-of-sample MSE are kept, their errors are printed, and an E_out
    value (share of predictions off by more than 0.8 on the best-MAD fold)
    is printed.

    Args:
        X, y: full feature matrix and target.
        n_features: number of features RFE should keep.

    Returns:
        Dict keyed by estimator name with tuples
        ``(predictions_best_mad, mad_on_all_samples,
        predictions_best_mse, mse_on_all_samples)``, where predictions are
        made for all of ``X``.
    """
    # (name, estimator) pairs to evaluate
    estimators = [('RFE L1 Regularizer', Lasso(alpha = 0.2)),
                  ('RFE L2 Regularizer', Ridge(alpha = 0.8)),
                  ('RFE Elastic Net Regularizer', ElasticNet(alpha = 0.9))]
    ets = {}
    for name, estimator in estimators:
        # best selector / error per criterion (et1, er1: MAD; et2, er2: MSE)
        et1 = None
        et2 = None
        er1 = None
        er2 = None
        finaly_test = None  # test fold of the best-MAD selector, for E_out
        finalX_test = None
        for X_train, y_train, X_test, y_test in get_kfold_train_test(X, y):
            # n_features_to_select is keyword-only in current scikit-learn.
            selector = RFE(estimator, n_features_to_select=n_features, step=1)
            selector.fit(X_train, y_train)
            preds = selector.predict(X_test)
            # out-of-sample MAD and MSE
            error1 = mean_absolute_dev(preds, y_test)
            error2 = mean_squared_err(preds, y_test)
            # keep the selector (and its test fold) with the lowest MAD
            if er1 is None or error1 < er1:
                et1 = copy.deepcopy(selector)
                er1 = error1
                finaly_test = copy.deepcopy(y_test)
                finalX_test = copy.deepcopy(X_test)
            # keep the selector with the lowest MSE
            if er2 is None or error2 < er2:
                et2 = copy.deepcopy(selector)
                er2 = error2
        print(name, ':\n', 'MAD (out of sample):', '%.4f' % er1, '; MSE (out of sample):', '%.4f' % er2)
        # use the best selectors (by MAD and by MSE) to predict all samples
        y_preds1 = et1.predict(X)
        y_preds2 = et2.predict(X)
        # calculate E_out on the test fold of the best-MAD selector
        finaly_preds = et1.predict(finalX_test)
        tol = 0.8
        count = 0
        for i in range(len(finaly_test)):
            if abs(finaly_preds[i]-finaly_test[i]) <= tol:
                count += 1
        print('With tolerance ', '%.4f' % tol, ', E_out is ', '%.4f' %(1 - count/len(finaly_test)))
        # store predictions for all samples with their MAD / MSE under the estimator name
        ets[name] = (y_preds1, mean_absolute_dev(y_preds1, y), y_preds2, mean_squared_err(y_preds2, y))
    return ets
Exemplo n.º 35
0
print("Test Accuracy: ", test_score)

MetricsForMulticlass(y_test, pred)

accuracy = metrics.accuracy_score(y_test, pred)
imp = improvement(group, accuracy)
print("Accuracy: ", round(accuracy, 2)) #0.62
print("Improvement: ", round(imp, 2)) #14.3

#----------------------------------------------------------------------------
# RECURSIVE FEATURES ELIMINATION

from sklearn.feature_selection import RFE
estimator = rfc(max_depth=4, n_estimators=500, random_state=42, n_jobs=-1)
# n_features_to_select is keyword-only in current scikit-learn.
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X_train, y_train)

# Persist the fitted selector.
with open("rfe_new_features38", "wb") as f:
    pickle.dump(selector, f, pickle.HIGHEST_PROTOCOL)

#selector.support_
#selector.ranking_

# The selector is already fitted above; refitting it is not needed.
mod = selector
prediction = selector.predict(X_test)

# Score the RFE predictions (previously the stale `pred` was scored again).
accuracy = metrics.accuracy_score(y_test, prediction)
imp = improvement(group, accuracy)
print("Accuracy: ", round(accuracy, 2))
print("Improvement: ", round(imp, 2))
Exemplo n.º 36
0
    print(confmat)

    return tree

# Build and test one tree per candidate depth/feature count.
for i in range(1, len(wine.columns)):
    print(i)
    make_and_test_tree(wine_train, wine_test, i)

tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=len(wine.columns)-1, random_state=0)
selector = RFE(estimator=tree, n_features_to_select=3, step=1) # limit this to the three best features
# NOTE(review): the feature slice stops two columns before the end while the
# label is the last column -- presumably the second-to-last column is another
# target encoding; confirm.
selector = selector.fit(wine_val_train.iloc[:, 0:len(wine_val_train.columns)-2], wine_val_train.iloc[:,[len(wine_validation.columns)-1]])
selector.support_
selector.ranking_
# predict() takes the features only; the labels must not be passed.
RFE_tree = selector.predict(wine_validation.iloc[:,0:len(wine_validation.columns)-2])

wine4 = wine[['residual.sugar',
 'free.sulfur.dioxide',
 'density',
 'qualBins2']]

# create training/testing (to use at the end)
wine_train, wine_test = train_test_split(wine4, test_size=0.3, random_state=0)

# create validation set (out of the training set)
wine_val_train, wine_validation = train_test_split(wine_train, test_size=0.3, random_state=0)

i = 1
while i <= len(wine4.columns)-1:
    print(i)
Exemplo n.º 37
0
	print "Predicted pos %d neg %d" %(p,n)
	p=int(0)
	n=int(0)
	for x in ycv:
	    if x==1:
		p+=1
	    else:
		n+=1
	print "Actual pos %d neg %d" %(p,n)
#if "":
# Load the test CSV (first row skipped), predict with the previously fitted
# `rfe`, print every prediction and then the counts of predictions equal to 1
# (p) and of all other predictions (n).
# NOTE(review): the file object passed to genfromtxt is never closed, and the
# `skiprows` keyword was replaced by `skip_header` in newer NumPy releases.
f=np.genfromtxt(open("CAX_COPD_TEST_data.csv","rb"),delimiter=",",skiprows=1)
mat=np.matrix(f)
# Columns from index 2 onward are used as the feature matrix.
Xtest=mat[:,2:]
p=int(0)
n=int(0)
pred=rfe.predict(Xtest)
for x in pred:
    print x
    if x==1:
	p+=1
    else:
	n+=1
print p,n
if "":
	with open('CAX_COPD_SubmissionFormat.csv','r') as csvinput:
	    with open('Coutput.csv', 'w') as csvoutput:
		writer = csv.writer(csvoutput, lineterminator='\n')
		reader = csv.reader(csvinput)

		all = []
		row = next(reader)
results = []
# Recursive feature elimination keeping the 3 best predictors.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X, y)
selector.support_
# Out[61]: array([ True,  True,  True, False, False, False])
# It seems the first three predictors are okay but not the last three.
# One can check whether dropping the last three improves on the model with all 6.
selector.ranking_
# Out[65]: array([2, 1, 3, 6, 5, 4])

# Refit RFE for every possible number of retained features and record, per
# run: [feature count, selector.score on the training data, mean squared
# error on the training data, comma-joined names of the retained features].
for i in range(1, X.shape[1] + 1):
    selector = RFE(estimator, n_features_to_select=i, step=1)
    selector.fit(X, y)
    r2 = selector.score(X, y)
    selected_features = features[selector.support_]
    msr = mean_squared_error(y, selector.predict(X))
    results.append([i, r2, msr, ','.join(selected_features)])

results

'''
results 
Out[68]: 
[[1, 0.47017552557905884, 2.5448877365932985, 'Email'],
 [2, 0.8987844810699489, 0.48616503259788457, 'Internet,Email'],
 [3, 0.9008606156394956, 0.47619280658599406, 'Internet,Email,Blog'],
 [4,
  0.9051564044419049,
  0.45555899148299284,
  'Internet,Email,Blog,SmartPhone'],
 [5,
Exemplo n.º 39
0
# Parse "key:value" tokens into one feature dict per input entry, build a
# dense test matrix from them, predict with the fitted `rfe` and print the
# predictions mapped back to the '-1' / '+1' labels.
for entry in inp:
    # The first n tokens of each entry are split on ':' into key and value.
    for i in range(n):
	key,val=entry[i].split(':')
	feature[key]=val
    # Convert keys to int and values to float.
    # NOTE(review): `feature` is rebound here and reused for the next entry,
    # so from the second entry on it holds both int keys (previous row) and
    # str keys (current row); which value survives for a given key depends on
    # dict iteration order. A fresh dict per entry looks intended - confirm
    # against how `feature` is initialised above this fragment.
    feature={int(key):float(val) for key,val in feature.items()}
    fvec.append(feature.copy())
for i in range(mtest):
    #print fvec[i]
    # One row per entry, values ordered by ascending feature key.
    row=[]
    for key in sorted(fvec[i]):
	row.append(fvec[i][key])
    row=np.matrix(row)
    # Stack the rows into a single matrix, one row at a time.
    if i==0:
        test=row
    else:
        test=np.r_[test,row]
	#train=np.r_(train,row)
print test
# Map the integer class labels 0 / 1 back to the original '-1' / '+1' strings.
orig=['-1','+1']
label2orig={i:orig[i] for i in range(2)}

pred=rfe.predict(test)
print pred
# Hard-coded label list printed through the same mapping.
cl=[0,1,1,0,1]
for i,val in enumerate(cl):
    print label2orig[val] 

# Print each identifier from `iden` followed by its predicted label.
for i,val in enumerate(pred):
    print iden[i],
    print label2orig[val] 
Exemplo n.º 40
0
    # Transform the training data
    #scaler = MinMaxScaler(feature_range=(1, 2)).fit(data_features)
    #data_features = scaler.transform(data_features)
    data_features = transform_data(data_features)

    # Fit the regression model
    alphas = np.logspace(-5,3,30)
    ridge_regressor = linear_model.RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)
    rfe = RFE(estimator=ridge_regressor, n_features_to_select=14, step=1)
    rfe.fit(data_features[0:600], data_values[0:600])
    print "alpha: " + str(rfe.estimator_.alpha_)
    print "intercept: " + str(rfe.estimator_.intercept_)
    print "R-square: " + str(rfe.score(data_features[600:], data_values[600:]))

    # Compute MSE
    train_pred = rfe.predict(data_features)
    mse = ((train_pred[600:] - data_values[600:])**2).mean(axis=0)
    print "MSE: " + str(mse)

    # Visualize predicted values on train data
    plt.plot(train_pred[600:], label='Predicted')
    plt.plot(data_values[600:], label='Actual')
    plt.legend(loc='upper right')
    plt.show()

    # Load the test data
    validate_test = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(1,15))
    validate_test_ind = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(0,1))
    
    # Transform the test data
    #validate_test = scaler.transform(validate_test)
# Hold-out split: the first n_split rows train the model, the rest test it.
n_split = 1800
X_train, X_test = X[:n_split], X[n_split:]
Y_train, Y_test = y[:n_split], y[n_split:]

numFeatures = 40

model = ExtraTreesClassifier()

# Recursive feature elimination down to numFeatures features.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument.
rfe = RFE(model, n_features_to_select=numFeatures)
rfe = rfe.fit(X_train, Y_train)

temp = rfe.score(X_test, Y_test)
predictionOfPrelim = rfe.predict(prelimData)

featureRanking = rfe.ranking_
#Best ExtraTrees Accuracy is:  [400, 0.98902777777777773, 40]
print("ExtraTrees Accuracy is: ", temp)

# Compare the predictions with the known preliminary classes:
# h[k] is 1 where prediction k matches the reference label, else 0.
prelimClasses = np.loadtxt("prelim-class.txt")
# Explicit check rather than `assert`, which is stripped under `python -O`
# and would let zip() below silently truncate to the shorter sequence.
if len(prelimClasses) != len(predictionOfPrelim):
    raise ValueError(
        "prelim-class.txt has %d labels but %d predictions were made"
        % (len(prelimClasses), len(predictionOfPrelim))
    )
h = [
    1 if expected == predicted else 0
    for expected, predicted in zip(prelimClasses, predictionOfPrelim)
]

thefile = open('Result_ExtraTrees_prelim.txt', 'w')
Exemplo n.º 42
0
		#print trainind,cvind
		#itrain=mat[:1100,0]
		#for x in X:
		 #  print x
		model = LogisticRegression()
		#from sklearn.svm import SVC
		#model=SVC(kernel="linear",C=1)
		rfe = RFE(model, k)
		rfe = rfe.fit(Xtrain, ytrain)
		#print(rfe.support_)
		#print(rfe.ranking_) This is one of the results expected
		Xcv=cv[:,2:]
		ycv=cv[:,1]
		p=int(0)
		n=int(0)
		pred=rfe.predict(Xcv)
		#print "pred set before matrix %s" % str(np.shape(pred))
		#print "ycv set before matrix %s" % str(np.shape(ycv))
		J+=float(f1_score(ycv,pred))
		count+=1
        f1score=float(J/count)
	print "No of features %d f1_score %f" % (k,f1score)
        if f1score>F1max:
		F1max=f1score
		n_features=k
print n_features,F1max
# Refit on the full data set with the feature count chosen above:
# column 1 is the target, columns from index 2 onward are the features.
Xtrain=f[:,2:]
ytrain=f[:,1]
model = LogisticRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument, and older releases accept the keyword too.
rfe = RFE(model, n_features_to_select=n_features)
rfe = rfe.fit(Xtrain, ytrain)
Exemplo n.º 43
0
# print the classification (a ratio) for 
# the samples in our original folder

if fit_data:
    # For every sample set, print the sum of the random forest's predictions
    # divided by the number of rows in the set.
    # NOTE(review): this is the fraction of "live" rows only if the labels
    # are 0/1 - confirm against the training labels.
    for i, sample in enumerate(testData):
        pred = rf.predict(sample)
        live = sum(pred)
        ratio = live/len(sample)
        # Single '%': no %-formatting is applied to this string, so the
        # original "%%live" was printed with a doubled percent sign.
        print("%live: ", ratio, "| name: ", dataName[i])

##################
# RANDOM FOREST WITH RFE (NO CV)

if fit_data_rfe:
    # For every sample set, print the sum of the RFE model's predictions
    # divided by the number of rows in the set.
    # NOTE(review): this is the fraction of "live" rows only if the labels
    # are 0/1 - confirm against the training labels.
    for i, sample in enumerate(testData):
        pred = rfe.predict(sample)
        live = sum(pred)
        ratio = live/len(sample)
        # Single '%': no %-formatting is applied to this string, so the
        # original "%%live" was printed with a doubled percent sign.
        print("%live: ", ratio, "| name: ", dataName[i])

    # rfe.ranking_ holds integer ranks (1 == retained feature), not
    # importances; argsort therefore lists the retained features first.
    importances = rfe.ranking_
    indices = np.argsort(importances)
    print("Feature ranking:")

    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

##################
# RANDOM FOREST WITH RFE WITH CV

if (fit_data_rfe_cv):
Exemplo n.º 44
0
def main(args=None):
    """Run the epitope-site prediction pipeline from command-line arguments.

    Parses the options, builds (or reuses) an alignment for the requested
    antibodies, labels and vectorizes it, and then, for every feature count
    in ``ARGS.FEATURE_GRID``, cross-validates a linear SVM combined with
    either RFE or MRMR feature selection. The best ``Results`` object (as
    decided by ``results_ > results``) is dumped to ``ARGS.OUTPUT``.

    Args:
        args: list of command-line argument strings; defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        An empty dict when ``ARGS.TEST`` is set (only ``test_discrete`` is
        run); otherwise the best ``Results`` object.
    """
    init_log()

    if args is None:
        args = sys.argv[1:]

    # Turn numpy floating-point warnings (overflow, invalid, ...) into errors.
    np.seterr(all='raise')

    # do some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)

    # Each *_args helper takes the parser and returns it; presumably each one
    # registers its own group of options.
    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # test mode: run the discrete test only and return early
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    # Base name for the alignment file: antibodies, encoding, clonal flag,
    # data source and package version.
    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]

    # N, then any non-P character, then T or S, then any non-P character.
    # NOTE(review): re_I is presumably re.I (case-insensitive) - confirm import.
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # The labeler returns the (possibly modified) alignment, the class labels
    # and a threshold; `threshold` is not used again in this function.
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
        )
    alignment, y, threshold = ylabeler(alignment)

    # NOTE(review): this local name shadows the builtin `filter` for the rest
    # of the function.
    filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )

    # Feature extractors; the per-site vectorizer is always used, the others
    # are enabled by their command-line switches.
    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS'))
            )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    # NOTE(review): class_weight='auto' was removed from newer scikit-learn
    # releases in favour of 'balanced'; this code targets the older API.
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1
        )

    # Outer loop: one full cross-validation per candidate feature count;
    # `results` keeps the best run seen so far.
    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        # NOTE(review): StratifiedKFold(y, n_folds) is the pre-0.18
        # scikit-learn cross_validation call signature.
        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            # Degenerate fold: record the true labels as their own prediction
            # with empty coefficients and skip fitting.
            # NOTE(review): whether the idxs are boolean masks or integer
            # indices is not visible here, so the exact meaning of `.sum() < 1`
            # needs confirming; add() is also called without `ranks` here.
            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            # Feature selection is either RFE wrapped around the grid-searched
            # SVM, or an MRMR step followed by that SVM in a pipeline.
            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP
                    )
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR
                    )
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            # Pull out the fitted selector and the best SVM found by the grid
            # search so their ranking/support/coefficients can be recorded.
            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        # Keep this run if it is the first one or compares greater than the
        # best so far (ordering is defined by the Results class).
        if results is None or results_ > results:
            results = results_

    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
Exemplo n.º 45
0
Y_target_BR = BR.predict(X_test)

# Random Forest Regressor: fit, rank the features by importance, predict.
RFR = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=0)
RFR = RFR.fit(X_train, y_train)
ranks["RFR"] = ranking(RFR.feature_importances_, colnames)
#print(ranks["RFR"])

Y_target_RFR = RFR.predict(X_test)

# Recursive Feature Elimination on Random Forest Regressor (keep 10 features)
RFE_RFR = RFE(RFR, n_features_to_select=10, step=1)
RFE_RFR.fit(X_train, y_train)

Y_target_RFE_RFR = RFE_RFR.predict(X_test)

# Extra Trees Classifier
# NOTE(review): a classifier is fitted on the same y_train as the regressors
# above - confirm the target is suitable for classification.
ETC = ExtraTreesClassifier(n_estimators=10,
                           max_depth=None,
                           min_samples_split=2,
                           random_state=0)
ETC = ETC.fit(X_train, y_train)
ranks["ETC"] = ranking(np.abs(ETC.feature_importances_), colnames)

Y_target_ETC = ETC.predict(X_test)

# Recursive Feature Elimination on Decision Tree Regressor (keep 10 features)
# Bound to RFE_DTR, mirroring RFE_RFR above: assigning the instance to the
# name `RFE` would replace the imported RFE class and break every later
# RFE(...) call.
RFE_DTR = RFE(DTR, n_features_to_select=10, step=1)
RFE_DTR.fit(X_train, y_train)
Exemplo n.º 46
0
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',

    print

    return W, features