Example #1
def perform_rfe(model, train, test, filename, to_remove=None):

    if to_remove is None:
        to_remove = floor(0.3 * len(train.columns))

    X = train.drop(TARGET, axis=1)
    y = train[TARGET]

    model.fit(X, y)
    preds = model.predict_proba(test)[:, 1]
    build_results_csv(filename,
                      X.columns,
                      send_submission("doesnt_matter.csv", preds),
                      create_file=True)
    sleep(3)

    for i in range(to_remove):
        rfe = RFE(model, n_features_to_select=len(X.columns) - 1).fit(X, y)

        preds = rfe.predict_proba(test)[:, 1]

        X = X.iloc[:, rfe.get_support()]
        test = test.iloc[:, rfe.get_support()]

        results = build_results_csv(
            filename, X.columns, send_submission("doesnt_matter.csv", preds))
        sleep(3)

    return results
Example #2
    def SelectFeatureByRFE(self):
        rfe_selector = RFE(estimator=LogisticRegression(),
                           n_features_to_select=self.k,
                           step=10,
                           verbose=5)
        rfe_selector.fit(self.X.values, self.y)
        rfe_support = rfe_selector.get_support(indices=True)
        _ = rfe_selector.get_support()
        save_feat = []
        for i in list(rfe_support):
            save_feat.append(self.X.columns[i])

        return save_feat, _
Example #3
 def select_features_RFE_LR(X, y, columns, iteration):
     selection = RFE(estimator=LogisticRegression(max_iter=iteration)).fit(
         X, y)
     selected_features = np.array(columns)[selection.get_support()]
     print(" Features selected by RFE from logistic regression :{}".format(
         selected_features))
     return selected_features
Example #4
def feature_selection_LR():

    from sklearn.feature_selection import RFE

    rfe_selector = RFE(estimator=RandomForestClassifier(),
                       n_features_to_select=30,
                       step=5,
                       verbose=5)
    rfe_selector.fit(X_train_scaled, y_train)

    y_pred = rfe_selector.predict(X_test_scaled)
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]

    rfe_support = rfe_selector.get_support()
    rfe_feature = X_train[predictors].loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)
    # Print model report:
    print("\nModel Report")
    #print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, y_pred_train))
    print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    #print('Train error: {:.3f}'.format(1 - metrics.accuracy_score(y_train, y_pred_train)))
    print('Test error: {:.3f}'.format(1 -
                                      metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))
    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('========Confusion Matrix==========')
    print("          Rejected    Accepted")
    print('Rejected     {}      {}'.format(c_matrix[0][0], c_matrix[0][1]))
    print('Accepted     {}      {}'.format(c_matrix[1][0], c_matrix[1][1]))
Example #5
def do_learning(X_training, Y_training, X_test, Y_test, reference_dic, model_class):

    '''
    credit: Juan Arroyo-Miranda & Dani Alcala

    With training and testing data, select the best features with the
    recursive feature elimination method, then fit a classifier and return a
    tuple of (expected test labels, predicted test labels, names of the
    selected features, accuracy on the test data).
    '''
    
    model = model_class
    # Recursive Feature Elimination
    rfe = RFE(model)
    rfe = rfe.fit(X_training, Y_training)
    
    best_features = rfe.get_support(indices=True)

    best_features_names = [reference_dic[i] for i in best_features]

    predicted = rfe.predict(X_test)
    expected = Y_test

    accuracy = accuracy_score(expected, predicted)
    return (expected, predicted, best_features_names, accuracy)
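A minimal usage sketch for do_learning (hedged: the breast-cancer split, feature-name list, and LogisticRegression estimator below are illustrative assumptions; RFE and accuracy_score are assumed to be imported as in the snippet above):

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, random_state=0)
# reference_dic maps feature index -> feature name; a plain list supports that lookup
expected, predicted, best_names, acc = do_learning(
    X_tr, y_tr, X_te, y_te, list(data.feature_names), LogisticRegression(max_iter=5000))
print(acc, best_names)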
Example #6
def feature_selection(X_res, y_res, xcol, FEATURE_NUM):

    ###################### feature selections ###########################
    print(
        '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FEATURE SELECTION >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
    )
    rfc = RandomForestClassifier()
    # fit random forest classifier on the training set
    y_res = y_res.reshape(-1, 1)  # reshape the labels
    rfc.fit(X_res, y_res)
    # extract important features
    score = np.round(rfc.feature_importances_, 3)
    importances = pd.DataFrame({'feature': xcol, 'importance': score})
    importances = importances.sort_values('importance',
                                          ascending=False).set_index('feature')
    # plot importances
    plt.rcParams['figure.figsize'] = (11, 4)
    importances.plot.bar()

    # create the RFE model and select 10 attributes
    rfe = RFE(rfc, n_features_to_select=FEATURE_NUM)
    rfe = rfe.fit(X_res, y_res)

    # summarize the selection of the attributes
    feature_map = [(i, v)
                   for i, v in itertools.zip_longest(rfe.get_support(), xcol)]
    selected_features = [v for i, v in feature_map if i == True]

    return selected_features
Example #7
 def RFE(self):
     X_norm = MinMaxScaler().fit_transform(self.df_X)
     rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=10, step=10, verbose=5)
     rfe_selector.fit(X_norm, self.y)
     self.rfe_support = rfe_selector.get_support()
     self.rfe_feature = list(self.df_X.loc[:,self.rfe_support].columns)
     return pd.DataFrame(self.rfe_support, self.rfe_feature)
Example #8
    def feature_selection_RFE(self, data_table, X, y, input_file):

        no_best_features = int(len(X[1, :]) / 10)

        # This will call the svc parameter tuning function of MLTwithPython3
        # svc_best_model = MLT.Execute_machine_learning().parameter_tuning_SVM(X, y, 5)
        class_weight_values = class_weight.compute_class_weight(
            'balanced', classes=np.unique(y), y=y)
        # data into dictionary format..
        class_weights = dict(zip(np.unique(y), class_weight_values))

        RF = RandomForestClassifier(n_estimators=100,
                                    random_state=42,
                                    n_jobs=-1,
                                    class_weight=class_weights)
        recursive_feature_selection = RFE(
            estimator=RF, n_features_to_select=no_best_features).fit(X, y)

        idxs_selected = recursive_feature_selection.get_support(indices=True)

        selected_data_table = Execute_feature_selection(
        ).generate_dataframe_feature_selection(idxs_selected, data_table)
        selected_data_table.to_csv("Feature_selection_RF_RFE_" +
                                   str(input_file),
                                   index=False)
Example #9
def select_from_tree_recursively(x_data, y_data, select_k):
    print(f"Applying tree derived importance filter")
    print(
        f"cat variables before tree derived recursive importance filter  {x_data.select_dtypes(include='object').shape}"
    )
    print(
        f"num variables before tree derived recursive importance filter  {x_data.select_dtypes(include='number').shape}"
    )

    num_cols = x_data.select_dtypes(include='number').columns

    temp = x_data[num_cols]

    select_ = RFE(estimator=RandomForestRegressor(n_estimators=100),
                  n_features_to_select=10)
    select_.fit(temp, y_data)

    cols_to_keep = temp.columns[select_.get_support()]
    cols_to_drop = [x for x in num_cols if x not in cols_to_keep]

    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)

    print(
        f"cat variables after tree derived recursive importance filter  {x_data.select_dtypes(include='object').shape}"
    )
    print(
        f"num variables after tree derived recursive importance filter  {x_data.select_dtypes(include='number').shape}"
    )

    return x_data
Example #10
def feature_select(x_train, y_train, method='iv', kb=100, rfe=30):
    if method == 'iv':
        method = mutual_info_classif
    elif method == 'f':
        method = f_classif

    # chi2
    fn = x_train.columns
    selector1 = SelectKBest(chi2, k=kb)
    selector1.fit(x_train, y_train)

    # information value
    selector2 = SelectKBest(method, k=kb)
    selector2.fit(x_train, y_train)
    left_features = list(
        set(fn[selector2.get_support()].tolist() +
            fn[selector1.get_support()].tolist()))

    # RFE
    _X_tmp = x_train[left_features]
    fn = _X_tmp.columns
    clf = LogisticRegression(penalty='l2', C=0.2)
    selector = RFE(estimator=clf, n_features_to_select=rfe)
    selector.fit(_X_tmp, y_train)

    left_features = fn[selector.get_support()].tolist()
    x_train = x_train[left_features]
    return left_features
Example #11
def in46():
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    select=RFE(RandomForestClassifier(n_estimators=100,random_state=42),n_features_to_select=40)

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len(cancer.data), 50))
    #   print(cancer.data.shape) (569, 30)
    x_w_noise = np.hstack([cancer.data, noise])
    x_train, x_test, y_train, y_test = train_test_split(x_w_noise, cancer.target, random_state=0, test_size=0.5)

    select.fit(x_train,y_train)

    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.xlabel('sample index')
    plt.show()

    x_train_rfe=select.transform(x_train)
    x_test_rfe=select.transform(x_test)

    from sklearn.linear_model import LogisticRegression
    print(LogisticRegression().fit(x_train, y_train).score(x_test, y_test))
    print(LogisticRegression().fit(x_train_rfe, y_train).score(x_test_rfe, y_test))
Example #12
def select_features(X, y, n_features, attributes, folder):
    string_attributes = '-'.join(attributes)
    file_features = f"{folder}/features/selected_features_" + string_attributes + "_" + str(
        n_features) + ".pkl"
    if os.path.exists(file_features):
        with open(file_features, "rb") as f:
            selected_features = pickle.load(f)
    else:
        estimator = DecisionTreeClassifier()
        rfe = RFE(estimator=estimator,
                  n_features_to_select=n_features)  # only take 0.05
        rfe.fit(X, y)

        # alternatively RFECV
        # rfe = RFECV(estimator=DecisionTreeClassifier(), step=0.01, scoring='accuracy', min_features_to_select=10)
        # rfe.fit(X, y)

        # return the selected features
        selected_features = X.columns[rfe.get_support()]

        # save selected features to avoid retraining again
        os.makedirs(f'{folder}/features', exist_ok=True)
        with open(
                f"{folder}/features/selected_features_" + string_attributes +
                "_" + str(n_features) + ".pkl", "wb") as f:
            pickle.dump(selected_features, f)
    return selected_features
Example #13
def RFE_filter(df: DataFrame,
               y: Series,
               col_list: List,
               estimator: Any,
               keep: float = 0.5,
               step: int = 1) -> List:
    """
    Recursive feature elimination.
    :param df:
    :param y:
    :param col_list:
    :param estimator: the estimator (learner) to use
    :param keep: number (int) or fraction (float) of features to keep
    :param step: number of features removed at each iteration
    :return:
    """
    if keep >= 1 and isinstance(keep, float):
        raise Exception('When keep >= 1, pass an integer')
    if isinstance(keep, float):
        keep = int(np.ceil(len(col_list) * keep))

    selector = RFE(estimator, n_features_to_select=keep, step=step)
    selector = selector.fit(df[col_list], y)
    mask = selector.get_support()

    res = np.array(col_list)[mask].tolist()

    return res
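A minimal usage sketch for RFE_filter (hedged: the breast-cancer DataFrame and LogisticRegression estimator are illustrative assumptions; the RFE and numpy imports used by the function are assumed to be in scope). With keep=0.3 the fraction becomes int(np.ceil(30 * 0.3)) = 9 retained columns:

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
kept = RFE_filter(df, y, col_list=list(df.columns),
                  estimator=LogisticRegression(max_iter=5000),
                  keep=0.3, step=1)
print(kept)  # names of the 9 surviving features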
Example #14
    def wrapper(self, X, y, k='all'):
        """
        Wrapper
        documentation for RFE: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

            Normalization: depend on the used model; yes for LR
            Impute missing values: depend on the used model; yes for LR
        """
        X_norm = MinMaxScaler().fit_transform(X)

        if not k == 'all':
            if k > len(X.columns.tolist()):
                raise NameError(
                    "Number of selected features (k) exceeds the total number of features"
                )
        else:
            k = len(X.columns.tolist())

        rfe_selector = RFE(estimator=LogisticRegression(),
                           n_features_to_select=k,
                           step=10,
                           verbose=0)
        rfe_selector.fit(X_norm, y)

        rfe_support = rfe_selector.get_support()
        rfe_feature = X.loc[:, rfe_support].columns.tolist()
        print(str(len(rfe_feature)), 'selected features')

        return rfe_support, rfe_feature, rfe_selector
Example #15
def get_top_features(train_x,train_Y ):
    RFC = RandomForestClassifier()
    rfe = RFE(RFC, n_features_to_select=50)
    rfe = rfe.fit(train_x, train_Y)
    feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), train_x.columns)]
    selected_features = [v for i, v in feature_map if i==True]
    return selected_features
Example #16
def rf_rfe(df):
    X,y = df.iloc[:,:-1],df.iloc[:,-1]
    model = RandomForestClassifier()
    features_no = X.columns
    rfe = RFE(model, n_features_to_select=len(X.columns) // 2)
    rfe.fit(X,y)
    return X.columns.values[rfe.get_support()].tolist()
Example #17
def df5():
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    import time
    from sklearn.linear_model import LogisticRegression

    select = RFE(RandomForestClassifier(n_estimators=100,random_state=42),
                 n_features_to_select=40)
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len((cancer.data)), 50))
    X_w_noise = np.hstack([cancer.data, noise])
    X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)

    start_time = time.time()
    select.fit(X_train,y_train)
    print("Estimated execution time: {} seconds".format((time.time()-start_time)))

    X_train_rfe = select.transform(X_train)
    X_test_rfe = select.transform(X_test)

    score = LogisticRegression().fit(X_train_rfe,y_train).score(X_test_rfe,y_test)
    print("Score: {:.3f}".format(score))

    mask = select.get_support()
    plt.matshow(mask.reshape(1,-1),cmap='gray_r')
    plt.xlabel("Sample index")
    plt.show()
Example #18
class RFE_RandomForestRegPrim(primitive):
    def __init__(self, random_state=0):
        super(RFE_RandomForestRegPrim, self).__init__(name='RFE_RandomForestReg')
        self.id = 44
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Feature ranking with recursive feature elimination with Random-Forest regressor. Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached."
        self.hyperparams_run = {'default': True}
        self.selector = RFE(RandomForestRegressor())
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
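The description string above summarizes how RFE ranks and prunes features. As a hedged, framework-free sketch of that loop with a Random-Forest regressor (plain scikit-learn on synthetic data; the primitive/handle_data plumbing of this class is not assumed):

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

X, y = make_regression(n_samples=200, n_features=8, n_informative=3, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

# The estimator's feature_importances_ drive the recursive pruning.
selector = RFE(RandomForestRegressor(random_state=0), n_features_to_select=4).fit(X, y)
kept_cols = list(X.columns[selector.get_support()])
print(kept_cols)
print(pd.DataFrame(selector.transform(X), columns=kept_cols).shape)  # (200, 4)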
Example #19
def RFE_selector(estimator, n_features_to_select, X_data, Y_data):
    columns = X_data.columns
    selector = RFE(estimator=estimator, n_features_to_select=n_features_to_select)
    # fit once and reuse the transformed matrix instead of refitting the selector
    X_selected = selector.fit_transform(X_data, Y_data)
    labels = [columns[x] for x in selector.get_support(indices=True)]
    feature = pd.DataFrame(X_selected, columns=labels)
    return feature
Example #20
def stepwise_recur_select(data_df, target_df, model, step_val=0.1, k_vals=[]):

    col_names = list(data_df.columns.values)
    data_np = data_df.values
    target_np = target_df.values

    scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}

    auc_results = []
    acc_results = []
    feature_select = []
    auc_std_results = []
    acc_std_results = []
    for k_val in k_vals:
        sel = RFE(model, n_features_to_select=k_val, step=step_val)
        data_np_fs = sel.fit_transform(data_np, target_np)

        scores = cross_validate(model,
                                data_np_fs,
                                target_np,
                                scoring=scorers,
                                cv=5)
        auc_score = scores['test_roc_auc'].mean()
        acc_score = scores['test_Accuracy'].mean()
        auc_results = np.append(auc_results, auc_score)
        acc_results = np.append(acc_results, acc_score)
        auc_std = scores['test_roc_auc'].std() * 2
        acc_std = scores['test_Accuracy'].std() * 2
        auc_std_results = np.append(auc_std_results, auc_std)
        acc_std_results = np.append(acc_std_results, acc_std)
        feature_select.append(sel.get_support())

    optimal_ndx = np.where(auc_results == auc_results.max())[0]
    if len(optimal_ndx) > 1:
        optimal_ndx = optimal_ndx[0]
    else:
        optimal_ndx = int(optimal_ndx)
    sel_idx = feature_select[int(optimal_ndx)]

    keep_cols, del_cols = get_keep_del_cols(col_names, sel_idx)

    print('-- Stepwise Recursive Feature Selection --')
    print('K Selected: {}'.format(k_vals[optimal_ndx]))
    print('Selected Model Mean Accuracy Score: {}'.format(
        acc_results[optimal_ndx]))
    print('Selected Model Accuracy Deviation: {}'.format(
        acc_std_results[optimal_ndx]))
    print('Selected Model Mean AUC Score: {}'.format(auc_results[optimal_ndx]))
    print('Selected Model AUC Deviation: {}'.format(
        auc_std_results[optimal_ndx]))
    print('Number of Original Features: {}'.format(len(col_names)))
    print('Number of Selected Features: {}'.format(len(keep_cols)))
    print('Features Selected:')
    print(keep_cols)
    print('Features Removed:')
    print(del_cols)

    new_data_df = data_df.drop(del_cols, axis=1)

    return new_data_df, del_cols
Example #21
def selector_RFE(features, label):
    model = LogisticRegression()
    selector = RFE(model, n_features_to_select=15)
    selector.fit(features, label)
    print(sorted(zip(map(lambda x: round(x, 4), selector.get_support()),
                     names),
                 reverse=True))
Example #22
def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(
        rfe, scoring='f1_weighted', n_jobs=4
    )
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False, classes=class_names)
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png',size=(1080,720))

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(
        rfe, classes=class_names
    )
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080,720))
    viz_RA.fit(X, y)
    viz_RA.score(X, y)
    viz_RA.show(outpath=outdir + '/RA.png')

    fit = rfe.fit(X,y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df =pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)

    return f1
Example #23
def q4():
    LR = LinearRegression()
    x_train, y_train = fifa.drop('Overall', axis=1), fifa['Overall']
    LR.fit(x_train, y_train)
    rfe = RFE(LR, n_features_to_select=5)
    rfe.fit(x_train, y_train)
    selected_columns = list(x_train.columns[rfe.get_support()])
    return selected_columns
Example #24
def rf_rfe(df):
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    model = RandomForestClassifier()
    # create the RFE model and select 3 attributes
    rfe = RFE(model)
    rfe = rfe.fit(X, y)
    #print rfe.ranking_
    return X.columns.values[rfe.get_support()].tolist()
Example #25
def q4():
    # Retorne aqui o resultado da questão 4.
    reg = LinearRegression()
    X = fifa.drop(columns="Overall")
    y = fifa["Overall"]
    rfe_fit = RFE(estimator=reg, n_features_to_select=5, step=1).fit(X, y)
    features_selected = rfe_fit.get_support(indices=True)
    return list(X.columns[features_selected])
Example #26
def q4():
    # Retorne aqui o resultado da questão 4.
    regl = LinearRegression()
    X_train = fifa.drop(columns=['Overall'], axis=1)
    y = fifa.Overall
    rfe = RFE(regl, n_features_to_select=5, step=1)
    rfe.fit(X_train, y)
    return list(X_train.columns[rfe.get_support()])
Example #27
 def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
   model = svm.LinearSVC(class_weight='balanced')
   rfe = RFE(model, n_features_to_select=numFeatures)
   rfe = rfe.fit(self.instances[:,subset], self.classes)
   # summarize the selection of the attributes
   # print(rfe.get_support(indices=True))
   # print(rfe.ranking_)
   return rfe.get_support(indices=True)
Example #28
def rfe_selector(X, y, num_feats, random_state=None):
    this_selector = RFE(estimator=LogisticRegression(
        C=.1, solver='liblinear', random_state=random_state),
                        n_features_to_select=num_feats,
                        step=.2,
                        verbose=5)
    this_selector.fit(X, y)
    return this_selector.get_support()
Example #29
def q4():
    X = df.drop('Overall', axis=1)
    y = df.Overall
    reg = LinearRegression().fit(X, y)
    selecao = RFE(reg, n_features_to_select=5)
    selecao = selecao.fit(X, y)
    index = selecao.get_support(True)
    return list(X.columns[index])
Example #30
def selectionRecursiveFE(X, y, paramlist):
    #create estimator
    n_features_to_select = paramlist['number _of_features']
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=n_features_to_select, step=3)
    Xnew = rfe.fit_transform(X, y)
    indexarr = rfe.get_support(indices=True)
    return [Xnew, indexarr]
Example #31
def top_rfe(mod, x, y, n, step=0.05, **params):
    selector = RFE(mod(**params), n_features_to_select=n, step=step, verbose=1)
    selector.fit(x, y)
    selected = selector.get_support()

    rfe_ftrs = np.asarray(x.columns)[selected]
    rfe_ftrs = pd.Series(1, index=rfe_ftrs)
    return rfe_ftrs
Example #32
def rfe():
	"""Recursive feature elimination"""
	model = LogisticRegression()
	# create the RFE model and select 3 attributes
	rfe = RFE(model, n_features_to_select=3)
	rfe = rfe.fit(features_train, labels_train)
	# summarize the selection of the attributes
	print([features_considered[i + 1] for i in rfe.get_support(indices=True) ])
	print(rfe.ranking_)
	for i in range(len(rfe.ranking_)):
	    print(features_considered[i+1], ": ", rfe.ranking_[i])
Example #33
def test_rfe_2():
    """Ensure that the TPOT RFE outputs the same result as the sklearn rfe when num_features>no. of features in the dataframe """
    tpot_obj = TPOT()

    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    estimator = LinearSVC()
    rfe = RFE(estimator, n_features_to_select=100, step=0.1)
    rfe.fit(training_features, training_classes)
    mask = rfe.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(training_testing_data[mask_cols], tpot_obj._rfe(training_testing_data, 64, 0.1))
Example #34
def model_logistic(training_data, test_data, features, label):

    '''
    With training and testing data and the data's features and label, select the best
    features with recursive feature elimination method, then
    fit a logistic regression model and return predicted values on the test data
    and a list of the best features used.

    '''
    
    model = LogisticRegression()
    rfe = RFE(model)
    rfe = rfe.fit(training_data[features], training_data[label])
    predicted = rfe.predict(test_data[features])
    best_features = rfe.get_support(indices=True)
    return predicted, best_features
Example #35
def feat3(matrix):
	last_column = [row[len(matrix[0])-1] for row in matrix]
	data_class = transform_to_int(last_column, matrix[0][len(matrix[0])-1])
	indices = list(range(len(matrix[0])-1))
	new_list = list(map(operator.itemgetter(*indices), matrix))
	data = np.asarray(new_list)
	data = data.astype(float)
	svc = SVC(kernel="linear", C=1)
	rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
	matrix_new = rfe.fit_transform(data, data_class)
	data_class = np.array([data_class])
	features_selected = np.concatenate((matrix_new,data_class.T),axis=1)
	indices_resultados = rfe.get_support(indices=True)
	features = []	
	for data in indices_resultados:
		features.append(data)
	return features
Example #36
def classify(X_train, X_test, y_train):
    '''
    Train the best classifier on (X_train, and y_train) then predict X_test labels

    :param X_train: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }

    :param X_test: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }

    :param y_train: A dictionary with the following structure
            { instance_id : sense_id }

    :return: results: a list of tuples (instance_id, label) where labels are predicted by the best classifier
    '''

    results = []

    trainVectors, _, trainOutcomes = A.getFeatureVectors(X_train, y_train)
    testVectors, testKeys = A.getFeatureVectors(X_test)

    # Select Features
    svm_clf = svm.LinearSVC()
    selector = RFE(svm_clf, verbose=0, step=10)
    selector = selector.fit(trainVectors, trainOutcomes)
    featMask = selector.get_support()

    # Mask Features
    nItems = testVectors.shape[0]
    testVectorsNew = np.zeros((nItems, np.sum(featMask)))
    for k in range(nItems):
        testVectorsNew[k, :] = testVectors[k, :][featMask]

    model = selector.estimator_
    svm_predict = model.predict(testVectorsNew)
    #svm_clf.fit(trainVectorsNew, trainOutcomes)
    #svm_predict = svm_clf.predict(testVectors)

    results = [(testKeys[k], svm_predict[k]) for k in range(len(testKeys))]

    return results
Example #37
    def _rfe(self, input_df, num_features, step):
        """Uses Scikit-learn's Recursive Feature Elimination to learn the subset of features that have the highest weights according to the estimator

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        num_features: int
            The number of features to select
        step: float
            The percentage of features to drop each iteration

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the `num_features` best features

        """
        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
        training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

        if step < 0.1:
            step = 0.1
        elif step >= 1.:
            step = 0.99
        if num_features < 1:
            num_features = 1
        elif num_features > len(training_features.columns):
            num_features = len(training_features.columns)

        if len(training_features.columns.values) == 0:
            return input_df.copy()

        estimator = SVC(kernel='linear')
        selector = RFE(estimator, n_features_to_select=num_features, step=step)
        try:
            selector.fit(training_features, training_class_vals)
            mask = selector.get_support(True)
            mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
            return input_df[mask_cols].copy()
        except ValueError:
            return input_df[['guess', 'class', 'group']].copy()
Example #38
     # In[ ]:
    svc = LinearSVC(C=20, penalty='l1', dual=False)
    svc.fit(X, y)
    selected_feature_names = feature_cols[list(set(np.where(svc.coef_ != 0)[-1]))]
    X_svm = svc.transform(X)
    print("X_svm L1 transformed:", X_svm.shape)
    X=X_svm


     # In[ ]:

    rfeSelect = RFE(estimator=rf,n_features_to_select=10, step=0.15)
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    # In[ ]:

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print("RFE_FeatureNames: \n",RFE_FeatureNames)


    # In[ ]:

    "http://stackoverflow.com/questions/21548750/plotting-histograms-against-classes-in-pandas-matplotlib"
    for featName in RFE_FeatureNames:
        df.groupby("class").feature.hist(alpha=0.4)
        df.groupby("classname")[featName].plot(kind='kde')


Example #39
def main():
    root = 'data/raw/'
    windowData = None
    windowLabelInfo = None
    files = [f for f in os.listdir(root) if path.isfile(path.join(root, f))]
    labels = [l for l in files if "label" in l]
    labels = sorted(labels)
    gl_data = [g for g in files if "glove" in g]
    gl_data = sorted(gl_data)
    for glove_data, label_data in zip(gl_data,labels):
        user = read_user(root, glove_data, label_data, False)
        if windowData is None:
            windowData = user.windowData
            windowLabelInfo = user.windowLabel
        else:
            windowData = pd.concat([windowData, user.windowData])
            windowLabelInfo = pd.concat([windowLabelInfo, user.windowLabelInfo])

    print("permutate data")

    # TODO: here compute the labels the way we want it for analysis!
    # first simple approach: just the the major labe in each window:
    windowLabelInfo = windowLabelInfo.drop('Unnamed: 0', 1)
    windowData = windowData.drop(u'gesture', 1)

    # permutate the data
    indices = np.random.permutation(windowData.index)
    windowData = windowData.reindex(indices)
    windowLabelInfo = windowLabelInfo.reindex(indices)



    # prepare data for feature selection:
    selectLabelDF, exclude = labelMatrixToArray(windowLabelInfo, 150)
    # now we need to balance the amount of the zero class to the other classes
    # get all 0 indexes:
    selectLabelDF = selectLabelDF.drop(exclude)
    selectData = windowData.drop(exclude)
    selectLabelDF, selectData, _ = normalizeZeroClass(selectLabelDF, selectData)

    # feature selection using VarianceThreshold filter
    # sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    # fit = sel.fit(selectData.values)
    # colIndex = fit.get_support(indices=True)
    # windowData = windowData[windowData.columns[colIndex]]

    # the blow is somehow valid, however:
    # first I would need to transform the features so each X > 0
    # (means vor each colum add the col max negative offset to 0 to each value)
    # but I am more in doupth I should do that as these are univariate
    # selections, and I am not sure if we are more in the multivariate
    # world here.
    # - feature selection getting the X best features based on
    # - statistical tests for the data. We have 65 sensors,
    # - or about 12 different single movements in our case
    # - since in our gesture only complete finger flexation
    # - or relaxation is interesting so the minimum
    # - number of features should be in the range of
    # - 12-65. A good set might be the double amount of that
    #fit = SelectKBest(chi2, k=65).fit(selectData.values, selectLabelDF.values)
    #colIndex = fit.get_support(indices=True)
    #windowData = windowData[windowData.columns[colIndex]]

    # important toto!
    # todo: I think also for feature selection we should take care the 0 class is balanced!
    # todo: if you use it that way, scale the features
    print("Recursively eliminate features: ")
    svc = sklearn.linear_model.Lasso(alpha = 0.1) #svm.SVR(kernel="linear")
    print("test fit.")
    svc.fit(selectData.values, np.ravel(selectLabelDF.values))
    print("run rfecv..")
    rfecv = RFE(estimator=svc, step=0.1, verbose=2)
    rfecv.fit(selectData.values, np.ravel(selectLabelDF.values))
    print("get support...")
    colIndex = rfecv.get_support(indices=True)
    print("shrink data to selected features....")
    windowData = windowData[windowData.columns[colIndex]]
    print(windowData.shape)

    print("selected headers: ")
    print(windowData.columns)

    # first we split trining and test already here. this
    # is because of the different learning approach
    #
    # windowData['gesture'] = windowLabelInfo.idxmax(axis=1)
    splitpoint = int(windowData.index.size * 0.7)
    trainData = windowData[0:splitpoint]
    testData = windowData[splitpoint + 1:]
    trainLabels = windowLabelInfo[0:splitpoint]
    testLabels = windowLabelInfo[splitpoint + 1:]
    # a complete window has 201 frames. we count the label with
    # more than 150, aka. 3/4 as the real label

    labelDF, exclude = labelMatrixToArray(trainLabels, 150)
    # now we need to balance the amount of the zero class to the other classes
    # get all 0 indexes:
    labelDF = labelDF.drop(exclude)
    trainData = trainData.drop(exclude)
    labelDF, trainData, _ = normalizeZeroClass(labelDF, trainData)

    print("++++++++++++++++")
    print(labelDF)
    print("++++++++++++++++")
    print("train data size:")
    print(trainData.shape)
    print("++++++++++++++++")
    headers = Constants.headers
    #d = trainData.loc[:, headers]
    d = trainData.values #d.values
    d = preprocessing.scale(d)

    print(d)

    clf = None
    kf = KFold(n_splits=5)
    score = 0
    for train_index, test_index in kf.split(d):
        X_train = d[train_index, :]
        X_ct = d[test_index, :]
        y_train = labelDF.values[train_index]
        y_ct = labelDF.values[test_index]
        # lin_clf = sklearn.linear_model.LogisticRegression()
        # lin_clf = sklearn.linear_model.LogisticRegression(class_weight='auto')
        # lin_clf = svm.LinearSVC()
        # lin_clf = svm.LinearSVC(class_weight='auto')
        # lin_clf = svm.SVR()
        # lin_clf = svm.SVC()
        # lin_clf = svm.SVC(class_weight='auto')
        lin_clf = svm.SVC(decision_function_shape='ovo')
        # lin_clf = sklearn.neighbors.nearest_centroid.NearestCentroid()
        # lin_clf = sklearn.linear_model.Lasso(alpha = 0.1)
        # lin_clf = sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2")
        # lin_clf = sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", class_weight='auto')
        # lin_clf = sklearn.naive_bayes.MultinomialNB()
        # lin_clf = sklearn.tree.DecisionTreeClassifier()
        # lin_clf = sklearn.tree.DecisionTreeClassifier(class_weight='auto')
        # lin_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
        # lin_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=10, class_weight='auto')
        # lin_clf = sklearn.ensemble.AdaBoostClassifier(n_estimators=100)
        # lin_clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
        lin_clf.fit(X_train, y_train)
        s = lin_clf.score(X_ct, y_ct)
        if s > score:
            score = s
            clf = lin_clf

    #clf = svm.SVC(decision_function_shape='ovo')
    #clf.fit(d, labelDF.values)

    # TODO: test label approach:
    # compute our binary matrix with labels per frame
    # also compute our label vector as above
    # then correct the label vector by looking
    # at multilabel entries if they match with the prediction
    # and set the label to that

    testLabelDF, exclude = labelMatrixToArray(testLabels, 10)

    # testLabelDF, testData, removalIndex = normalizeZeroClass(testLabelDF, testData)
    # testLabels.drop(removalIndex)

    testLabels = testLabels.fillna(0)
    testLabels[testLabels > 0] = 1

    #d = testData.loc[:, headers]
    d = testData.values #d.values
    d = preprocessing.scale(d)

    prediction = clf.predict(d)

    for row in range(prediction.size):
        p = prediction[row]
        val = testLabels.loc[testLabels.index[row]][p]
        if val == 1.0:
            testLabelDF.loc[testLabelDF.index[row]] = p

    print("------------")
    print(prediction)
    print("------------")
    print(testLabelDF)
    print("------------")

    print(classification_report(testLabelDF.values, prediction))
Example #40
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0 and len(training_features.columns.values) <= 700:
    # The feature constructor must be fit on only the training data
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(training_features.values.astype(np.float64))
    constructed_features = poly.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result2 = pd.DataFrame(data=constructed_features)
    result2['class'] = result1['class'].values
else:
    result2 = result1.copy()

# Perform classification with a logistic regression classifier
lrc3 = LogisticRegression(C=0.48148148148148145)
lrc3.fit(result2.loc[training_indices].drop('class', axis=1).values, result2.loc[training_indices, 'class'].values)
result3 = result2.copy()
result3['lrc3-classification'] = lrc3.predict(result3.drop('class', axis=1).values)

# Use Scikit-learn's Recursive Feature Elimination (RFE) for feature selection
training_features = result3.loc[training_indices].drop('class', axis=1)
training_class_vals = result3.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
    result4 = result3.copy()
else:
    selector = RFE(SVC(kernel='linear'), n_features_to_select=min(77, len(training_features.columns)), step=0.99)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result4 = result3[mask_cols]
Example #41
selector = RFE(SVC(kernel='linear', C=1.), n_features_to_select=1500, step=0.25)
classifier = SGDClassifier(loss='log', penalty='l1')


X_train, X_test, Y_train, Y_test = train_test_split(documents, labels, test_size=.25, random_state=42)  

#X_vec = vectorizer.fit_transform(X_train)
#X_sel = selector.fit_transform(X_vec, Y_train)
#classifier.fit_transform(X_sel, Y_train)

#X_prep = vectorizer.transform(X_test)
#X_new = selector.transform(X_prep)
#X_pred = classifier.predict(X_new)

steps = [('vectorizer', vectorizer),
         ('selector', selector),
         ('classifier', classifier)]

pipeline = Pipeline(steps)
pipeline.fit(X_train, Y_train)
X_pred = pipeline.predict(X_test)

fnames = vectorizer.get_feature_names() 
indices = selector.get_support(True)                 
selected_terms = [ fnames[i] for i in indices ]


show_most_informative_features(selected_terms, classifier, n=25)
print(classification_report(Y_test, X_pred))
print(confusion_matrix(Y_test, X_pred))
Example #42
# -*- coding: utf-8 -*-

import pandas
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')

# feature columns: 月份 (month), 季度 (quarter), 广告费用 (ad spend), 客流量 (customer traffic); target 销售额 = sales
feature = data[['月份', '季度', '广告费用', '客流量']]

rfe = RFE(
    estimator=LinearRegression(), 
    n_features_to_select=2
)

sFeature = rfe.fit_transform(
    feature, 
    data['销售额']
)

rfe.get_support()

Example #43
num_round = 250   # Number of rounds of training, increasing this increases the range of output values
clf = xgbw.XGBWrapper(param, num_round, verbose_eval=0)

k = 500
step = 25
result_all = []

for step in [400, 200, 100, 50, 25]:
    selector = RFE(clf, step=step, n_features_to_select=k, verbose=2)

    print("Fitting Selector: k = {}, step = {}".format(k, step))
    start = time.time()
    selector = selector.fit(X_train, y_train)
    train_time = time.time() - start

    support = selector.get_support(indices=True)
    file_name = str(data[0]).rjust(2, "0") + str(data[1]).rjust(2, "0") + "_k" + str(k) + "_s" + str(step)
    addr_out = os.path.join("/home/ubuntu/Weiyi/RFE_Select", file_name)
    np.save(addr_out, support)

    start = time.time()
    prob = selector.predict_proba(X_test)
    test_time = round(time.time() - start, 2)

    score, recall, filter_rate, cut, net_savings = search_cut(prob)
    result_all.append([k, train_time, test_time, score, recall, filter_rate, cut, net_savings, step])

data = pd.DataFrame(np.array(result_all), columns=["k", "train time", "test time", "score", "recall", "filter rate", "cut", "net savings", "step"])
data.to_csv("/home/ubuntu/Weiyi/RFE_Select/RFE_0604.csv")
Example #44
# List of feature importances
importances = pandas.DataFrame(grid.best_estimator_.feature_importances_, index = explanatory_df.columns, columns =['importance'])
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)




# Recursive feature elimination


#rfWithCoef = RandomForestsWithCoef(n_estimators= 500)
rfe = RFE(estimator=rfWithCoef, n_features_to_select=3, step=1, verbose = 0)
rfe.fit(explanatory_df, response_series)

features_used = explanatory_df.columns[rfe.get_support()]
print(features_used)


# Run random forests on 3 best features


conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team, t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted, t.o_oreb as offensive_rebounds,
t.o_dreb as defensive_rebounds, t.o_reb as total_rebounds, t.o_asts as assists, t.o_pf as personal_fouls,
t.o_stl as steals, t.o_to as turnovers, t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed, t.d_reb as rebounds_allowed,
t.d_asts as assists_allowed, t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
((o_fgm / o_fga)*100) as field_goal_percentage, ((o_ftm / o_fta)*100) as free_throw_percentage,
((o_3pm / o_3pa)*100) as three_point_percentage, o_blk as blocks, o_pts as points, d_pts as points_against

Example #45
# # Recursive Feature Elimination

# In[14]:

from sklearn.feature_selection import RFE

lr = LogisticRegression()
rfe = RFE(estimator=lr, n_features_to_select=15, step=1)
rfe.fit(bc_X, bc_y)


# In[15]:

select_features_rfe = rfe.get_support()
feature_names_rfe = bc_data.feature_names[select_features_rfe]
print(feature_names_rfe)


# In[16]:

set(feature_names_kbest) & set(feature_names_rfe)


# # Model based selection

# In[17]:

from sklearn.ensemble import RandomForestClassifier
Example #46
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVR

model = LogisticRegression()
selector = RFE(model, n_features_to_select=12, step=1)
selector.fit(X,y)

# summarize the selection of the features
print(X.columns[selector.get_support()])

#get the only selected features from X
X_new=selector.transform(X)
X_new = pd.DataFrame(X_new, columns=X.columns[selector.get_support()])

# 5-folder cross validation
y_pred=cross_val_predict(model,X_new,y, cv=5)

#print precision_score(y,y_pred,average=None)
#print recall_score(y,y_pred,average=None)
#print f1_score(y,y_pred,average=None)
#print accuracy_score(y,y_pred)
#print classification_report(y,y_pred)

#######################################################
Example #47
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    print()

    return W, features