Example #1
def main():
    X, y = loadDataSet()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=33)

    vec = DictVectorizer()
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))
    print('dimension: ', len(vec.feature_names_))

    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train, y_train)
    print('No feature selection: ', dt.score(X_test, y_test))

    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=20)
    X_train_fs = fs.fit_transform(X_train, y_train)
    dt.fit(X_train_fs, y_train)
    X_test_fs = fs.transform(X_test)
    print('20% feature-selection: ', dt.score(X_test_fs, y_test))

    # Cross-validate over a range of fixed feature-selection percentiles and plot the results
    percentiles = range(1, 100, 2)
    results = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                                percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
        results = np.append(results, scores.mean())
    print('Result: \n', results)

    opt = np.where(results == results.max())[0][0]
    print(opt)
    print('Optimal number of features %d ' % percentiles[opt])

    # Build and evaluate a model using the best-performing feature percentile
    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=7)
    X_train_fs = fs.fit_transform(X_train, y_train)
    dt.fit(X_train_fs, y_train)
    X_test_fs = fs.transform(X_test)
    s = dt.score(X_test_fs, y_test)
    print('The best selected: ', s)

    plt.plot(percentiles, results)
    plt.xlabel('percentiles of feature')
    plt.ylabel('accuracy')
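chi2 scoring only accepts non-negative feature values, which is why it pairs naturally with the one-hot output of DictVectorizer above. Below is a minimal sketch (hypothetical toy records rather than the dataset returned by loadDataSet) of how get_support() maps the kept columns back to feature names:

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import feature_selection

# hypothetical records standing in for the data loaded above
records = [{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
           {'pclass': '3rd', 'age': 2.0, 'sex': 'male'},
           {'pclass': '2nd', 'age': 30.0, 'sex': 'male'}]
labels = [1, 0, 0]

vec = DictVectorizer()
X_demo = vec.fit_transform(records)
fs_demo = feature_selection.SelectPercentile(feature_selection.chi2, percentile=50)
fs_demo.fit(X_demo, labels)
print(np.array(vec.feature_names_)[fs_demo.get_support()])  # names of the kept columns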
Example #2
def trainClassifier(year):
    # Use optimal parameters from train.py (varied slightly around the optimal values for randomness)
    estimators = math.floor(np.random.uniform(70, 110))
    max_depth = math.floor(np.random.uniform(4, 5))
    max_features = math.floor(np.random.uniform(10, 30))
    learning_rate = np.random.uniform(5, 20) / 100.0
    params = {
        "n_estimators": estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "learning_rate": learning_rate
    }
    # Create a Gradient Boosting Regressor from these parameters
    clf = GradientBoostingRegressor(**params)
    # Run on all training data except current year
    seasons = [2015, 2016, 2017, 2018, 2019]
    seasons.remove(year)
    # Build team vectors and format training data
    data = buildTeamVectors(seasons=seasons)
    X_train, y_train = formatTrainingData(data, seasons=seasons)
    # Normalize X_train
    X_train = preprocessing.normalize(X_train)
    # Remove columns with low correlation to label outcome
    selector = feature_selection.SelectPercentile(
        feature_selection.mutual_info_classif,
        percentile=50).fit(X_train, y_train)
    X_train = selector.transform(X_train)
    # Train our clf on the training data
    clf.fit(X_train, y_train)
    # Return the clf object
    return clf, selector
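Returning the fitted selector alongside the regressor matters because the same column mask has to be applied to any data scored later. A usage sketch under that assumption (buildTeamVectors, formatTrainingData and preprocessing come from the surrounding project, so this is illustrative only):

clf, selector = trainClassifier(2019)
data = buildTeamVectors(seasons=[2019])                  # project helper, assumed available
X_val, y_val = formatTrainingData(data, seasons=[2019])  # project helper, assumed available
X_val = preprocessing.normalize(X_val)
X_val = selector.transform(X_val)                        # same columns as in training
print(clf.predict(X_val)[:5])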
Example #3
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=True
            )),
            ('select_pct', feature_selection.SelectPercentile(
                percentile=59,
                score_func=feature_selection.mutual_info_classif
            )),
            ('select_k', feature_selection.SelectKBest(
                k=101,
                score_func=feature_selection.f_classif
            )),
            # LocallyLinearEmbedding is the estimator class; the bare
            # locally_linear_embedding function cannot serve as a pipeline step.
            ('estim', manifold.LocallyLinearEmbedding(
                n_neighbors=6,
                n_components=101,
                eigen_solver='auto',
                method='standard'
            )),
        ])

        # The selectors need y for scoring; the final step is a transformer,
        # so the fitted pipeline exposes an embedding rather than predictions.
        pipe.fit(x, y)
        self._model = pipe.transform
Example #4
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=False
            )),
            ('reduce', decomposition.FastICA(
                n_components=40,
                fun='exp',
                random_state=1742,
            )),
            ('select', feature_selection.SelectPercentile(
                percentile=57,
                score_func=feature_selection.mutual_info_classif,
            )),
            ('estim', naive_bayes.GaussianNB()),
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #5
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=False
            )),
            ('select', feature_selection.SelectPercentile(
                percentile=73,
                score_func=feature_selection.f_classif
            )),
            ('estim', neighbors.KNeighborsClassifier(
                n_neighbors=16,
                weights='distance',
                metric='euclidean',
                n_jobs=-1
            ))
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #6
def select_percentile(X_feature, y, percentile):
    selector = fs.SelectPercentile(percentile=percentile,
                                   score_func=fs.f_classif)
    X_transformed = selector.fit_transform(X_feature, y).copy()
    results = -np.log10(selector.pvalues_)
    return X_transformed, results
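The -log10 p-values returned above are convenient for plotting how informative each original column is. A small sketch (synthetic data; the fs and np aliases mirror the snippet's module-level imports):

import numpy as np
import matplotlib.pyplot as plt
from sklearn import feature_selection as fs

X_demo = np.random.rand(100, 20)
y_demo = np.random.randint(0, 2, size=100)
X_sel, scores = select_percentile(X_demo, y_demo, percentile=25)
plt.bar(range(len(scores)), scores)
plt.ylabel('-log10(p-value)')
plt.show()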
Example #7
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            #('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=115)),
            ('drop', transformers.ColumnDropper(columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124))),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=True
            )),
            ('select', feature_selection.SelectPercentile(
                percentile=85,
                score_func=feature_selection.mutual_info_classif
            )),
            ('estim', svm.NuSVC(
                nu=0.0525,
                kernel='rbf',
                gamma='auto',
                shrinking=True,
                class_weight=None,
                random_state=1742
            )),
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #8
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop',
             transformers.ColumnDropper(columns=(6, 7, 8, 11, 12, 13, 14))),
            (
                'scale',
                preprocessing.StandardScaler(
                    with_mean=True,
                    with_std=False  # this is not a typo!
                )),
            #('scale', preprocessing.RobustScaler(
            #    with_centering=True, with_scaling=False, quantile_range=(1.0, 99.0)
            #)),
            ('expand',
             preprocessing.PolynomialFeatures(degree=2,
                                              interaction_only=False,
                                              include_bias=False)),
            ('select',
             feature_selection.SelectPercentile(
                 percentile=98, score_func=feature_selection.f_classif)),
            ('estim',
             discriminant_analysis.QuadraticDiscriminantAnalysis(
                 reg_param=0.0043))
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #9
def crossValidate(clf, X, y, k, percent=50):
    # Keep track of the performance of the model on each fold in the scores array
    scores = []
    # Create the object to split the data
    skf = StratifiedKFold(n_splits=k)
    count = 1
    # Iterate through the training and testing data from each of the k-fold splits
    for train_index, test_index in skf.split(X, y):
        # Get our training and testing data to use from the split function
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Remove columns with low correlation to label outcome
        selector = feature_selection.SelectPercentile(
            feature_selection.mutual_info_classif,
            percentile=percent).fit(X_train, y_train)
        X_train = selector.transform(X_train)
        # Note the columns we remove must be extracted from X_test as well here
        X_test = selector.transform(X_test)
        # Normalize data
        X_train = preprocessing.normalize(X_train)
        X_test = preprocessing.normalize(X_test)
        # Fit based on the training normalized data
        clf.fit(X_train, y_train)
        # Update the scores array with the performance on the testing data
        y_pred = clf.predict(X_test)
        # Move all values to binary classifications
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        # Our function for the prediction will vary depending on the metric
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print(accuracy)
        scores.append(accuracy)
        count += 1
    # Return the average performance across all fold splits.
    return np.array(scores).mean()
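A usage sketch for crossValidate (synthetic data; StratifiedKFold, feature_selection, preprocessing, metrics and np are assumed to be imported at module level as in the snippet, and a regressor mirrors trainClassifier above so the predictions can be thresholded at 0.5):

from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

X_demo = np.abs(np.random.randn(200, 30))
y_demo = np.random.randint(0, 2, size=200)
reg = GradientBoostingRegressor(n_estimators=50, max_depth=3)
print('mean accuracy:', crossValidate(reg, X_demo, y_demo, k=5, percent=40))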
Example #10
def process_my_christine(Xtrain, ytrain, Xval, Xtest, params):
    print('ITS A MY CHRISTINE TIME !!!')

    t0 = time.time()

    modelrf = pipeline.Pipeline([
        ('feature_selection',
         feature_selection.SelectPercentile(
             percentile=30, score_func=feature_selection.f_classif)),
        ('classification',
         RandomForestClassifier(n_estimators=200,
                                random_state=1,
                                n_jobs=params['n_jobs']))
    ])

    modelrf.fit(Xtrain, ytrain)

    print('RF DONE')
    print((time.time() - t0) / 60.)

    ytestrf = modelrf.predict_proba(Xtest)[:, 1]
    yvalrf = modelrf.predict_proba(Xval)[:, 1]

    ytestfinal = ytestrf
    yvalfinal = yvalrf

    return yvalfinal, ytestfinal
Example #11
def tuning(X_train,y_train):
    param = {
        'n_estimators': range(30, 50, 2),
        'max_depth': range(2, 7, 1)
    }
    # 'mn' is assumed to be a module-level model selector ('gbr' or an XGBoost variant)
    if mn == 'gbr':
        X_train.fillna(0, inplace = True)
        params = {'n_estimators': 500, 'max_depth': 4,'learning_rate': 0.01, 'loss': 'ls'}
        model = GradientBoostingRegressor(**params)
    else:
        model = xgb.XGBRegressor(learning_rate=0.01,n_estimators=500, max_depth=4, silent=True, objective='reg:gamma')
    # clf = GridSearchCV(estimator = model, param_grid = param, scoring='r2', cv=10)
    # clf.fit(X_train, y_train)
    # print(clf.grid_scores_, clf.best_params_, clf.best_score_)

    percentiles = range(1, 100, 2)
    results = []
    X_train.fillna(0, inplace = True)
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.f_regression, percentile = i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(model, X_train_fs, y_train, cv=5)
        results = np.append(results, scores.mean())
    print(results)
    opt = np.where(results == results.max())[0][0]
    print(percentiles[opt])
Example #12
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=False
            )),
            ('select', feature_selection.SelectPercentile(
                percentile=54,
                score_func=feature_selection.mutual_info_classif
            )),
            ('estim', semi_supervised.LabelPropagation(
                kernel='rbf',
                alpha=0.65,
                n_neighbors=4,
                n_jobs=-1
            )),
        ])

        pipe.fit(x, y)
        self._transduction = pipe.named_steps['estim'].transduction_
        self._model = pipe.predict
Example #13
File: main.py  Project: ajmokotoff/AI_Final
def univariate_feature_selection(option, opt, value, parser):
    n_samples = len(y)
    x = np.reshape(X, (n_samples, -1))
    x = np.hstack((x, 2 * np.random.random((n_samples, 400))))

    transform = feature_selection.SelectPercentile(feature_selection.f_classif)
    clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])

    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        # Compute cross-validation score using 1 CPU
        this_scores = cross_val_score(clf, x, y, n_jobs=1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the SVM-Anova varying the percentile of features selected'
    )
    plt.xlabel('Percentile')
    plt.ylabel('Prediction rate')

    plt.axis('tight')
    plt.show()
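This function appears to be adapted from scikit-learn's SVM-Anova example and reads module-level X and y; in that example they come from the digits dataset. A hedged sketch of providing those globals before wiring the function into an optparse callback:

from sklearn.datasets import load_digits

# module-level globals consumed by univariate_feature_selection (assumption)
X, y = load_digits(return_X_y=True)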
Example #14
    def comput_coefs(self, X, y, size):
        cv = KFold(2)  # cross-validation generator for model selection
        ridge = BayesianRidge()
        cachedir = tempfile.mkdtemp()
        mem = Memory(cachedir=cachedir, verbose=1)

        # Ward agglomeration followed by BayesianRidge
        connectivity = grid_to_graph(n_x=size, n_y=size)
        ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                    memory=mem)
        clf = Pipeline([('ward', ward), ('ridge', ridge)])
        # Select the optimal number of parcels with grid search
        clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
        coef_agglomeration_ = coef_.reshape(size, size)

        # Anova univariate feature selection followed by BayesianRidge
        f_regression = mem.cache(feature_selection.f_regression)  # caching function
        anova = feature_selection.SelectPercentile(f_regression)
        clf = Pipeline([('anova', anova), ('ridge', ridge)])
        # Select the optimal percentage of features with grid search
        clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
        coef_selection_ = coef_.reshape(size, size)
        return dict(
            coef_selection_=coef_selection_,
            coef_agglomeration_=coef_agglomeration_,
            cachedir=cachedir
        )
Example #15
def svm_classifier(X, y, is_default=True):
    from sklearn.svm import SVC

    if is_default:
        model = SVC(probability=True)
        model.fit(X, y)
        return model
    else:
        param_grid = {
            'kernel': ['rbf'],
            'C': [1e-2, 1e-1, 1, 10],
            'gamma': [1e-4, 1e-3, 1e-2]
        }
        fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                                percentile=20)
        x_train_fs = fs.fit_transform(X, y)

        model = SVC(probability=True)
        grid_search = GridSearchCV(model,
                                   param_grid,
                                   cv=5,
                                   scoring='accuracy',
                                   verbose=1,
                                   n_jobs=-1)
        grid_search.fit(x_train_fs, y)
        best_parameters = grid_search.best_estimator_.get_params()

        # model with the best parameters
        model = SVC(kernel=best_parameters['kernel'],
                    C=best_parameters['C'],
                    gamma=best_parameters['gamma'],
                    probability=True)
        model.fit(x_train_fs, y)
        return model
Example #16
def feature_select_per(features_train, target_train, features_test,
                       target_test, est, lr, depth, subsample, colsamplebt):

    # Now after the model has been tuned, use percentile to do feature selection
    from sklearn import feature_selection

    acc_list_train = []
    acc_list_test = []

    per_list = []

    percentile = range(1, 101)
    #range(10,100)
    #percentile = [22]

    # identify the percentile that will produce the best results
    for per in percentile:

        # initialize SelectPercentile with the current percentile
        fs = feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=per)
        feature_model = fs.fit(features_train, target_train)

        features_train_new = feature_model.transform(features_train)
        features_test_new = feature_model.transform(features_test)

        xgb = xgboost.XGBClassifier(n_estimators=est,
                                    learning_rate=lr,
                                    gamma=0,
                                    subsample=subsample,
                                    colsample_bytree=colsamplebt,
                                    max_depth=depth)

        xgb.fit(features_train_new, target_train)
        pred_test = xgb.predict(features_test_new)
        pred_train = xgb.predict(features_train_new)

        predictions_train = [round(value) for value in pred_train]
        predictions_test = [round(value) for value in pred_test]

        train_accuracy = accuracy_score(target_train, predictions_train)
        test_accuracy = accuracy_score(target_test, predictions_test)

        print(per)
        print(train_accuracy)
        print(test_accuracy)

        per_list.append(per)
        acc_list_train.append(train_accuracy)
        acc_list_test.append(test_accuracy)

    per_results = pd.DataFrame({
        'per': per_list,
        'acc_train': acc_list_train,
        'acc_test': acc_list_test
    })

    per_results.to_csv('per_results.csv')

    return per_results
Example #17
def rf_classifier(X, y, is_default=True):
    from sklearn.ensemble import RandomForestClassifier

    if is_default:
        model = RandomForestClassifier()
        model.fit(X, y)
        return model
    else:
        param_grid = {
            'n_estimators': range(10, 100, 10),
            'max_features': np.linspace(0.5, 0.9, num=5).tolist(),
            'max_depth': [10, 50, None],
        }
        fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                                percentile=20)
        x_train_fs = fs.fit_transform(X, y)

        model = RandomForestClassifier()
        grid_search = GridSearchCV(model,
                                   param_grid,
                                   cv=5,
                                   scoring='accuracy',
                                   verbose=1,
                                   n_jobs=-1)
        grid_search.fit(x_train_fs, y)
        best_parameters = grid_search.best_estimator_.get_params()

        # model with the best parameters
        model = RandomForestClassifier(
            n_estimators=best_parameters['n_estimators'],
            max_features=best_parameters['max_features'],
            max_depth=best_parameters['max_depth'])
        model.fit(x_train_fs, y)
        return model
Example #18
def run_select_percentile(file_path):
    """
    Returns a list of selected feature names.
    :param file_path: Path for the training matrix of extracted features to select from
    :return: List of selected feature names
    """

    # Setting up dataset
    data = pd.read_csv(file_path)

    x_train = data.drop('Label', axis=1)
    y_train = data['Label']

    # Select features according to a percentile of the highest scores.
    feature_selector = fs.SelectPercentile(score_func=fs.f_classif,
                                           percentile=10)
    feature_selector.fit_transform(x_train, y_train)

    mask = feature_selector.get_support()

    # List of all candidate feature names (excluding the label column)
    feature_names = list(x_train.columns.values)

    # List of selected feature names
    new_feature_names = []

    for feature_is_selected, feature in zip(mask, feature_names):
        if feature_is_selected:
            new_feature_names.append(feature)

    return new_feature_names
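A usage sketch (the CSV path is hypothetical; the file must contain a 'Label' column, and pd/fs are the module-level pandas and sklearn.feature_selection aliases used above):

selected = run_select_percentile('train_features.csv')  # hypothetical path
print(len(selected), 'features kept:', selected[:10])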
Example #19
def main():
    X, Y = loadData()
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.25,
                                                        random_state=33)
    vec = DictVectorizer()
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))
    print(len(vec.feature_names_))
    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train, Y_train)
    print(dt.score(X_test, Y_test))

    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=20)
    X_train_fs = fs.fit_transform(X_train, Y_train)
    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train_fs, Y_train)
    X_test_fs = fs.transform(X_test)
    print(dt.score(X_test_fs, Y_test))

    percentiles = range(1, 100, 2)
    results = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                                percentile=i)
        X_train_fs = fs.fit_transform(X_train, Y_train)
        scores = cross_val_score(dt, X_train_fs, Y_train, cv=5)
        results = np.append(results, scores.mean())
    print(results)

    opt = np.where(results == results.max())[0]
    print(opt)
    print('Optimal number of features %d' % percentiles[opt[0]])
    pl.plot(percentiles, results)
    pl.xlabel('percent of features')
    pl.ylabel('accuracy')
    pl.show()

    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=percentiles[opt[0]])
    X_train_fs = fs.fit_transform(X_train, Y_train)
    # dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train_fs, Y_train)
    X_test_fs = fs.transform(X_test)
    print(dt.score(X_test_fs, Y_test))
Example #20
def main():
    train = read_bagofwords_dat(file_train, num_train)
    test = read_bagofwords_dat(file_test, num_test)

    train_target = []
    for i in range(0, num_train):
        if i < num_train/2:
            train_target.append(0) #notspam 
        else: 
            train_target.append(1) #spam

    test_target = []
    for i in range(0, num_test):
        if i < num_test/2:
            test_target.append(0) #notspam
        else:
            test_target.append(1) #spam


    if select_features:
        selector = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=percentile)
        #selector = feature_selection.SelectKBest(feature_selection.f_classif, k = 10)
        train = selector.fit_transform(train, train_target)
        test = selector.transform(test)
        
        #mask = selector.get_support()
        #print_features(mask)
        print ("Finished doing %d percentile feature selection" % (percentile))

    classifiers = [
        #(svm.LinearSVC(), "SVML"),
        #(GaussianNB(), "Gaussian"), 
        #(MultinomialNB(1.0, False, class_prior), "Multinomial"), 
        #(BernoulliNB(1.0, freq_cutoff, False, class_prior), "Bernoulli"),
        #(tree.DecisionTreeClassifier(), "Decision Tree"),  
        (AdaBoostClassifier(base_estimator = tree.DecisionTreeClassifier(max_depth=3), n_estimators = rounds), 
        "Adaboost with %d rounds and max-depth 3 decision tree" % (rounds))
        ]
    for (classifier, name) in classifiers: 
        model = classifier.fit(train, train_target)
        #y_pred = model.predict(test)
        if name == "SVML":
            y_scores = model.decision_function(test)
        else:
            y_scores = model.predict_proba(test)[:,1]

        #FP = 0
        #FN = 0
        #TP = 0
        #for i in range(0, num_test):
        #    if y_pred[i] == "spam" and test_target[i] == "notspam":
        #        FP+=1
        #    if y_pred[i] == "notspam" and test_target[i] == "spam":
        #        FN+=1
        #    if y_pred[i] == "spam" and test_target[i] == "spam":
        #        TP+=1

        #print("%s: FP %d, FN %d, TP %d " % (name, FP, FN, TP))
        print("%s: AUC %f" % (name, roc_auc_score(test_target, y_scores)))
Example #21
def simon_pipeline(simon_transformer, percentile):
    return Pipeline([
        ('simon', simon_transformer),
        ('scale', MinMaxScaler(feature_range=(-1, 1))),
        ('percent',
         feature_selection.SelectPercentile(feature_selection.f_classif,
                                            percentile=percentile)),
    ])
Example #22
def get_percentile_columns(X, y, percentile=2, score_func=None):
    """
    Method to fetch columns based on SelectPercentile feature selection
    """
    selector = feature_selection.SelectPercentile(score_func=score_func,
                                                  percentile=percentile).fit(
                                                      X, y)
    return X.columns[selector.get_support()]
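Because the default score_func is None, a callable such as f_classif has to be passed in explicitly. A usage sketch on a small DataFrame (column names are hypothetical):

import numpy as np
import pandas as pd
from sklearn import feature_selection

df = pd.DataFrame(np.random.rand(50, 10),
                  columns=['f%d' % i for i in range(10)])
target = np.random.randint(0, 2, size=50)
cols = get_percentile_columns(df, target, percentile=30,
                              score_func=feature_selection.f_classif)
print(list(cols))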
Example #23
def optimal_features_select_from_data(X_train, X_test, Y_train, Y_test):
    optimal_percentile, results, percentiles = optimal_percentile_find(
        X_train, X_test, Y_train, Y_test)
    fs = feature_selection.SelectPercentile(
        feature_selection.chi2, percentile=percentiles[optimal_percentile])
    X_train_fs = fs.fit_transform(X_train, Y_train)  # fit on the training split only
    X_test_fs = fs.transform(X_test)  # reuse the fitted selector on the test split
    return X_train_fs, X_test_fs
Example #24
def percentile_filter(X, y, percentile=20):
    selector = fs.SelectPercentile(fs.chi2, percentile=percentile)
    selector.fit(X, y)
    # features = selected_features(selector, feature_names)
    # log('Percentile', len(features))
    # log('X', xt.shape)
    # return pd.DataFrame(xt, columns=features, index=X.index), selector
    return selector
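A usage sketch (chi2 requires non-negative inputs, so the demo matrix is kept positive; fs and np are the snippet's module-level aliases):

import numpy as np
from sklearn import feature_selection as fs

X_demo = np.random.rand(100, 40)
y_demo = np.random.randint(0, 3, size=100)
sel = percentile_filter(X_demo, y_demo, percentile=25)
print(sel.transform(X_demo).shape)  # (100, 10)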
Example #25
def train_pair(train_set,
               resize_img=300,
               num_id=5,
               num_img_id=10,
               min_sample=False,
               params=None):
    train_labels = []
    train_hists = []

    for item in train_set:
        print(item)
        train_labels.append(item[0])
        image1_p = item[1][0]
        image2_p = item[1][1]

        image1 = cv.imread(image1_p, 0)
        image2 = cv.imread(image2_p, 0)

        image1_hist = face_descriptors.get_orb_histograms(image1, resize_img)
        image2_hist = face_descriptors.get_orb_histograms(image2, resize_img)

        train_hists.append(np.concatenate((image1_hist, image2_hist)))

    anova_filter = feature_selection.SelectPercentile(
        feature_selection.f_classif)

    if params is not None:
        c, gamma = params
        models = []
        for c_value in c:
            for g_value in gamma:
                # Initialize SVM
                print("Training SVM: c ", c_value, ",gamma ", g_value)
                clf = SVC(kernel='rbf',
                          decision_function_shape='ovr',
                          C=c_value,
                          gamma=g_value,
                          class_weight='balanced',
                          probability=True,
                          verbose=5)
                clf = make_pipeline(anova_filter, clf)
                # fit the model
                clf.fit(train_hists, train_labels)
                models.append([clf, c_value, g_value])
        return models
    else:
        clf = SVC(kernel='linear',
                  decision_function_shape='ovr',
                  C=1,
                  class_weight='balanced',
                  probability=True,
                  verbose=5)
        clf = make_pipeline(anova_filter, clf)
        # fit the model
        clf.fit(train_hists, train_labels)

        return clf
Example #26
def plot_BestKFeatures(X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    Find the best percentile of features to use,
    using cross-validation on the training set and get K best feats
    '''
    from sklearn import model_selection
    from sklearn import feature_selection
    from sklearn import tree
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt = RandomForestClassifier(n_jobs=2,
                                bootstrap=True,
                                n_estimators=250,
                                criterion='gini')
    dt = dt.fit(X_train, y_train)

    percentiles = range(1, 95, 5)
    results = []
    for i in range(1, 95, 5):
        fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                                percentile=i)  #Original
        fs = feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=i)  # alt
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = model_selection.cross_val_score(dt,
                                                 X_train_fs,
                                                 y_train,
                                                 cv=4)
        #print i,scores.mean()
        results = np.append(results, scores.mean())

    optimal_percentil = np.where(results == results.max())[0][0]
    print(("Optimal number of features:{0}".format(
        percentiles[optimal_percentil])), "\n")

    # Plot number of features VS. cross-validation scores
    import matplotlib.pylab as pl
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation accuracy")
    pl.plot(percentiles, results)
    print("Mean scores:", results)
    return
Example #27
def get_fs_model(model, method, train, target=None, cv=None):
    """Connects given model with specified feature selection method and trains
    the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    if method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
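For methods like "SelectPercentile" the function returns an unfitted Pipeline, so the caller still has to fit it. A usage sketch (synthetic data; fs_scikit, Pipeline and the other module-level imports are assumed as in the snippet):

from sklearn.linear_model import LogisticRegression
import numpy as np

X_demo = np.random.rand(80, 15)
y_demo = np.random.randint(0, 2, size=80)
pipe_model = get_fs_model(LogisticRegression(max_iter=200), "SelectPercentile", X_demo, y_demo)
pipe_model.fit(X_demo, y_demo)
print(pipe_model.score(X_demo, y_demo))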
Example #28
def test_features(my_dataset, features_list):
    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    from sklearn import feature_selection
    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=20)
    X_train_fs = fs.fit_transform(list(map(abs, features)), labels)

    print([features_list[i] for i in np.argsort(fs.scores_)[::-1]])
    test_code(features, labels)
Example #29
def percentile_k_features(data, k=20):
    X = data.iloc[:, :-1]
    y = data['SalePrice']
    selector = feature_selection.SelectPercentile(f_regression, percentile=k)
    selector.fit(X, y)
    # On the Ames housing data this keeps columns such as 'OverallQual', 'GrLivArea',
    # 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath'.
    return list(X.columns[selector.get_support()])
Example #30
def train_imbalance(
    descr_series: Series,
    classes_codes: Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> tuple:
    """Trains models using handled setting and saves them as .sav objects.

    Parameters:
    ----------
    descr_series:
        description series.
    classes_codes:
        series with classes' codes.
    TFIDF_:
        vectorizer.
    IMB_:
        SMOTE instance.
    FS_:
        ranking terms method.
    req_percentage:
        percentage to be taken from the ranked list.
    CLF_:
        classifier.
    model_name:
        models name.

    Returns:
    ----------
        A dict mapping the model name to the fitted pipeline, and a dict mapping it to the best parameters found.

    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_),
                          ("fs", transformer), ("clf", CLF_)])

    best_params = get_best_params(clf_model, descr_series, classes_codes)
    print(f"{model_name}:{best_params}")

    clf_model.set_params(
        fs__percentile=req_percentage,
        clf__C=best_params["clf__C"],
        clf__gamma=best_params["clf__gamma"],
    ).fit(descr_series, classes_codes)

    return {model_name: clf_model}, {model_name: best_params}
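The 'imba' step only works if Pipeline here is imblearn's pipeline (SMOTE has no transform method, so scikit-learn's Pipeline would reject it), and get_best_params is a project helper. Under those assumptions, a hypothetical call might look like:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

models, params = train_imbalance(
    descr_series=descriptions,        # hypothetical pandas Series of issue descriptions
    classes_codes=labels,             # hypothetical pandas Series of class codes
    TFIDF_=TfidfVectorizer(),
    IMB_=SMOTE(random_state=42),
    FS_=f_classif,
    req_percentage=60,
    CLF_=SVC(probability=True),
    model_name='priority',
)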