Example #1
def feature_importance_with_forest(rforest_classifier, issues_train, priority_train, issues_test, priority_test):
    """
    Assess feature importance using a Random Forest.
    :param rforest_classifier: An already fitted classifier.
    :param issues_train: Train features.
    :param priority_train: Train classes.
    :param issues_test: Test features.
    :param priority_test: Test classes.
    :return: None
    """
    importances = rforest_classifier.feature_importances_
    indices = np.argsort(importances)[::-1]

    for rank in range(len(issues_train.columns)):
        print(rank + 1, ")", issues_train.columns[indices[rank]], importances[indices[rank]])

    figure, axes = plt.subplots(1, 1)
    plt.title('Feature importance')
    plt.bar(range(len(issues_train.columns)), importances[indices], color='lightblue', align='center')
    plt.xticks(range(len(issues_train.columns)), issues_train.columns[indices], rotation=90)
    plt.xlim([-1, len(issues_train.columns)])
    plt.tight_layout()
    plt.show()

    evaluate_performance("FOREST", rforest_classifier, issues_train, priority_train, issues_test, priority_test)

    print "Selecting important features ..."
    select = SelectFromModel(rforest_classifier, threshold=0.05, prefit=True)

    train_selected = select.transform(issues_train)
    test_selected = select.transform(issues_test)

    rforest_classifier.fit(train_selected, priority_train)
    evaluate_performance("FOREST-IMPORTANT", rforest_classifier, train_selected, priority_train, test_selected,
                         priority_test)
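Example #1 relies on project-specific helpers (evaluate_performance) and on imports made elsewhere. For reference, a minimal self-contained sketch of the same prefit SelectFromModel pattern, using only scikit-learn and synthetic data (all names below are illustrative, not from the original project):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=500, n_features=20, n_informative=5, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)

# prefit=True reuses the already fitted forest instead of cloning and refitting it.
selector = SelectFromModel(forest, threshold=0.05, prefit=True)
X_reduced = selector.transform(X)
print(X_reduced.shape)  # only columns with importance >= 0.05 remain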
Example #2
def lassoCV_regression(data, target, alphas):
    clf = LassoCV(alphas=alphas)
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]

    # Raise the threshold until at most two features remain.
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(data_transform):
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
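The while-loop above recurs in several later examples (#3, #5, #22, #25): keep raising the threshold of an already fitted SelectFromModel until at most k features survive. A generalized sketch of that pattern (the helper name is ours, not from the snippets, and it assumes a numeric threshold):

def shrink_to_k_features(sfm, X, k, step=0.1):
    # sfm must already be fitted; each pass only re-applies the mask, no refit.
    X_t = sfm.transform(X)
    while X_t.shape[1] > k:
        sfm.threshold += step
        X_t = sfm.transform(X)
    return X_t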
Example #3
def selecttest():
    import matplotlib.pyplot as plt
    import numpy as np

    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X,y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X,y)
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
Example #4
def forests(input_df, target_df):
	"""This method implements two types of forest features selection. ExtraTreesClassifier & RandomForestClassifier. Features are ranked in order of importance."""
	from sklearn.ensemble import ExtraTreesClassifier
	from sklearn.feature_selection import SelectFromModel
	clf = ExtraTreesClassifier(random_state = 0)
	clf = clf.fit(input_df, target_df)
	model = SelectFromModel(clf, prefit=True)
	input_df_new = model.transform(input_df)
	original_space = input_df.shape
	new_space_ETC = input_df_new.shape
	tuple_holder = [(j, i) for i, j in zip(input_df.columns, clf.feature_importances_)]
	tuple_holder.sort()
	tuple_holder.reverse()
	################################################
	################################################
	from sklearn.ensemble import RandomForestClassifier
	clf = RandomForestClassifier(random_state = 0)
	clf = clf.fit(input_df, target_df)
	model = SelectFromModel(clf, prefit=True)
	input_df_new = model.transform(input_df)
	new_space_RFC = input_df_new.shape
	tuple_holder_2 = [(j, i) for i, j in zip(input_df.columns, clf.feature_importances_)]
	tuple_holder_2.sort()
	tuple_holder_2.reverse()
	################################################
	################################################
	rank_number = 0
	print('ExtraTreesClassifier', '\t'*4, 'RandomForestClassifier')
	print('Old Space: ', original_space, '\t'*4, 'Old Space:', original_space)
	print('New Space: ', new_space_ETC, '\t'*4, 'New Space:', new_space_RFC)
	for i, j in zip(tuple_holder, tuple_holder_2):
		rank_number += 1
		print(rank_number, '|', i, '\t'*3, rank_number, '|', j)
Example #5
def lasso_reducer(X, y):

    clf = LassoCV()

    # Set a minimum threshold of 0.25 on the absolute Lasso coefficients
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Reset the threshold until the number of features equals two.
    # Note that the attribute can be set directly instead of repeatedly
    # fitting the metatransformer.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the selected two features from X.
    plt.title('Features selected from Boston using SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)

    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()

    return
Example #6
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]  # Use feat. with >0 importance

    roc_scores = {}
    for thresh in thresholds:  # select features using threshold

        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        selection_model = XGBClassifier()  # train model
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        select_X_test = selection.transform(X_test)  # eval model
        y_pred = selection_model.predict(select_X_test)

        roc = roc_auc_score(y_test, y_pred)
        roc_scores[selection.threshold] = roc

    best_thresh = max(roc_scores, key=roc_scores.get)

    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))

    return X_train_trans_, X_test_trans_
Example #7
def selectfeature(x, y, x_pre):
	x, x_pre = datscater(x, x_pre)
	clf = linear_model.LassoLars().fit(x, y)
	model = SelectFromModel(clf, prefit=True)
	x_new = model.transform(x)
	print('x', x.shape)
	print(x_new.shape)
	x_pre = model.transform(x_pre)
	return x_new, x_pre
Example #8
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])
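The behaviour this test exercises, tightening threshold on a fitted selector and re-applying transform() without refitting, in a small self-contained sketch (synthetic data; names are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
sel = SelectFromModel(LogisticRegression(), threshold=-np.inf).fit(X, y)
print(sel.transform(X).shape[1])  # all 10 features kept
sel.threshold = "2*mean"          # tighten without touching estimator_
print(sel.transform(X).shape[1])  # strictly fewer features kept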
Example #9
    def train(self):
        rfc = RandomForestRegressor()
        rfc.fit(self.data, self.target)

        model = SelectFromModel(rfc, prefit=True)
        X = model.transform(self.data)
        self.predict = model.transform(self.predict)

        rfc.fit(X, self.target)
        return rfc
Example #10
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y, classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
Example #11
def extra_trees_classifier():

    titanic = Titanic_Data('../input/train.csv', '../input/test.csv')

    combined_normalized_data = titanic.get_normalized_data()

    train,test,targets = recover_train_test_target('../input/train.csv', combined_normalized_data)

    
    clf = ExtraTreesClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_

    features.sort_values(by='importance', ascending=False, inplace=True)

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    print(train_new.shape)

    test_new = model.transform(test)
    print(test_new.shape)

    forest = RandomForestClassifier(max_features='sqrt')

    parameter_grid = {
                     'max_depth' : [4,5,6,7,8],
                     'n_estimators': [200,210,240,250],
                     'criterion': ['gini','entropy']
                     }

    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest,
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId','Survived']].to_csv('./extra_trees_classifier_output.csv',index=False)
Example #12
def run():
    allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove =
        ["KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH", "BTUEL", "BTUELSPH", "BTUELCOL", "BTUELWTH", "BTUELRFG","BTUELOTH",
        "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH", "DOLELRFG", "DOLELOTH", "TOTALBTUOTH", "TOTALBTUCOL", 'TOTALBTU', 'TOTALBTUWTH',
         'TOTALBTU', 'TOTALBTUSPH', 'TOTALBTURFG', 'TOTALDOL', 'TOTALDOLSPH', 'TOTALDOLCOL', 'TOTALDOLWTH', 'TOTALDOLRFG', 'TOTALDOLOTH'])
    #allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove = [], forceUse =
    #                         [
    #                           'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP', 'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL', 'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM', 'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #                         ])

    clf = RandomForestRegressor(n_estimators = 100, n_jobs = 7)
    clf.fit(X, y)

    model = SelectFromModel(clf, prefit = True)
    X = model.transform(X)

    relevantFeatures = [allKeys[i] for i, keep in enumerate(model.get_support()) if keep]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)


    clf.fit(X_train, y_train)
    print(y_test[:100])
    print(metrics.mean_squared_error(clf.predict(X_test), y_test))
    features = sorted(zip(relevantFeatures, clf.feature_importances_), key = lambda x : x[1], reverse = True)
    print("Features", features)
Example #13
    def tree_based_selection(self, data_set, data_target, feature_names):
        """

        :param data_set:
        :return:
        """

        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print(clf.feature_importances_)

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print(np.array(check))

        return feature_set, fea_index
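The double loop above recovers the indices of the selected columns by comparing column values, which is quadratic and can misfire on duplicate columns. The fitted selector exposes the same information directly; a self-contained sketch of that shortcut (synthetic data, illustrative names):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=15, n_informative=4, random_state=0)
model = SelectFromModel(ExtraTreesClassifier(random_state=0).fit(X, y), prefit=True)
feature_set = model.transform(X)
fea_index = list(np.where(model.get_support())[0])  # indices of the kept columns
print(fea_index)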
Example #14
def feature_selection(_train_data, _valid_data, _test_data, _train_label, _valid_label, _test_label):
    train_imageNo = _train_data.shape[0]
    valid_imageNo = _valid_data.shape[0]
    whole_data = numpy.concatenate((_train_data, _valid_data, _test_data))
    whole_data = whole_data.reshape((-1, 120))

    whole_label = numpy.concatenate((_train_label, _valid_label, _test_label))
    whole_label = list(whole_label)

    new_label_list = list()
    for i in whole_label:
        for j in range(100):
            new_label_list.append(i)

    assert len(new_label_list) == whole_data.shape[0]

    lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(whole_data, new_label_list)
    model = SelectFromModel(lsvc, prefit=True)
    data_new = model.transform(whole_data)
    print('After feature selection we have', data_new.shape[1], 'features.')

    data_new = data_new.reshape((-1, 100, data_new.shape[1]))
    _train_data = data_new[:train_imageNo,:,:]
    _valid_data = data_new[train_imageNo:train_imageNo+valid_imageNo,:,:]
    _test_data = data_new[train_imageNo+valid_imageNo:,:,:]

    return _train_data, _valid_data, _test_data
Example #15
def test_feature_importances_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(), threshold=threshold, norm_order=order)
            transformer.fit(X, y)
            assert_true(hasattr(transformer.estimator_, "coef_"))
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_equal(X_new, X[:, feature_mask])
Example #16
def rf_feat_reduction(rf_model, features):

    print " Reducing number of input features based on feature importance."
    subset_model = SelectFromModel(rf_model, prefit=True)
    feat_subset = subset_model.transform(features)
    feat_bool = subset_model.get_support()
    print " " + str(len(feat_subset[0])) + " features chosen after model selection."
    return feat_subset, feat_bool
Example #17
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))

    # check that if est doesn't have partial_fit, neither does SelectFromModel
    transformer = SelectFromModel(estimator=RandomForestClassifier())
    assert_false(hasattr(transformer, "partial_fit"))
Example #18
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Find the names of the fields that were not used
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
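The snippet recovers the selected column names by taking the top how_long entries of the importance ranking. SelectFromModel also exposes the selection mask directly, which avoids any assumption about ordering; a sketch that reuses sfm and fe_name from the example above:

selected_mask = sfm.get_support()  # boolean mask aligned with the original fe_name order
feature_used_name = [name for name, keep in zip(fe_name, selected_mask) if keep]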
Example #19
def select_features_tree(X, y, feature_names = []):
    print(X.shape)
    #forest = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=8)
    fo = forest.fit(X, y)
    sorted_feature_names = plot_feature_importance(fo, X, feature_names)
    model = SelectFromModel(fo, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    return X_new, sorted_feature_names[0:X_new.shape[1]]
Example #20
def select_feature(clf, x_train, x_valid):
    clf.fit(x_train, y_train)  # y_train is expected from the enclosing scope
    model = SelectFromModel(clf, prefit=True, threshold="mean")

    print(x_train.shape)
    x_train = model.transform(x_train)
    x_valid = model.transform(x_valid)
    print(x_train.shape)

    return x_train,x_valid
Example #21
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
Example #22
def select_feature_from_model(X, y, max_features):
    from sklearn.feature_selection import SelectFromModel

    X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.keys())
    classifier = SVC(kernel='linear', class_weight='balanced', C=0.025)
    sfm = SelectFromModel(classifier, threshold=0.05)
    sfm.fit(X_scaled, y)
    X_transform = sfm.transform(X_scaled)
    n_features = X_transform.shape[1]
    while n_features > max_features:  # set the max number of features to select
        sfm.threshold += 0.05
        X_transform = sfm.transform(X_scaled)
        n_features = X_transform.shape[1]
    X_final = pd.DataFrame(X_transform)

    hashes = {}
    features_selected = []
    for c in X_scaled.keys(): hashes[hash(tuple(X_scaled[c].values))] = c
    for c in X_final.keys():
        features_selected.append(hashes[hash(tuple(X_final[c].values))])
    print('Features selection by SelectFromModel: {}'.format(features_selected))
Example #23
def test_coef_default_threshold():
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])
Example #24
def predict_probabilities(X_train,X_test,y_train,threshold,component,m):
	## Selector phase
	selector = SelectFromModel(linear_model.LogisticRegression(),threshold=threshold)
	#print(X_train, y_train)
	selector.fit(X_train,y_train)
	new_X_train = selector.transform(X_train)
	
	##PCA phase
	pca = PCA(n_components=component)
	
	pca.fit(new_X_train)
	pca_variance =  sum(pca.explained_variance_ratio_)
	pca_X_train = pca.transform(new_X_train)
	
	#convert the X_test
	pca_X_test = pca.transform(selector.transform(X_test))
	
	##Model phase
	model = m[1]
	model.fit(pca_X_train,y_train)
	return model.predict_proba(pca_X_test), pca_variance
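The selector, PCA, model chain in Example #24 can also be written as a scikit-learn Pipeline, which guarantees the same transforms are applied to train and test data. A sketch, with LogisticRegression standing in for the m[1] model of the original:

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("select", SelectFromModel(LogisticRegression(), threshold=0.1)),
    ("pca", PCA(n_components=5)),
    ("model", LogisticRegression()),
])
# pipe.fit(X_train, y_train); probs = pipe.predict_proba(X_test)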
Example #25
def lasso_by_num(X_train, y_train, num):
    # if random_state is not specified, each run gives a different result
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

    print(X_train)
    # number of features = ycol-1 

    clf = linear_model.LassoCV()
    sfm = SelectFromModel(clf, threshold=0.00001)
    sfm.fit(X_train, y_train)

    # select num features using lasso
    X_train_trans = sfm.transform(X_train)
    n_features = X_train_trans.shape[1]
    while n_features > num:
        sfm.threshold += 0.01
        #print(sfm.threshold)
        X_train_trans = sfm.transform(X_train)
        n_features = X_train_trans.shape[1]
    
    print(X_train_trans)
Example #26
def select_features(inputs, label, threshold):
    print('training ExtraTreesClassifier...')
    clf = ExtraTreesClassifier(criterion='entropy')
    clf.fit(inputs, label)
    
    threshold = '%f*mean' % (threshold)
    print('training SelectFromModel, threshold=%s...' % (threshold))
    sfm = SelectFromModel(clf, threshold=threshold, prefit=True)
    inputs_new = sfm.transform(inputs)
    #pdb.set_trace()
    print(inputs_new.shape)
    
    return sfm, inputs_new
Example #27
def test_prefit():
    """
    Test all possible combinations of the prefit parameter.
    """
    # Passing a prefit parameter with the selected model
    # and fitting an unfitted model with prefit=False should give the same results.
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y)
Example #28
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0
    )

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, "feature_importances_"))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
Example #29
def execute(fdata):

    data = list()
    target = list()
    storeDict = dict()

    for i, lines in enumerate(fdata):
        sline = lines.split(",")
        target.append(int(sline[0]))
        data.append([float(x) for j, x in enumerate(sline) if j != 0])
        storeDict[i] = [float(x) for j, x in enumerate(sline) if j != 0]

    data = np.array(data)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.25, random_state=0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)

    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    value_feature = list()
    countDict = dict()
    for key, val in storeDict.items():
        countDict[key] = 0
        for i, inval in enumerate(val):
            if inval in X_new[0]:
                countDict[key] = countDict[key] + 1


    keyName = max(countDict, key=countDict.get)
    posStore = list()
    for val in X_new[0]:
        posStore.append(storeDict[keyName].index(val))

    X_test_new = list()

    for val in X_test:
        inlist = list()
        for i, inval in enumerate(val):
            if i in posStore:
                inlist.append(inval)

        X_test_new.append(inlist)

    X_test_new = np.array(X_test_new)

    return accuracy_score(y_test, clf.predict(X_test)), accuracy_score(y_test, clfNew.predict(X_test_new))
Example #30
def fs_svm(X, y):
    # feature selection with SVM model
    lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)

    LIWC = iot.read_fields()
    print('Original feature size', X.shape)
    print('New feature size', X_new.shape)
    sample_X = X[0]
    sample_X_new = X_new[0]
    print('Original feature length of sample', len(set(sample_X)))
    print('New feature length of sample', len(set(sample_X_new)))
    for i in range(len(sample_X)):
        if sample_X[i] in sample_X_new:
            print(i + 1, LIWC[i])
Example #31
class ExtraTreeBasedSelector(Transformer):
    def __init__(self,
                 n_estimators=100,
                 criterion='gini',
                 min_samples_leaf=1,
                 min_samples_split=2,
                 max_features=0.5,
                 bootstrap='False',
                 max_leaf_nodes='None',
                 max_depth='None',
                 min_weight_fraction_leaf=0.,
                 min_impurity_decrease=0.,
                 oob_score=False,
                 n_jobs=-1,
                 random_state=1,
                 verbose=0,
                 class_weight=None):
        super().__init__("extra_trees_based_selector", 7)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        # Initialize here so operate() can check whether a selector was fitted yet.
        self.model = None

        self.n_estimators = n_estimators
        self.estimator_increment = 10
        if criterion not in ("gini", "entropy"):
            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                             "%s" % criterion)
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.min_impurity_decrease = min_impurity_decrease

        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.class_weight = class_weight

    def operate(self, input_datanode, target_fields=None, sample_weight=None):
        from sklearn.feature_selection import SelectFromModel

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        irrelevant_fields = list(range(n_fields))
        for field_id in target_fields:
            irrelevant_fields.remove(field_id)

        if self.model is None:
            from sklearn.ensemble import ExtraTreesClassifier
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.verbose = int(self.verbose)

            max_features = int(X_new.shape[1]**float(self.max_features))
            estimator = ExtraTreesClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight)
            estimator.fit(X_new, y, sample_weight=sample_weight)
            self.model = SelectFromModel(estimator=estimator,
                                         threshold='mean',
                                         prefit=True)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
        selected_types = [
            feature_types[idx]
            for i, idx in enumerate(target_fields) if is_selected[i]
        ]
        selected_types.extend(irrelevant_types)

        new_X = np.hstack((_X, X[:, irrelevant_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        output_datanode.enable_balance = input_datanode.enable_balance
        output_datanode.data_balance = input_datanode.data_balance
        self.target_fields = target_fields.copy()

        return output_datanode

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = Constant("n_estimators", 100)
        criterion = CategoricalHyperparameter("criterion", ["gini", "entropy"],
                                              default_value="gini")
        max_features = UniformFloatHyperparameter("max_features",
                                                  0,
                                                  1,
                                                  default_value=0.5,
                                                  q=0.05)

        max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                 value="None")
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None")

        min_samples_split = UniformIntegerHyperparameter("min_samples_split",
                                                         2,
                                                         20,
                                                         default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf",
                                                        1,
                                                        20,
                                                        default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter(
            'min_weight_fraction_leaf', 0.)
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.)

        bootstrap = CategoricalHyperparameter("bootstrap", ["True", "False"],
                                              default_value="False")

        cs.add_hyperparameters([
            n_estimators, criterion, max_features, max_depth, max_leaf_nodes,
            min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
            min_impurity_decrease, bootstrap
        ])

        return cs
Example #32
    print('Random Forest CV score: {}'.format(np.mean(scores)))

    # Plot CV scores
    fig, ax = plt.subplots(figsize=(15, 7))

    ax.plot(scores)
    ax.set_xlabel('Number of Trees (x10)')
    ax.set_ylabel('10-fold Cross-Validation Accuracy')
    ax.grid()
    plt.show()

    # Show feature importances
    rf.fit(X, Y)

    wine = load_wine()
    features = [wine['feature_names'][x] for x in np.argsort(rf.feature_importances_)][::-1]

    fig, ax = plt.subplots(figsize=(15, 8))

    ax.bar([i for i in range(13)], np.sort(rf.feature_importances_)[::-1], align='center')
    ax.set_ylabel('Feature Importance')
    plt.xticks([i for i in range(13)], features, rotation=60)
    plt.show()

    # Select the most important features
    sfm = SelectFromModel(estimator=rf, prefit=True, threshold=0.02)
    X_sfm = sfm.transform(X)

    print('Feature selection shape: {}'.format(X_sfm.shape))

Example #33
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
boston = load_boston()
X, y = boston.data, boston.target

# We use the base estimator LassoCV since the L1 norm promotes sparsity of
# features.
clf = LassoCV(cv=5)

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title("Features selected from Boston using SelectFromModel with "
          "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
Example #34
from sklearn.feature_selection import SelectFromModel
sdt = SelectFromModel(dt, threshold=0.15)

sdt.fit(x_train, y_train)

#print names of important variables
for i in sdt.get_support(indices=True):
    print(var[i])
    '''
    Petal.Length
    Petal.Width
    '''
#create a data subset with only the most important variables
x_train = sdt.transform(x_train)
x_test = sdt.transform(x_test)
print(x_train.shape)

#Train the new Random Forest Classifier Using only important Variables
#----------model import------------------------------
#=====Random_forest_Classifier=============
from sklearn.ensemble import RandomForestClassifier

dt = RandomForestClassifier(
    n_estimators=100,
    random_state=101)  #Accuracy 0.9666666666666667 using Gini index
#--------fit model----------------------------------
dt.fit(x_train, y_train)

#-------predict data test-----------------------------
Example #35
#if len(z)!=1:
#    if z[1]<int(np.trunc(len(X)*0.1)): 
#        from imblearn.over_sampling import RandomOverSampler
#        ros = RandomOverSampler(random_state=0)
#        X1, Label1 = ros.fit_sample(X[int(np.trunc(z[0]*0.85)):], Label[int(np.trunc(z[0]*0.85)):])
#        X = np.vstack((X[0:int(np.trunc(z[0]*0.85))],X1))
#        y_test = np.hstack((Label[0:int(np.trunc(z[0]*0.85))],Label1))

#Feature Selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, bootstrap=True, oob_score=True)
clf = clf.fit(X, y_test)#,max_depth=20
print(clf.oob_score_)
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
importanceindex = []
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    
from collections import Counter
z1 = Counter(y_test)
print(z1)
Example #36
    return train, test, targets


train, test, targets = recover_train_test_target()
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train, targets)
features = pd.DataFrame()
features['Feature'] = train.columns
features['Importance'] = clf.feature_importances_
features.sort_values(by=['Importance'], ascending=False, inplace=True)
features.set_index('Feature', inplace=True)
features.plot(kind='bar', figsize=(20, 10))
plt.show()

model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(train)
print(train_reduced.shape)
test_reduced = model.transform(test)
print(test_reduced.shape)
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 3,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 6
}

model = RandomForestClassifier(**parameters)
print(model.fit(train, targets))
print(compute_score(model, train, targets, scoring='accuracy'))
Example #37
forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" %
          (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')

plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('./random_forest.png', dpi=300)
plt.show()

from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
X_selected = sfm.transform(X_train)

print(X_selected.shape)

for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" %
          (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
Example #38
    'pay_amount', 'discount_amount', 'basket_cnt', 'phonecall_cnt',
    'favourite_cnt', 'forward_cnt'
]]
#X_test = stepone_tb[['pay_amount', 'discount_amount', 'basket_cnt', 'phonecall_cnt', 'favourite_cnt', 'forward_cnt']][11001:16597]
y_train = stepone_tb['purchase_cnt']
#y_test = stepone_tb['purchase_cnt'][11001:16597]

from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, y_train)

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X_train)
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
clf = clf.fit(X_train, y_train)

# Lasso
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
alpha = 0.1
clf = LassoCV(cv=20)
y_pred_lasso = clf.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(clf)
print("r^2 on test data : %f" % r2_score_lasso)
Example #39
class LibLinear_Preprocessor(AutoSklearnPreprocessingAlgorithm):
    # Liblinear is not deterministic as it uses a RNG inside
    def __init__(self,
                 penalty,
                 loss,
                 dual,
                 tol,
                 C,
                 multi_class,
                 fit_intercept,
                 intercept_scaling,
                 class_weight=None,
                 random_state=None):
        self.penalty = penalty
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.multi_class = multi_class
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.preprocessor = None

    def fit(self, X, Y):
        import sklearn.svm
        from sklearn.feature_selection import SelectFromModel

        self.C = float(self.C)
        self.tol = float(self.tol)

        self.dual = self.dual == 'True'
        self.fit_intercept = self.fit_intercept == 'True'
        self.intercept_scaling = float(self.intercept_scaling)

        if self.class_weight == "None":
            self.class_weight = None

        estimator = sklearn.svm.LinearSVC(
            penalty=self.penalty,
            loss=self.loss,
            dual=self.dual,
            tol=self.tol,
            C=self.C,
            class_weight=self.class_weight,
            fit_intercept=self.fit_intercept,
            intercept_scaling=self.intercept_scaling,
            multi_class=self.multi_class,
            random_state=self.random_state)

        estimator.fit(X, Y)
        self.preprocessor = SelectFromModel(estimator=estimator,
                                            threshold='mean',
                                            prefit=True)

        return self

    def transform(self, X):
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'LinearSVC Preprocessor',
            'name': 'Liblinear Support Vector Classification Preprocessing',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        penalty = Constant("penalty", "l1")
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default="squared_hinge")
        dual = Constant("dual", "False")
        # This is set ad-hoc
        tol = UniformFloatHyperparameter("tol",
                                         1e-5,
                                         1e-1,
                                         default=1e-4,
                                         log=True)
        C = UniformFloatHyperparameter("C",
                                       0.03125,
                                       32768,
                                       log=True,
                                       default=1.0)
        multi_class = Constant("multi_class", "ovr")
        # These are set ad-hoc
        fit_intercept = Constant("fit_intercept", "True")
        intercept_scaling = Constant("intercept_scaling", 1)

        cs.add_hyperparameters([
            penalty, loss, dual, tol, C, multi_class, fit_intercept,
            intercept_scaling
        ])

        penalty_and_loss = ForbiddenAndConjunction(
            ForbiddenEqualsClause(penalty, "l1"),
            ForbiddenEqualsClause(loss, "hinge"))
        cs.add_forbidden_clause(penalty_and_loss)
        return cs
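Outside the auto-sklearn wrapper, the same L1-regularized LinearSVC selection takes only a few lines (synthetic data; parameter values are illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=30, n_informative=5, random_state=0)
svc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X, y)
X_sel = SelectFromModel(svc, prefit=True, threshold="mean").transform(X)
print(X_sel.shape)  # only features whose |coef| is at least the mean survive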
Example #40
def fit_and_score(X,
                  y,
                  scoring,
                  train,
                  test,
                  parameters,
                  fit_params=None,
                  return_train_score=True,
                  return_n_test_samples=True,
                  return_times=True,
                  return_parameters=False,
                  return_estimator=False,
                  error_score='raise',
                  verbose=True,
                  return_all=True):
    """Fit an estimator to a dataset and score the performance.

    The following
    methods can currently be applied as preprocessing before fitting, in
    this order:
    0. Apply OneHotEncoder
    1. Apply feature imputation
    2. Select features based on feature type group (e.g. shape, histogram).
    3. Scale features with e.g. z-scoring.
    4. Apply feature selection based on variance of feature among patients.
    5. Univariate statistical testing (e.g. t-test, Wilcoxon).
    6. Use Relief feature selection.
    7. Select features based on a fit with a LASSO model.
    8. Select features using PCA.
    9. Resampling
    10. If a SingleLabel classifier is used for a MultiLabel problem,
        a OneVsRestClassifier is employed around it.

    All of the steps are optional.

    Parameters
    ----------
    estimator: sklearn estimator, mandatory
            Unfitted estimator which will be fit.

    X: array, mandatory
            Array containing for each object (rows) the feature values
            (1st Column) and the associated feature label (2nd Column).

    y: list(?), mandatory
            List containing the labels of the objects.

    scoring: sklearn scorer, mandatory
            Function used as optimization criterion for the hyperparameter optimization.

    train: list, mandatory
            Indices of the objects to be used as training set.

    test: list, mandatory
            Indices of the objects to be used as testing set.

    parameters: dictionary, mandatory
            Contains the settings used for the above preprocessing functions
            and the fitting. TODO: Create a default object and show the
            fields.

    fit_params:dictionary, default None
            Parameters supplied to the estimator for fitting. See the SKlearn
            site for the parameters of the estimators.

    return_train_score: boolean, default True
            Save the training score to the final SearchCV object.

    return_n_test_samples: boolean, default True
            Save the number of times each sample was used in the test set
            to the final SearchCV object.

    return_times: boolean, default True
            Save the time spend for each fit to the final SearchCV object.

    return_parameters: boolean, default True
            Return the parameters used in the final fit to the final SearchCV
            object.

    return_estimator : bool, default=False
        Whether to return the fitted estimator.

    error_score: numeric or "raise" by default
            Value to assign to the score if an error occurs in estimator
            fitting. If set to "raise", the error is raised. If a numeric
            value is given, FitFailedWarning is raised. This parameter
            does not affect the refit step, which will always raise the error.

    verbose: boolean, default=True
            If True, print intermediate progress to command line. Warnings are
            always printed.

    return_all: boolean, default=True
            If False, only the ret object containing the performance will be
            returned. If True, the ret object plus all fitted objects will be
            returned.

    Returns
    ----------
    Depending on the return_all input parameter, either only ret or all objects
    below are returned.

    ret: list
        Contains optionally the train_scores and the test_scores,
        fit_time, score_time, parameters_est
        and parameters_all.

    GroupSel: WORC GroupSel Object
        Either None if the groupwise feature selection is not used, or
        the fitted object.

    VarSel: WORC VarSel Object
        Either None if the variance threshold feature selection is not used, or
        the fitted object.

    SelectModel: WORC SelectModel Object
        Either None if the feature selection based on a fitted model is not
        used, or the fitted object.

    feature_labels: list
        Labels of the features. Only one list is returned, not one per
        feature object, as we assume all samples have the same feature names.

    scaler: scaler object
        Either None if feature scaling is not used, or
        the fitted object.

    encoder: WORC Encoder Object
        Either None if feature OneHotEncoding is not used, or
        the fitted object.

    imputer: WORC Imputer Object
        Either None if feature imputation is not used, or
        the fitted object.

    pca: WORC PCA Object
        Either None if PCA based feature selection is not used, or
        the fitted object.

    StatisticalSel: WORC StatisticalSel Object
        Either None if the statistical test feature selection is not used, or
        the fitted object.

    ReliefSel: WORC ReliefSel Object
        Either None if the RELIEF feature selection is not used, or
        the fitted object.

    Sampler: WORC ObjectSampler Object
        Either None if no resampling is used, or an ObjectSampler object


    """
    # We copy the parameter object so we can alter it and keep the original
    if verbose:
        print("\n")
        print('#######################################')
        print('Starting fit and score of new workflow.')
    para_estimator = parameters.copy()
    estimator = cc.construct_classifier(para_estimator)

    # Check the scorer
    scorers, __ = check_multimetric_scoring(estimator, scoring=scoring)

    para_estimator = delete_cc_para(para_estimator)

    # Get random seed from parameters
    random_seed = para_estimator['random_seed']
    del para_estimator['random_seed']

    # X is a tuple: split in two arrays
    feature_values = np.asarray([x[0] for x in X])
    feature_labels = np.asarray([x[1] for x in X])

    # Split in train and testing
    X_train, y_train = _safe_split(estimator, feature_values, y, train)
    X_test, y_test = _safe_split(estimator, feature_values, y, test, train)
    train = np.arange(0, len(y_train))
    test = np.arange(len(y_train), len(y_train) + len(y_test))

    # Set some defaults in case a part fails and we return a dummy
    fit_time = np.inf
    score_time = np.inf
    Sampler = None
    encoder = None
    imputer = None
    scaler = None
    GroupSel = None
    SelectModel = None
    pca = None
    StatisticalSel = None
    VarSel = None
    ReliefSel = None
    if isinstance(scorers, dict):
        test_scores = {name: np.nan for name in scorers}
        if return_train_score:
            train_scores = test_scores.copy()
    else:
        test_scores = error_score
        if return_train_score:
            train_scores = error_score

    # Initiate dummy return object for when fit and scoring fail: sklearn defaults
    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(para_estimator)
    if return_estimator:
        ret.append(estimator)

    # Additional to sklearn defaults: return all parameters
    ret.append(parameters)

    # ------------------------------------------------------------------------
    # OneHotEncoder
    if 'OneHotEncoding' in para_estimator.keys():
        if para_estimator['OneHotEncoding'] == 'True':
            if verbose:
                print('Applying OneHotEncoding, will ignore unknowns.')
            feature_labels_tofit =\
                para_estimator['OneHotEncoding_feature_labels_tofit']
            encoder =\
                OneHotEncoderWrapper(handle_unknown='ignore',
                                     feature_labels_tofit=feature_labels_tofit,
                                     verbose=verbose)
            encoder.fit(X_train, feature_labels)

            if encoder.encoder is not None:
                # Encoder is fitted
                feature_labels = encoder.encoder.encoded_feature_labels
                X_train = encoder.transform(X_train)
                X_test = encoder.transform(X_test)

        del para_estimator['OneHotEncoding']
        del para_estimator['OneHotEncoding_feature_labels_tofit']

    # Delete the object if we do not need to return it
    if not return_all:
        del encoder

    # ------------------------------------------------------------------------
    # Feature imputation
    if 'Imputation' in para_estimator.keys():
        if para_estimator['Imputation'] == 'True':
            imp_type = para_estimator['ImputationMethod']
            if verbose:
                print(f'Imputing NaN with {imp_type}.')
            imp_nn = para_estimator['ImputationNeighbours']

            imputer = Imputer(missing_values=np.nan,
                              strategy=imp_type,
                              n_neighbors=imp_nn)
            imputer.fit(X_train)

            original_shape = X_train.shape
            X_train = imputer.transform(X_train)
            imputed_shape = X_train.shape
            X_test = imputer.transform(X_test)

            if original_shape != imputed_shape:
                removed_features = original_shape[1] - imputed_shape[1]
                raise ae.WORCValueError(
                    f'Several features ({removed_features}) were np.NaN for all objects. Hence, imputation was not possible. Either make sure this is correct and turn off imputation, or correct the feature.'
                )

        del para_estimator['Imputation']
        del para_estimator['ImputationMethod']
        del para_estimator['ImputationNeighbours']

    # Delete the object if we do not need to return it
    if not return_all:
        del imputer

    # Remove any NaN feature values if these are still left after imputation
    X_train = replacenan(X_train,
                         verbose=verbose,
                         feature_labels=feature_labels[0])
    X_test = replacenan(X_test,
                        verbose=verbose,
                        feature_labels=feature_labels[0])

    # ------------------------------------------------------------------------
    # Groupwise feature selection
    if 'SelectGroups' in para_estimator:
        if verbose:
            print("Selecting groups of features.")
        del para_estimator['SelectGroups']
        # TODO: more elegant way to solve this
        feature_groups = [
            'shape_features', 'histogram_features', 'orientation_features',
            'texture_gabor_features', 'texture_glcm_features',
            'texture_gldm_features', 'texture_glcmms_features',
            'texture_glrlm_features', 'texture_glszm_features',
            'texture_gldzm_features', 'texture_ngtdm_features',
            'texture_ngldm_features', 'texture_lbp_features', 'dicom_features',
            'semantic_features', 'coliage_features', 'vessel_features',
            'phase_features', 'fractal_features', 'location_features',
            'rgrd_features', 'original_features', 'wavelet_features',
            'log_features'
        ]

        # First take out the toolbox selection, which is a list
        toolboxes = para_estimator['toolbox']
        del para_estimator['toolbox']

        # Check per feature group if the parameter is present
        parameters_featsel = dict()
        for group in feature_groups:
            if group not in para_estimator:
                # Default: do use the group, except for texture features
                if group == 'texture_features':
                    value = 'False'
                else:
                    value = 'True'
            else:
                value = para_estimator[group]
                del para_estimator[group]

            parameters_featsel[group] = value

        # Fit groupwise feature selection object
        GroupSel = SelectGroups(parameters=parameters_featsel,
                                toolboxes=toolboxes)
        GroupSel.fit(feature_labels[0])
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        # Transform all objects accordingly
        X_train = GroupSel.transform(X_train)
        X_test = GroupSel.transform(X_test)
        if verbose:
            print("\t New Length: " + str(len(X_train[0])))
        feature_labels = GroupSel.transform(feature_labels)

    # Delete the object if we do not need to return it
    if not return_all:
        del GroupSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably all feature groups were set to False. Parameters:'
            )
            print(parameters)

        # Delete the non-used fields
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ------------------------------------------------------------------------
    # Feature scaling
    if verbose and para_estimator['FeatureScaling'] != 'None':
        print('Fitting scaler and transforming features, method '
              f'{para_estimator["FeatureScaling"]}.')

    scaling_method = para_estimator['FeatureScaling']
    if scaling_method == 'None':
        scaler = None
    else:
        skip_features = para_estimator['FeatureScaling_skip_features']
        n_skip_feat = len([
            i for i in feature_labels[0] if any(e in i for e in skip_features)
        ])
        if n_skip_feat == len(X_train[0]):
            # Don't need to scale any features
            if verbose:
                print(
                    '[WORC Warning] Skipping scaling, only skip features selected.'
                )
            scaler = None
        else:
            scaler = WORCScaler(method=scaling_method,
                                skip_features=skip_features)
            scaler.fit(X_train, feature_labels[0])

    if scaler is not None:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    del para_estimator['FeatureScaling']

    # Delete the object if we do not need to return it
    if not return_all:
        del scaler

    # --------------------------------------------------------------------
    # Feature selection based on variance
    if para_estimator['Featsel_Variance'] == 'True':
        if verbose:
            print("Selecting features based on variance.")
            print("\t Original Length: " + str(len(X_train[0])))
        try:
            X_train, feature_labels, VarSel =\
                selfeat_variance(X_train, feature_labels)
            X_test = VarSel.transform(X_test)
        except ValueError:
            if verbose:
                print(
                    '[WARNING]: No features meet the selected Variance threshold! Skipping selection.'
                )
        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    del para_estimator['Featsel_Variance']

    # Delete the object if we do not need to return it
    if not return_all:
        del VarSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably your features have too little variance. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # --------------------------------------------------------------------
    # Relief feature selection, possibly multiclass.
    # Needs to be done after scaling!
    if 'ReliefUse' in para_estimator.keys():
        if para_estimator['ReliefUse'] == 'True':
            if verbose:
                print("Selecting features using relief.")

            # Get parameters from para_estimator
            n_neighbours = para_estimator['ReliefNN']
            sample_size = para_estimator['ReliefSampleSize']
            distance_p = para_estimator['ReliefDistanceP']
            numf = para_estimator['ReliefNumFeatures']

            # Fit RELIEF object
            ReliefSel = SelectMulticlassRelief(n_neighbours=n_neighbours,
                                               sample_size=sample_size,
                                               distance_p=distance_p,
                                               numf=numf,
                                               random_state=random_seed)
            ReliefSel.fit(X_train, y)
            if verbose:
                print("\t Original Length: " + str(len(X_train[0])))

            # Transform all objects accordingly
            X_train = ReliefSel.transform(X_train)
            X_test = ReliefSel.transform(X_test)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))
            feature_labels = ReliefSel.transform(feature_labels)

        del para_estimator['ReliefUse']
        del para_estimator['ReliefNN']
        del para_estimator['ReliefSampleSize']
        del para_estimator['ReliefDistanceP']
        del para_estimator['ReliefNumFeatures']

    # Delete the object if we do not need to return it
    if not return_all:
        del ReliefSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably RELIEF could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ------------------------------------------------------------------------
    # Perform feature selection using a model
    if para_estimator.get('SelectFromModel') == 'True':
        model = para_estimator['SelectFromModel_estimator']
        if verbose:
            print(f"Selecting features using model {model}.")

        if model == 'Lasso':
            # Use lasso model for feature selection
            alpha = para_estimator['SelectFromModel_lasso_alpha']
            selectestimator = Lasso(alpha=alpha)

        elif model == 'LR':
            # Use logistic regression model for feature selection
            selectestimator = LogisticRegression()

        elif model == 'RF':
            # Use random forest model for feature selection
            n_estimators = para_estimator['SelectFromModel_n_trees']
            selectestimator = RandomForestClassifier(n_estimators=n_estimators)
        else:
            raise ae.WORCKeyError(
                f'Model {model} is not known for SelectFromModel. Use Lasso, LR, or RF.'
            )

        # Prefit model
        selectestimator.fit(X_train, y_train)

        # Use fit to select optimal features
        SelectModel = SelectFromModel(selectestimator, prefit=True)
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        X_train_temp = SelectModel.transform(X_train)
        if len(X_train_temp[0]) == 0:
            if verbose:
                print(
                    '[WORC WARNING]: No features are selected! Probably your data is too noisy or the selection too strict. Skipping SelectFromModel.'
                )
            SelectModel = None
            parameters['SelectFromModel'] = 'False'
        else:
            X_train = SelectModel.transform(X_train)
            X_test = SelectModel.transform(X_test)
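            # Apply the same column mask to the label list so it keeps
            # matching the selected feature columns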
            feature_labels = SelectModel.transform(feature_labels)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))

    if 'SelectFromModel' in para_estimator.keys():
        del para_estimator['SelectFromModel']
        del para_estimator['SelectFromModel_lasso_alpha']
        del para_estimator['SelectFromModel_estimator']
        del para_estimator['SelectFromModel_n_trees']

    # Delete the object if we do not need to return it
    if not return_all:
        del SelectModel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably SelectFromModel could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ----------------------------------------------------------------
    # PCA dimensionality reduction
    # Principal Component Analysis
    if para_estimator.get('UsePCA') == 'True':
        if verbose:
            print('Fitting PCA')
            print("\t Original Length: " + str(len(X_train[0])))
        if para_estimator['PCAType'] == '95variance':
            # Select the first components that together explain 95 percent of the variance
            pca = PCA(n_components=None, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )

                if return_all:
                    return ret, GroupSel, VarSel, SelectModel, feature_labels[
                        0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                else:
                    return ret

            evariance = pca.explained_variance_ratio_
            num = 0
            total = 0
            while total < 0.95:
                total += evariance[num]
                num += 1
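            # NOTE: equivalent to num = int(np.argmax(np.cumsum(evariance) >= 0.95)) + 1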

            # Refit a PCA with the determined number of components
            pca = PCA(n_components=num, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )

                if return_all:
                    return ret, GroupSel, VarSel, SelectModel, feature_labels[
                        0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                else:
                    return ret

            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        else:
            # Assume a fixed number of components: cannot be larger than
            # n_samples
            n_components = min(len(X_train), int(para_estimator['PCAType']))

            if n_components >= len(X_train[0]):
                if verbose:
                    print(
                        f"[WORC WARNING] PCA n_components ({n_components})> n_features ({len(X_train[0])}): skipping PCA."
                    )
            else:
                pca = PCA(n_components=n_components, random_state=random_seed)
                pca.fit(X_train)
                X_train = pca.transform(X_train)
                X_test = pca.transform(X_test)

        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    # Delete the object if we do not need to return it
    if not return_all:
        del pca

    if 'UsePCA' in para_estimator.keys():
        del para_estimator['UsePCA']
        del para_estimator['PCAType']

    # --------------------------------------------------------------------
    # Feature selection based on a statistical test
    if 'StatisticalTestUse' in para_estimator.keys():
        if para_estimator['StatisticalTestUse'] == 'True':
            metric = para_estimator['StatisticalTestMetric']
            threshold = para_estimator['StatisticalTestThreshold']
            if verbose:
                print(
                    f"Selecting features based on statistical test. Method {metric}, threshold {round(threshold, 5)}."
                )
                print("\t Original Length: " + str(len(X_train[0])))

            StatisticalSel = StatisticalTestThreshold(metric=metric,
                                                      threshold=threshold)

            StatisticalSel.fit(X_train, y)
            X_train_temp = StatisticalSel.transform(X_train)
            if len(X_train_temp[0]) == 0:
                if verbose:
                    print(
                        '[WORC WARNING]: No features are selected! Probably your statistical test feature selection was too strict. Skipping thresholding.'
                    )
                StatisticalSel = None
                parameters['StatisticalTestUse'] = 'False'
            else:
                X_train = StatisticalSel.transform(X_train)
                X_test = StatisticalSel.transform(X_test)
                feature_labels = StatisticalSel.transform(feature_labels)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))

        del para_estimator['StatisticalTestUse']
        del para_estimator['StatisticalTestMetric']
        del para_estimator['StatisticalTestThreshold']

    # Delete the object if we do not need to return it
    if not return_all:
        del StatisticalSel

    # ------------------------------------------------------------------------
    # Use object resampling
    if 'Resampling_Use' in para_estimator.keys():
        if para_estimator['Resampling_Use'] == 'True':

            # Determine our starting balance
            pos_initial = int(np.sum(y_train))
            neg_initial = int(len(y_train) - pos_initial)
            len_in = len(y_train)

            # Fit ObjectSampler and transform dataset
            # NOTE: need to save random state for this one as well!
            Sampler =\
                ObjectSampler(method=para_estimator['Resampling_Method'],
                              sampling_strategy=para_estimator['Resampling_sampling_strategy'],
                              n_jobs=para_estimator['Resampling_n_cores'],
                              n_neighbors=para_estimator['Resampling_n_neighbors'],
                              k_neighbors=para_estimator['Resampling_k_neighbors'],
                              threshold_cleaning=para_estimator['Resampling_threshold_cleaning'],
                              verbose=verbose)

            try:
                Sampler.fit(X_train, y_train)
                X_train_temp, y_train_temp = Sampler.transform(
                    X_train, y_train)

            except ae.WORCValueError as e:
                message = str(e)
                if verbose:
                    print('[WORC WARNING] Skipping resampling: ' + message)
                Sampler = None
                parameters['Resampling_Use'] = 'False'

            except RuntimeError as e:
                if 'ADASYN is not suited for this specific dataset. Use SMOTE instead.' in str(
                        e):
                    # Seldomly occurs, therefore return performance dummy
                    if verbose:
                        print(
                            f'[WARNING]: {e}. Returning dummies. Parameters: ')
                        print(parameters)
                    para_estimator = delete_nonestimator_parameters(
                        para_estimator)

                    if return_all:
                        return ret, GroupSel, VarSel, SelectModel, feature_labels[
                            0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                    else:
                        return ret
                else:
                    raise e
            else:
                pos = int(np.sum(y_train_temp))
                neg = int(len(y_train_temp) - pos)
                if pos < 10 or neg < 10:
                    if verbose:
                        print(
                            f'[WORC WARNING] Skipping resampling: too few objects returned in one or both classes (pos: {pos}, neg: {neg}).'
                        )
                    Sampler = None
                    parameters['Resampling_Use'] = 'False'
                else:
                    X_train = X_train_temp
                    y_train = y_train_temp

                    # Notify the user what the resampling did
                    pos = int(np.sum(y_train))
                    neg = int(len(y_train) - pos)
                    if verbose:
                        message = f"Resampling from {len_in} ({pos_initial} pos," +\
                                  f" {neg_initial} neg) to {len(y_train)} ({pos} pos, {neg} neg) patients."
                        print(message)

                    # Also reset train and test indices
                    train = np.arange(0, len(y_train))
                    test = np.arange(len(y_train), len(y_train) + len(y_test))
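                    # (these indices later select rows from the re-concatenated
                    #  feature_values array passed to _fit_and_score below)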

        del para_estimator['Resampling_Use']
        del para_estimator['Resampling_Method']
        del para_estimator['Resampling_sampling_strategy']
        del para_estimator['Resampling_n_neighbors']
        del para_estimator['Resampling_k_neighbors']
        del para_estimator['Resampling_threshold_cleaning']
        del para_estimator['Resampling_n_cores']

    # Delete the object if we do not need to return it
    if not return_all:
        del Sampler

    # ----------------------------------------------------------------
    # Fitting and scoring
    # This entry is only present when using fastr
    if 'Number' in para_estimator.keys():
        del para_estimator['Number']

    # For certainty, we delete all parameters again
    para_estimator = delete_nonestimator_parameters(para_estimator)

    # NOTE: This should move to the construct classifier function,
    # but it is more convenient here due to the hyperparameter search
    if type(y) is list:
        labellength = 1
    else:
        try:
            labellength = y.shape[1]
        except IndexError:
            labellength = 1

    if labellength > 1 and type(estimator) not in [
            RankedSVM, RandomForestClassifier
    ]:
        # Multiclass, hence employ a multiclass classifier for e.g. SVM, LR
        estimator.set_params(**para_estimator)
        estimator = OneVsRestClassifier(estimator)

    if verbose:
        print(f"Fitting ML method: {parameters['classifiers']}.")

    # Recombine feature values and label for train and test set
    feature_values = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    para_estimator = None

    try:
        ret = _fit_and_score(estimator,
                             feature_values,
                             y,
                             scorers,
                             train,
                             test,
                             verbose,
                             para_estimator,
                             fit_params,
                             return_train_score=return_train_score,
                             return_parameters=return_parameters,
                             return_n_test_samples=return_n_test_samples,
                             return_times=return_times,
                             return_estimator=return_estimator,
                             error_score=error_score)
    except (ValueError, LinAlgError) as e:
        if type(estimator) == LDA:
            if verbose:
                print(
                    f'[WARNING]: skipping this setting due to LDA Error: {e}.')

            if return_all:
                return ret, GroupSel, VarSel, SelectModel, feature_labels[
                    0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
            else:
                return ret
        else:
            raise e

    # Add original parameters to return object
    ret.append(parameters)

    if return_all:
        return ret, GroupSel, VarSel, SelectModel, feature_labels[
            0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
    else:
        return ret
Пример #41
0
# --------------
#Code Starts here
#Import Libraries
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

#Instantiate the LinearSVC class and fit it on X_train and y_train as follows (save it as lsvc)
#C:0.01, penalty = 'l1', dual = False, random_state =42
lsvc = LinearSVC(C=0.01, penalty='l1', dual=False, random_state=42)
lsvc.fit(X_train, y_train)

#Instantiate the SelectFromModel class on lsvc with prefit=True and save the result as model_2.
model_2 = SelectFromModel(lsvc, prefit=True)

#Create new_train_features and new_test_features using model_2 to transform on X_train and X_test respectively.
new_train_features = model_2.transform(X_train)
new_test_features = model_2.transform(X_test)

#Instantiate the SVC class and call it classifier_2.
classifier_2 = SVC()

#Fit the SVC classifier on new_train_features and y_train and store it in clf_2.
clf_2 = classifier_2.fit(new_train_features, y_train)

#Use clf_2 to predict on new_test_features and save it as y_pred_new.
y_pred_new = clf_2.predict(new_test_features)

#Store the accuracy score of the model in the variable named as model2_score.
model2_score = accuracy_score(y_test, y_pred_new)
precision, recall, f_score, support = error_metric(y_test, y_pred_new)
Пример #42
0
print(
    "==========================================================================================="
)
print("Shape of Dataset:", maldata.shape)
print(
    "==========================================================================================="
)
df = maldata
X = df.iloc[:, 0:88].values
y = df.iloc[:, 88].values

# Random Forest importance
clf = RandomForestClassifier(random_state=0)
model = clf.fit(X, y)
select = SelectFromModel(model, prefit=True)
X_new = select.transform(X)
print("*Feature Selection*")
print("Shape before using feature selection:", X.shape)
print("Shape after feature selection:", X_new.shape)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# print(importances)
# List of Feature
print("Feature ranking:")
for f in range(X_new.shape[1]):
    print("%d. %s (%f)" %
          (f + 1, df.columns[indices[f]], importances[indices[f]]))
print(
    "==========================================================================================="
)
# Visualization feature
Пример #43
0
def RF():
    global New_data, data_test
    global x_train, x_test, y_train, y_test
    global new_x_train, new_x_test, new_data
    text.delete('1.0', END)
    text.insert(END, "\t\t\t\tRandom Forest Classifier\n\n")
    clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    clf = clf.fit(x_train, y_train)
    features = pd.DataFrame()
    features['Feature'] = x_train.columns
    features['Importance'] = clf.feature_importances_
    features.sort_values(by=['Importance'], ascending=False, inplace=True)
    features.set_index('Feature', inplace=True)
    text.insert(
        END,
        "Selected Important Features Automatically by using *feature_importances_* & *SelectFromModel*\n\n"
    )
    text.insert(END, features[:5])
    selector = SelectFromModel(clf, prefit=True)
    train_reduced = selector.transform(x_train)
    new_x_train = pd.DataFrame(train_reduced,
                               columns=[
                                   'Debt_Income_Ratio', 'Credit_History_Bad',
                                   'Total_Income', 'LoanAmount',
                                   'Credit_History_Good'
                               ])
    test_reduced = selector.transform(x_test)
    new_x_test = pd.DataFrame(test_reduced,
                              columns=[
                                  'Debt_Income_Ratio', 'Credit_History_Bad',
                                  'Total_Income', 'LoanAmount',
                                  'Credit_History_Good'
                              ])
    new_reduced = selector.transform(New_data)
    new_data = pd.DataFrame(new_reduced,
                            columns=[
                                'Debt_Income_Ratio', 'Credit_History_Bad',
                                'Total_Income', 'LoanAmount',
                                'Credit_History_Good'
                            ])
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6
    }

    rf = RandomForestClassifier(**parameters)
    rf.fit(new_x_train, y_train)
    pred = rf.predict(new_x_test)
    acc = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    CR = classification_report(y_test, pred)
    output = rf.predict(new_data).astype(int)
    df_output = pd.DataFrame()
    df_output['Loan_ID'] = data_test['Loan_ID']
    df_output['Loan_Predicted_Status'] = np.vectorize(
        lambda s: 'Y' if s == 1 else 'N')(output)
    df_output[['Loan_ID', 'Loan_Predicted_Status'
               ]].to_csv('*****@*****.**',
                         index=False)
    text.insert(END, "\n\nConfusion Matrix:\n" + str(cm) + "\n\n")
    text.insert(
        END, "Accuracy Score:\n" + str(np.round(acc * 100, 4)) + ' %' + "\n\n")
    text.insert(END, "Predicted Values on Test Data:\n" + str(pred) + "\n\n")
    text.insert(END, "Classification Report:\n" + str(CR))
    text.insert(END, "\n\nFinal Predicted values on New Data:\n\n")
    text.insert(END, df_output)
    text.insert(END,
                "\n\nCheck the Project Directory for Submission CSV file\n\n")
    text.insert(END, "@@@------------------Thank You--------------------@@@")
Пример #44
0
    feats[feature] = importance
importances = pd.DataFrame.from_dict(
    feats, orient='index').rename(columns={0: 'Gini-importance'})
Feature_Importance = importances.sort_values(by='Gini-importance')
#print(Feature_Importance)

feat_labels = df.loc[:, 'BTC':].columns
#for feature in zip(feat_labels, regressor.feature_importances_):
#print(feature)

sfm = SelectFromModel(regressor, threshold=0.005)
sfm.fit(X_train, y_train)
#for feature_list_index in sfm.get_support(indices=True):
#print(feat_labels[feature_list_index])

X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

rfr_important = RandomForestRegressor(n_estimators=100,
                                      random_state=0,
                                      n_jobs=-1)
rfr_important.fit(X_important_train, y_train)

y_important_pred = rfr_important.predict(X_important_test)
#print("Explained Variance 2:", explained_variance_score(y_test, y_important_pred))

#cross validation
#cvscores_10 = cross_val_score(regressor, X, y, cv = 10)
#print("CV Score",np.mean(cvscores_10))

#svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
# =============================================================================

# 2) Apply the expanded dataset to SelectFromModel
m_rf = rf_c()
m_select1 = SelectFromModel(m_rf,               # model whose feature importances are evaluated
                            threshold='median') # selection threshold
 
m_select1.fit(df_iris.data, df_iris.target)
m_select1.get_support()

m_select1.fit(df_iris_new, df_iris.target)
m_select1.get_support()

# 3) Extract the dataset of selected features
df_iris_new[:, m_select1.get_support()]     # dataset after selecting the important features
m_select1.transform(df_iris_new)

# 4) Check the feature importances
m_select1.estimator_.feature_importances_

# 2.2.2 Feature selection method 2: univariate statistics
# - selects features based on the correlation between each single feature and the target
# - may judge differently than when the features are evaluated together in a model
# - very fast, since no model needs to be trained
from sklearn.feature_selection import SelectPercentile
  
# 1) Create and fit the feature selection model
m_select2 = SelectPercentile(percentile=30)
m_select2.fit(df_iris_new, df_iris.target)

# 2) Check the resulting dataset of selected features
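# (a minimal sketch of the truncated step, mirroring step 3 of the
#  SelectFromModel example above)
m_select2.transform(df_iris_new)            # dataset after univariate selection
df_iris_new[:, m_select2.get_support()]     # same selection via the boolean mask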
Пример #46
0
def main():
    data = pd.read_csv('data.csv', sep='|')
    X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
    y = data['legitimate'].values
    print('Researching important features based on %i total features\n' %
          X.shape[1])
    # import pdb; pdb.set_trace()

    # Feature selection using Trees Classifier
    fsel = ske.ExtraTreesClassifier().fit(X, y)
    # fsel = ske.GradientBoostingClassifier(n_estimators=100).fit(X, y)

    model = SelectFromModel(fsel, prefit=True)
    X_new = model.transform(X)
    nb_features = X_new.shape[1]
    # nb_features = X.shape[1]

    # sklearn has a test_train_split (who doesn't?)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X_new, y, test_size=0.9)

    features = []

    print('%i features identified as important:' % nb_features)

    indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
    for f in range(nb_features):
        print("%d. feature %s (%f)" % (f + 1, data.columns[2 + indices[f]],
                                       fsel.feature_importances_[indices[f]]))

    # XXX : take care of the feature order
    for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
        features.append(data.columns[2 + f])

    #Algorithm comparison
    algorithms = {
        "Toms Classifier rand":
        TomsClassifier(),
        "Toms Classifier const":
        TomsClassifier(random=False),
        "DecisionTree":
        sklearn.tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest":
        ske.RandomForestClassifier(n_estimators=100),
        "GradientBoosting":
        ske.GradientBoostingClassifier(n_estimators=100),
        "AdaBoost":
        ske.AdaBoostClassifier(n_estimators=100),
        "Logistic Regression":
        LogisticRegression(random_state=0,
                           solver='lbfgs',
                           multi_class='multinomial'),
        "Gaussian Naive Bayes":
        GaussianNB(),
        "SVM":
        SVC(),
        "Perceptron":
        MLPClassifier(solver='lbfgs',
                      alpha=1e-5,
                      hidden_layer_sizes=(5, 2),
                      random_state=1),
        "Perceptron sgd":
        MLPClassifier(solver='sgd',
                      alpha=1e-2,
                      hidden_layer_sizes=(5, 2),
                      random_state=1),
        "Perceptron adam":
        MLPClassifier(solver='adam',
                      alpha=1e-5,
                      hidden_layer_sizes=(10, 10, 10, 10)),
    }

    results = {}
    print("\nNow testing algorithms")
    for algo in algorithms:
        clf = algorithms[algo]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('{:>25} : {}'.format(algo, score * 100))
        results[algo] = score

    winner = max(results, key=results.get)
    print('\nWinner algorithm is %s with a %f %% success' %
          (winner, results[winner] * 100))

    # # Save the algorithm and the feature list for later predictions
    # print('Saving algorithm and feature list in classifier directory...')
    # joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
    # open('classifier/features.pkl', 'wb').write(pickle.dumps(features))
    # print('Saved')

    # Identify false and true positive rates
    clf = algorithms[winner]
    res = clf.predict(X_test)
    mt = confusion_matrix(y_test, res)
    print("False positive rate : %f %%" %
          ((mt[0][1] / float(sum(mt[0]))) * 100))
    print('False negative rate : %f %%' %
          ((mt[1][0] / float(sum(mt[1])) * 100)))
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix


MalwareDataset = pd.read_csv('MalwareData.csv', sep='|')
Legit = MalwareDataset[0:41323].drop(['legitimate'], axis=1)
Malware = MalwareDataset[41323::].drop(['legitimate'], axis=1)

#print('[+] Number of important features is %i \n' % Legit.shape[1])

Data = MalwareDataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
Target = MalwareDataset['legitimate'].values
FeatSelect =  ExtraTreesClassifier().fit(Data, Target)
Model = SelectFromModel(FeatSelect, prefit=True)
Data_new = Model.transform(Data)



Legit_Train, Legit_Test, Malware_Train, Malware_Test = train_test_split(Data_new, Target, test_size=0.2)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
clf.fit(Legit_Train, Malware_Train)
score = clf.score(Legit_Test, Malware_Test)

print("[+] model accuracy score of Random Forest Algorithm is: {}%".format(score*100))

Result = clf.predict(Legit_Test)
CM = confusion_matrix(Malware_Test, Result)
print("[+] False positive rate : %f %%" % ((CM[0][1] / float(sum(CM[0])))*100))
print('[+] False negative rate : %f %%' % ( (CM[1][0] / float(sum(CM[1]))*100)))
# with open('feature_model.pickle', 'wb') as fp:
#     pickle.dump(clf, fp)

with open('feature_model.pickle', 'rb') as fp:
    clf = pickle.load(fp)

feature = pd.DataFrame()
feature['feature'] = train_set.columns
feature['importance'] = clf.feature_importances_
feature.sort_values(by=['importance'], ascending=True, inplace=True)
feature.set_index('feature', inplace=True)
# feature.plot(kind='barh', figsize=(20, 20))
# plt.savefig('figure1.png')

model = SelectFromModel(clf, prefit=True, threshold=0.1)
train_reduced = model.transform(train_set)
test_reduced = model.transform(test_set)
print('Dimension after Feature Selection: ', train_reduced.shape[1])

header = feature.index.tolist()[::-1][:train_reduced.shape[1]]
header.append('label')

train_reduced = np.concatenate(
    [train_reduced, np.array(train_labels).reshape((-1, 1))], axis=1)
test_reduced = np.concatenate(
    [test_reduced, np.array(test_labels).reshape((-1, 1))], axis=1)

pd.DataFrame(train_reduced).to_csv('dataset/kddcup.train.data.reduced.csv',
                                   index=False,
                                   header=header)
pd.DataFrame(test_reduced).to_csv('dataset/kddcup.test.data.reduced.csv',
Пример #49
0
# Load the data
cancer = load_breast_cancer()

# Get a deterministic random state
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))

# Add the noise features
X_w_noise = np.hstack([cancer.data, noise])
X = X_w_noise
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Train the selection model
select = SelectFromModel(RandomForestClassifier(n_estimators=100,
                                                random_state=42),
                         threshold='median')
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
# print(X_train.shape)
# print(X_train_l1.shape)
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print(score)

# Visualize the selection mask
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel('sample index')
plt.show()
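# (added sketch) count how many of the 80 columns (30 original + 50 noise)
# survive the 'median' threshold -- by definition about half of them
print('selected %d of %d features' % (mask.sum(), mask.size))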
Пример #50
0
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    X, y, test_size=validation_size, random_state=seed)
X_train.shape

# ### L1-based feature selection
# Our dataset contains a lot of features (216, to be specific).
#
# Some features are collinear, so we can and must transform our data. To do that, I chose an L1-based feature selection method.
#
# It is important to note that a smaller C results in fewer features being selected.

# In[82]:

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
modellsvc = SelectFromModel(lsvc, prefit=True)
X_train_new = modellsvc.transform(X_train)
X_train_new.shape
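
# (added sketch, not part of the original notebook) Illustrating the comment
# above: sweep C and count how many features the L1 penalty keeps. Assumes
# the X_train and y_train defined earlier in this example.
for C in [0.001, 0.01, 0.1, 1.0]:
    svc_c = LinearSVC(C=C, penalty="l1", dual=False, max_iter=5000).fit(X_train, y_train)
    n_kept = SelectFromModel(svc_c, prefit=True).transform(X_train).shape[1]
    print("C=%.3f -> %d features kept" % (C, n_kept))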

# ### Select a classifier
# We will evaluate six classifiers to choose the best model for classifying our validation data. The criterion for choosing the best is the accuracy of the model on the training data.
#
# We use cross-validation (k-fold with k = 10) to evaluate the models.

# In[83]:

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
Пример #51
0
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
array_FS = titanic_train[predictors + ["Survived"]].values
X_FS = array_FS[:, 0:21]
Y_FS = array_FS[:, 21]
model = ExtraTreesClassifier()
model.fit(X_FS, Y_FS)
features = pandas.DataFrame()
features['feature'] = predictors
features['importance'] = model.feature_importances_
print(predictors)
print(model.feature_importances_)
features.sort_values(by='importance', ascending=False)

model_tr = SelectFromModel(model, prefit=True)
train_new = model_tr.transform(titanic_train[predictors])
train_new.shape
test_new = model_tr.transform(titanic_test[predictors])
test_new.shape

forest = RandomForestClassifier()
parameter_grid = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': [200, 210, 220, 230, 240, 250, 260, 270, 280, 290],
    'criterion': ['gini', 'entropy']
}
cross_validation = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)
grid_search.fit(train_new, Y_FS)
Пример #52
0
mse = mean_squared_error(y_test, y_pred)  #, multioutput='raw_values')
r2 = r2_score(y_test, y_pred)  #, multioutput='raw_values')

ONE_MEGABYTE = 1048576

print("Prediction score (MAE): %.2f" % (mae / ONE_MEGABYTE))
print("Prediction score (MSE): %.2f" % (mse / ONE_MEGABYTE))
print("Prediction score (R2): %.2f" % (r2))

# In[21]:

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

model = SelectFromModel(clf, prefit=True)
tuxdata_reduced = model.transform(tuxdata.drop(columns=size_methods))
tuxdata_reduced.shape, tuxdata.shape

# In[22]:

#lass = SelectFromModel(LassoCV(tol = 0.001))
#lass.fit(X_train, y_train)
#tuxdata_reduced_lass = lass.transform(tuxdata.drop(columns=size_methods))
#tuxdata_reduced_lass.shape, tuxdata.shape

# In[23]:

ft_vals = ['y', 'n']
tri_state_values = ['y', 'n', 'm']
all(x in tri_state_values for x in ft_vals)
Пример #53
0
clf_mri.fit(scaled_X, labels_train)

# Create a selector object that will use the random forest classifier to identify
# features that have an importance greater than the mean importance
sfm_mri = SelectFromModel(clf_mri, threshold="mean")
# Train the selector
sfm_mri.fit(scaled_X, labels_train)

# Collect the feature with importance
anatomy = []
for feature_list_index in sfm_mri.get_support(indices=True):
    anatomy.append(data_train.columns[feature_list_index])

# Transform the data to create a new dataset containing only the most important features
# to both the training X and test X data.
X_important_train = sfm_mri.transform(scaled_X)
X_important_test = sfm_mri.transform(scaled_test)

# Create a new random forest classifier for the most important features
clf_mri_features = RandomForestClassifier(n_estimators=10000,
                                          random_state=1988,
                                          oob_score=True,
                                          n_jobs=-1)
# Train the new classifier on the new dataset containing the most important features
clf_mri_features.fit(X_important_train, labels_train)

from sklearn.metrics import accuracy_score
# Apply the limited-feature classifier to the test data
y_important_pred = clf_mri_features.predict(X_important_test)

# View the Accuracy of the Limited Feature Model
print("accuracy score: {0:.2%} (+/- {1:.2%})".format(
    np.mean(results['test_accuracy']),
    np.std(results['test_accuracy']) * 2))
print("precision score: {0:.2%} (+/- {1:.2%})".format(
    np.mean(results['test_precision']),
    np.std(results['test_precision']) * 2))
print("recall score: {0:.2%} (+/- {1:.2%})".format(
    np.mean(results['test_recall']),
    np.std(results['test_recall']) * 2))
print("f1_score: {0:.2%} (+/- {1:.2%})".format(
    np.mean(results['test_f1_score']),
    np.std(results['test_f1_score']) * 2))

# plot feature importance
plot_importance(model)
pyplot.show()
"""# TRAIN SELECTED FEATURES"""

thresholds = np.sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" %
          (thresh, select_X_train.shape[1], accuracy * 100.0))
Пример #55
0
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important features based on %i total features\n' % X.shape[1])

# Feature selection using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))

# XXX : take care of the feature order
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])
Пример #56
0
def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectfromModel
    clf = XGBClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1],
                                       reverse=True)
    print('xgb_feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0],
              feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/xgb_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(
            str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' +
            str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/xgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Find the names of the unused fields
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # Generate a chromosome string (e.g. 01011100)
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
Пример #57
0
# confusion matrix
print('confusion_matrix')
print(pd.DataFrame(confusion_matrix(y_test, pred)))

model = model.best_estimator_


n     = 0
b_acc = acc

thresholds = np.sort(model.feature_importances_)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    

    select_x_train = selection.transform(x_train)
    selection_model = XGBClassifier()
    selection_model.fit(select_x_train,y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)

    acc = selection_model.score(select_x_test,y_test)
    acc_score = accuracy_score(y_test,y_predict)
    if acc > b_acc:
        n = select_x_train.shape[1]
        b_acc = acc
        L_selection = selection
        print("Thresh=%.3f, n=%d, acc: %.15f%%, acc_score: %.15f%%"%(thresh,select_x_train.shape[1],acc,acc_score))
    
Пример #58
0
kmeans = KMeans(n_clusters=2)
kmeans.fit(x_train)
labels = kmeans.predict(x_train)

a = np.array(y_train)
df = pd.DataFrame({'Labels': labels, 'Actual': a.flatten()})
ct = pd.crosstab(df['Labels'], df['Actual'])
print(ct)

from sklearn.feature_selection import SelectFromModel

select = SelectFromModel(RandomForestClassifier(max_depth=9),
                         threshold='median')

select.fit(x_train, y_train)
x_train_l1 = select.transform(x_train)
print(x_train.shape)
print(x_train_l1.shape)

mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.yticks([0])

# 59 of the 117 total features were selected (threshold='median' keeps roughly half).

dtc3 = DecisionTreeClassifier(max_depth=7)
dtc3.fit(x_train_l1, y_train)
dtcscores = cross_val_score(dtc3, x_train, y_train, cv=5)
print("DecisionTreeClassifier Cross Validation Attempt 3: " + str(dtcscores))

# Print the final results
Пример #59
0
#Read in data
bid = pd.read_csv("full_features.csv", index_col=0)
bid = bid.drop(["address", "payment_account"], axis=1)
bid = bid[(bid.outcome==0) | (bid.numbids > 10)]
test = pd.read_csv("full_features_test.csv", index_col=0)
test = test.drop("address", axis=1)
X = bid.iloc[:,2:]
Y = bid.iloc[:,1]
testX = test.iloc[:,2:]
testY = test.iloc[:,1]

#SVM
lsvc = LinearSVC(C=1, penalty="l1", dual=False).fit(X, Y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
testX = model.transform(testX)


#Random Forest
print('Random Forest')
algo_rf = RandomForestClassifier(280)
algo_rf.fit(X_new,Y)
hyp = algo_rf.predict(X_new)
kfold = KFold(n_splits=20, shuffle=True, random_state=200)
score = cross_val_score(algo_rf, X_new, Y, cv=kfold, scoring="roc_auc")
preds = algo_rf.predict_proba(X_new)
print "On Train: ", metrics.roc_auc_score(Y, preds[:,1])
print "Cross-Val: ", np.mean(score)

#Get test prediction and write to csv for Kaggle evaluation
Пример #60
0
        print("%s)  %f" % (feature_index[i], 
                            feature_influence[feature_index[i]]))
    newlist.append(feature_influence[feature_index[i]])

np.cumsum(newlist)
print(count)



yf_pred = forest1.predict(Xd_test)
print('Accuracy: {:.3f}'.format(accuracy_score(yd_test, yf_pred)))
print('Accuracy: {:.2f}%'.format(accuracy_score(yd_test, yf_pred) * 100))
# Accuracy: 96.48%



# using sklearn SelectFromModel
## to select important features
feature_select = SelectFromModel(forest1, threshold=0.0100, prefit=True)
features_selected = feature_select.transform(Xd_train)
print('Number meeting threshold criterion:', features_selected.shape[1])

print("{}    {}".format('Feature Number', 'Percentage Influence'))
for t in range(features_selected.shape[1]):
        print("{0:>12} {1:>11.02f}%".format(feature_index[t], 
                            feature_influence[feature_index[t]]))


for t in range(features_selected.shape[1]):
        print("{0:>3}) {1:^10.04f}".format(feature_index[t], 
                            feature_influence[feature_index[t]]))