def final_feats(df_data):
    x_train = df_data.iloc[:, 1:370]  # drop the "ID" and "TARGET" columns

    # Get the first 2 principal components of the normalized features
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(normalize(x_train, axis=0))

    # Remove columns with no variance (in our case the all-zero columns)
    x_train, del_constants = remove_feat_constants(x_train)
    # Remove columns that are identical to each other, retaining only one of them
    x_train, del_identicals = remove_feat_identicals(x_train)
    y_train = df_data["TARGET"]

    # L1-based feature selection on x_train (308 remaining columns)
    lsvc = svm.LinearSVC(C=0.01, penalty="l1", dual=False).fit(x_train, y_train)
    model = SelectFromModel(lsvc, prefit=True)
    # Get indices of the selected features so we can drop DataFrame columns directly
    # instead of using "transform", which would convert the data frame to a matrix.
    feat_ix_keep = model.get_support(indices=True)
    orig_feat_ix = np.arange(x_train.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)

    X_train_new = x_train.drop(labels=x_train.columns[feat_ix_delete],
                               axis=1)
    X_train_new.insert(1, 'PCAOne', x_train_projected[:, 0])
    X_train_new.insert(1, 'PCATwo', x_train_projected[:, 1])
    return X_train_new, y_train, feat_ix_keep, pca, del_constants, del_identicals
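The function above is excerpted from a larger script, so several names are assumed to be defined elsewhere. A minimal import block that would make it self-contained might look like the sketch below; remove_feat_constants and remove_feat_identicals are project-specific helpers, not scikit-learn functions.

# Sketch of the imports final_feats relies on (standard scikit-learn / NumPy names)
import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.feature_selection import SelectFromModel
# remove_feat_constants and remove_feat_identicals are helpers defined in the original project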
Example #2
def selecttest():
    import matplotlib.pyplot as plt
    import numpy as np

    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X,y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X,y)
    n_features = sfm.transform(X).shape[1]

    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
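Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the snippet above only runs on older releases. A rough equivalent on current versions could swap in another bundled regression dataset, e.g.:

# Sketch: the same threshold-raising loop on a dataset that still ships with scikit-learn
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

X, y = load_diabetes(return_X_y=True)
sfm = SelectFromModel(LassoCV(), threshold=0.25)
sfm.fit(X, y)
while sfm.transform(X).shape[1] > 2:
    sfm.threshold += 0.1  # raise the cutoff until only two features survive
X_transform = sfm.transform(X)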
Example #3
    def tree_based_selection(self, data_set, data_target, feature_names):
        """
        Tree-based feature selection with an ExtraTreesClassifier.
        :param data_set: feature matrix
        :param data_target: target vector
        :param feature_names: names of the columns of data_set
        :return: (feature_set, fea_index)
        """

        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print(clf.feature_importances_)

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        # Recover the original column index of each selected feature by matching
        # columns of the reduced matrix against columns of the input.
        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print(np.array(check))

        return feature_set, fea_index
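The double loop above recovers the selected column indices by comparing every column of the input against every column of the reduced matrix, which is quadratic in the number of features. SelectFromModel can report the same information directly; a small self-contained sketch of that shortcut:

# Sketch: recover the selected column indices via get_support() instead of column matching
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
clf = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X, y)
model = SelectFromModel(clf, prefit=True)
fea_index = model.get_support(indices=True)  # indices of the kept columns
feature_set = X[:, fea_index]                # identical to model.transform(X)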
Example #4
def select_features(data, neg_cmpd, pos_cmpd, compound_col="Metadata_compound",
                    C=0.01):
    """
    Return selected features based on an L1-penalized linear SVC.

    Parameters
    -----------
    data : pandas DataFrame
    neg_cmpd : string
        name of negative control in compound_col
    pos_cmpd : string
        name of positive control in compound_col
    compound_col : string
        name of column in data that contains compound labels
    C : float (default=0.01)
        Sparsity parameter; the lower the value, the fewer features are selected.

    Returns
    -------
    selected_features : list
        Selected features
    """
    X, Y = _split_classes(data, neg_cmpd, pos_cmpd, compound_col)
    lin_svc = LinearSVC(C=C, penalty="l1", dual=False).fit(X, Y)
    model = SelectFromModel(lin_svc, prefit=True)
    feature_mask = np.array(model.get_support())
    feature_names = np.array(X.columns.tolist())
    selected_features = list(feature_names[feature_mask])
    return selected_features
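As the docstring notes, C controls how aggressively the L1 penalty zeroes out coefficients. A small self-contained sketch (using LinearSVC and SelectFromModel directly, since _split_classes is internal to the original project) illustrates the effect:

# Sketch: smaller C -> sparser L1 model -> fewer features selected
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=40, n_informative=5, random_state=0)
for C in (0.001, 0.01, 0.1):
    svc = LinearSVC(C=C, penalty="l1", dual=False, max_iter=5000).fit(X, y)
    n_kept = SelectFromModel(svc, prefit=True).get_support().sum()
    print(f"C={C}: {n_kept} features kept")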
Example #5
def lassoCV_regression(data,target,alphas):
    clf=LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    n_features = sfm.transform(data).shape[1]
    
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]
     
    rmses=[]
    kf=KFold(len(target),10,True,None)
    for train_index, test_index in kf:
        data_train,data_test=data_transform[train_index],data_transform[test_index]
        target_train,target_test=target[train_index],target[test_index]
        clf.fit(data_train,target_train)
        rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
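KFold(len(target), 10, True, None) is the pre-0.18 sklearn.cross_validation API, where the splitter is built from the number of samples and iterated directly. On current scikit-learn the equivalent 10-fold RMSE loop would look roughly like this sketch:

# Sketch: the same 10-fold RMSE loop with the modern sklearn.model_selection API
from math import sqrt
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold

data, target = load_diabetes(return_X_y=True)
clf = LassoCV()
rmses = []
kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(data):
    clf.fit(data[train_index], target[train_index])
    rmse = sqrt(np.mean((clf.predict(data[test_index]) - target[test_index]) ** 2))
    rmses.append(rmse)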
def run():
    allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove =
        ["KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH", "BTUEL", "BTUELSPH", "BTUELCOL", "BTUELWTH", "BTUELRFG","BTUELOTH",
        "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH", "DOLELRFG", "DOLELOTH", "TOTALBTUOTH", "TOTALBTUCOL", 'TOTALBTU', 'TOTALBTUWTH',
         'TOTALBTU', 'TOTALBTUSPH', 'TOTALBTURFG', 'TOTALDOL', 'TOTALDOLSPH', 'TOTALDOLCOL', 'TOTALDOLWTH', 'TOTALDOLRFG', 'TOTALDOLOTH'])
    #allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove = [], forceUse =
    #                         [
    #                           'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP', 'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL', 'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM', 'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #                         ])

    clf = RandomForestRegressor(n_estimators = 100, n_jobs = 7)
    clf.fit(X, y)

    model = SelectFromModel(clf, prefit = True)
    X = model.transform(X)

    support_mask = model.get_support()
    relevantFeatures = [allKeys[i] for i in range(len(support_mask)) if support_mask[i]]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)


    clf.fit(X_train, y_train)
    print(y_test[:100])
    print(metrics.mean_squared_error(clf.predict(X_test), y_test))
    # clf was refit on the reduced matrix, so its importances line up with relevantFeatures
    features = sorted(zip(relevantFeatures, clf.feature_importances_), key = lambda x : x[1], reverse = True)
    print("Features", features)
Example #7
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    ''' methods = ('variance', 'correlation', 'l1', 'forest')
        - variance: use a variance threshold to discard features that are mostly 0 or 1
        - correlation: use a univariate test (f_regression) to keep the best-correlated features
        - l1: use an l1 penalty to drop features and make the solution sparse
        - forest: use tree-based feature importances to select the important features
    '''
    features = x.loc[:, 'Feature_1':'Feature_2']

    # Each branch is expected to contribute a set of selected indices to idx_list;
    # as excerpted, only the selectors are built and idx_list is never populated.
    idx_list = []

    if 'variance' in methods:
        vt = VT(threshold=(0.99 * (1 - 0.99)))
        vt.fit(features)

    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)

    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)

    if 'forest' in methods:
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1).fit(x, y)
        m = SFM(clf)
        m.fit(x.values, y.values)

    x_indices = set(np.arange(x.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))

    return list(x_indices)
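Assuming the intent is to keep only the features every method agrees on, a complete version of the idea could intersect the index sets returned by each selector's get_support(indices=True). A small self-contained sketch:

# Sketch: intersect the features chosen by several selectors
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (SelectFromModel, SelectPercentile,
                                       VarianceThreshold, f_regression)

X, y = make_regression(n_samples=200, n_features=30, n_informative=5, random_state=0)
selectors = [
    VarianceThreshold(threshold=0.0),
    SelectPercentile(f_regression, percentile=80),
    SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=0)),
]
kept = set(range(X.shape[1]))
for sel in selectors:
    sel.fit(X, y)
    kept &= set(sel.get_support(indices=True))
print(sorted(kept))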
Example #8
def feature_selection(_train_data, _valid_data, _test_data, _train_label, _valid_label, _test_label):
    train_imageNo = _train_data.shape[0]
    valid_imageNo = _valid_data.shape[0]
    whole_data = numpy.concatenate((_train_data, _valid_data, _test_data))
    whole_data = whole_data.reshape((-1, 120))

    whole_label = numpy.concatenate((_train_label, _valid_label, _test_label))
    whole_label = list(whole_label)

    new_label_list = list()
    for i in whole_label:
        for j in range(100):
            new_label_list.append(i)

    assert len(new_label_list) == whole_data.shape[0]

    lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(whole_data, new_label_list)
    model = SelectFromModel(lsvc, prefit=True)
    data_new = model.transform(whole_data)
    print ('After feature selection we have', data_new.shape[1], 'features.')

    data_new = data_new.reshape((-1, 100, data_new.shape[1]))
    _train_data = data_new[:train_imageNo,:,:]
    _valid_data = data_new[train_imageNo:train_imageNo+valid_imageNo,:,:]
    _test_data = data_new[train_imageNo+valid_imageNo:,:,:]

    return _train_data, _valid_data, _test_data
Example #9
def feature_importance_with_forest(rforest_classifier, issues_train, priority_train, issues_test, priority_test):
    """
    Assess feature importance using a Random Forest.
    :param rforest_classifier: An already fitted classifier.
    :param issues_train: Train features.
    :param priority_train: Train classes.
    :param issues_test: Test features.
    :param priority_test: Test classes.
    :return: None
    """
    importances = rforest_classifier.feature_importances_
    indices = np.argsort(importances)[::-1]

    for column_index in range(len(issues_train.columns)):
        print(column_index + 1, ") ", issues_train.columns[indices[column_index]], " ",
              importances[indices[column_index]])

    figure, axes = plt.subplots(1, 1)
    plt.title('Feature importance')
    plt.bar(range(len(issues_train.columns)), importances[indices], color='lightblue', align='center')
    plt.xticks(range(len(issues_train.columns)), issues_train.columns[indices], rotation=90)
    plt.xlim([-1, len(issues_train.columns)])
    plt.tight_layout()
    plt.show()

    evaluate_performance("FOREST", rforest_classifier, issues_train, priority_train, issues_test, priority_test)

    print("Selecting important features ...")
    select = SelectFromModel(rforest_classifier, threshold=0.05, prefit=True)

    train_selected = select.transform(issues_train)
    test_selected = select.transform(issues_test)

    rforest_classifier.fit(train_selected, priority_train)
    evaluate_performance("FOREST-IMPORTANT", rforest_classifier, train_selected, priority_train, test_selected,
                         priority_test)
Example #10
def forests(input_df, target_df):
    """This method implements two types of forest feature selection: ExtraTreesClassifier & RandomForestClassifier. Features are ranked in order of importance."""
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel
    feature_space = list(input_df.columns)  # assumed: feature names come from the input DataFrame
    clf = ExtraTreesClassifier(random_state = 0)
    clf = clf.fit(input_df, target_df)
    model = SelectFromModel(clf, prefit=True)
    input_df_new = model.transform(input_df)
    original_space = input_df.shape
    new_space_ETC = input_df_new.shape
    tuple_holder = [(j, i) for i, j in zip(feature_space, clf.feature_importances_)]
    tuple_holder.sort()
    tuple_holder.reverse()
    ################################################
    ################################################
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(random_state = 0)
    clf = clf.fit(input_df, target_df)
    model = SelectFromModel(clf, prefit=True)
    input_df_new = model.transform(input_df)
    new_space_RFC = input_df_new.shape
    tuple_holder_2 = [(j, i) for i, j in zip(feature_space, clf.feature_importances_)]
    tuple_holder_2.sort()
    tuple_holder_2.reverse()
    ################################################
    ################################################
    rank_number = 0
    print('ExtraTreesClassifier', '\t'*4, 'RandomForestClassifier')
    print('Old Space: ', original_space, '\t'*4, 'Old Space:', original_space)
    print('New Space: ', new_space_ETC, '\t'*4, 'New Space:', new_space_RFC)
    for i, j in zip(tuple_holder, tuple_holder_2):
        rank_number += 1
        print(rank_number, '|', i, '\t'*3, rank_number, '|', j)
Example #11
def test_max_features_dim(max_features):
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=clf,
                                  max_features=max_features,
                                  threshold=-np.inf)
    X_trans = transformer.fit_transform(data, y)
    assert X_trans.shape[1] == max_features
Example #12
def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=None, tol=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
Example #13
def lasso_reducer(X, y):

    clf = LassoCV()

    # Set a minimum threshold of 0.25
    # this is a 'maxing out' of the sum of all coefficients
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    n_features = sfm.transform(X).shape[1]

    # Reset the threshold until the number of features equals two.
    # Note that the attribute can be set directly instead of repeatedly
    # fitting the metatransformer.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the selected two features from X.
    plt.title('Features selected from Boston using SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)

    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()

    return
Example #14
def test_feature_importances_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(), threshold=threshold, norm_order=order)
            transformer.fit(X, y)
            assert_true(hasattr(transformer.estimator_, "coef_"))
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_equal(X_new, X[:, feature_mask])
Example #15
class SelectFromModelSelection(SelectionModel):
    name = "SelectFromModel"

    def __init__(self, *args):
        SelectionModel.__init__(self, *args)
        self.selector = SelectFromModel(self.estimator)
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.get_support()
Example #16
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
Example #17
def rf_feat_reduction(rf_model, features):

    print(" Reducing number of input features based on feature importance.")
    subset_model = SelectFromModel(rf_model, prefit=True)
    feat_subset = subset_model.transform(features)
    feat_bool = subset_model.get_support()
    print(" " + str(len(feat_subset[0])) + " features chosen after model selection.")
    return feat_subset, feat_bool
Example #18
def test_max_features_error(max_features, err_type, err_msg):
    clf = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer = SelectFromModel(estimator=clf,
                                  max_features=max_features,
                                  threshold=-np.inf)
    with pytest.raises(err_type, match=err_msg):
        transformer.fit(data, y)
Example #19
def test_input_estimator_unchanged():
    """
    Test that SelectFromModel fits on a clone of the estimator.
    """
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert_true(transformer.estimator is est)
Example #20
def selectfeature(x, y, x_pre):
    x, x_pre = datscater(x, x_pre)
    clf = linear_model.LassoLars().fit(x, y)
    model = SelectFromModel(clf, prefit=True)
    x_new = model.transform(x)
    print('x', x.shape)
    print(x_new.shape)
    x_pre = model.transform(x_pre)
    return x_new, x_pre
Example #21
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features:' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which features were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chosen Name:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Collect the names of the features that were not used
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
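The chosen feature names above are inferred by taking the top how_long entries of the importance ranking; SelectFromModel can report them directly through its support mask, which avoids any ambiguity when importances are tied. A small self-contained sketch (using scikit-learn's built-in forest instead of LightGBM to stay dependency-free):

# Sketch: read chosen / not-chosen feature names straight from the support mask
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=300, n_features=12, random_state=0)
fe_name = [f"f{i}" for i in range(X.shape[1])]

clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y)
sfm = SelectFromModel(clf, prefit=True, threshold="mean")
mask = sfm.get_support()
feature_used_name = [n for n, keep in zip(fe_name, mask) if keep]
feature_not_used_name = [n for n, keep in zip(fe_name, mask) if not keep]
print(feature_used_name, feature_not_used_name)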
def select_feature(clf, x_train, x_valid):
    clf.fit(x_train, y_train)  # y_train is taken from the enclosing scope in the original script
    model = SelectFromModel(clf, prefit=True, threshold="mean")

    print(x_train.shape)
    x_train = model.transform(x_train)
    x_valid = model.transform(x_valid)
    print(x_train.shape)

    return x_train, x_valid
Example #23
    def train(self):
        rfc = RandomForestRegressor()
        rfc.fit(self.data, self.target)

        model = SelectFromModel(rfc, prefit=True)
        X = model.transform(self.data)
        self.predict = model.transform(self.predict)

        rfc.fit(X, self.target)
        return rfc
def select_features_tree(X, y, feature_names = []):
    print(X.shape)
    #forest = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=8)
    fo = forest.fit(X, y)
    sorted_feature_names = plot_feature_importance(fo, X, feature_names)
    model = SelectFromModel(fo, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    return X_new, sorted_feature_names[0:X_new.shape[1]]
Example #25
def test_coef_default_threshold():
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])
Example #26
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
def select_features(inputs, label, threshold):
    print('training ExtraTreesClassifier...')
    clf = ExtraTreesClassifier(criterion='entropy')
    clf.fit(inputs, label)

    threshold = '%f*mean' % (threshold)
    print('training SelectFromModel, threshold=%s...' % (threshold))
    sfm = SelectFromModel(clf, threshold=threshold, prefit=True)
    inputs_new = sfm.transform(inputs)
    #pdb.set_trace()
    print(inputs_new.shape)

    return sfm, inputs_new
Example #28
def extra_trees_classifier():

    titanic = Titanic_Data('../input/train.csv', '../input/test.csv')

    combined_normalized_data = titanic.get_normalized_data()

    train, test, targets = recover_train_test_target('../input/train.csv', combined_normalized_data)

    
    clf = ExtraTreesClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_

    features = features.sort_values(['importance'], ascending=False)

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    train_new.shape

    test_new = model.transform(test)
    test_new.shape

    forest = RandomForestClassifier(max_features='sqrt')

    parameter_grid = {
                     'max_depth' : [4,5,6,7,8],
                     'n_estimators': [200,210,240,250],
                     'criterion': ['gini','entropy']
                     }

    cross_validation = StratifiedKFold(targets, n_folds=5)  # pre-0.18 API; newer versions use StratifiedKFold(n_splits=5)

    grid_search = GridSearchCV(forest,
                               param_grid=parameter_grid,
                               cv=cross_validation)

    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId','Survived']].to_csv('./extra_trees_classifier_output.csv',index=False)
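StratifiedKFold(targets, n_folds=5) above is the pre-0.18 sklearn.cross_validation API. On current scikit-learn the splitter takes no data at construction time and is handed to GridSearchCV directly; a sketch with the same parameter grid:

# Sketch: the grid search above with the modern model_selection API
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

forest = RandomForestClassifier(max_features='sqrt')
parameter_grid = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': [200, 210, 240, 250],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=StratifiedKFold(n_splits=5))
# grid_search.fit(train_new, targets)  # train_new and targets come from the example above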
Example #29
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y, classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
def execute(fdata):

    data = list()
    target = list()
    storeDict = dict()

    for i, lines in enumerate(fdata):
        sline = lines.split(",")
        target.append(int(sline[0]))
        data.append([float(x) for j, x in enumerate(sline) if j != 0])
        storeDict[i] = [float(x) for j, x in enumerate(sline) if j != 0]

    data = np.array(data)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.25, random_state=0)
    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)

    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    value_feature = list()
    countDict = dict()
    for key, val in storeDict.items():
        countDict[key] = 0
        for i, inval in enumerate(val):
            if inval in X_new[0]:
                countDict[key] = countDict[key] + 1


    keyName = max(countDict, key=countDict.get)
    posStore = list()
    for val in X_new[0]:
        posStore.append(storeDict[keyName].index(val))

    X_test_new = list()

    for val in X_test:
        inlist = list()
        for i, inval in enumerate(val):
            if i in posStore:
                inlist.append(inval)

        X_test_new.append(inlist)

    X_test_new = np.array(X_test_new)

    return accuracy_score(y_test, clf.predict(X_test)), accuracy_score(y_test, clfNew.predict(X_test_new))
Example #31
    model_filter = SelectKBest(f_classif, k=10)
    lr = LogisticRegression(max_iter=100, class_weight=None)
    model_pl = Pipeline([('SelectKBest', model_filter),
                         ('LogisticRegression', lr)])
    model_pl.fit(x_train, y_train)
    model_pl.predict(x_test)
    print(model_pl.score(x_test, y_test))
    print(model_pl.named_steps['SelectKBest'].get_support())
    print(dataset.atribNombre([
        i for i, x in enumerate(
            model_pl.named_steps['SelectKBest'].get_support().tolist()) if x
    ]))

    lr = LogisticRegression(max_iter=100, class_weight=None)
    lrS = LogisticRegression(max_iter=100,
                             class_weight=None).fit(x_train, y_train)
    model_filter = SelectFromModel(lrS)

    print('\nSelectFromModel')
    model_pl = Pipeline([('SelectFromModel', model_filter),
                         ('LogisticRegression', lr)])
    model_pl.fit(x_train, y_train)
    model_pl.predict(x_test)
    print(model_pl.score(x_test, y_test))
    print(model_pl.named_steps['SelectFromModel'].get_support())
    print(dataset.atribNombre([
        i for i, x in enumerate(
            model_pl.named_steps['SelectFromModel'].get_support().tolist())
        if x
    ]))
Example #32
def logistic_dimension(data, label, parameter=1):
    logistic_ = LogisticRegression(penalty="l1", C=parameter, max_iter=30)
    model = SelectFromModel(logistic_)
    new_data = model.fit_transform(data, label)
    mask = model.get_support(indices=True)
    return new_data, mask
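One caveat: since scikit-learn 0.22 the default LogisticRegression solver is lbfgs, which does not support penalty="l1", so on recent versions the snippet above needs an explicit solver. A minimally adjusted sketch:

# Sketch: L1-penalized logistic selection on recent scikit-learn versions
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

def logistic_dimension(data, label, parameter=1):
    logistic_ = LogisticRegression(penalty="l1", C=parameter, max_iter=30,
                                   solver="liblinear")  # liblinear (or saga) supports L1
    model = SelectFromModel(logistic_)
    new_data = model.fit_transform(data, label)
    mask = model.get_support(indices=True)
    return new_data, mask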
Example #33
fraudInstanceData = pd.read_csv("FraudInstanceData.csv", header=0, index_col=0)
maritalStatuses = pd.get_dummies(fraudInstanceData["Marital Status"])
accomodationTypes = pd.get_dummies(fraudInstanceData["Accomodation Type"])
fraudInstanceData = fraudInstanceData.drop('Marital Status', axis=1)
fraudInstanceData = fraudInstanceData.drop('Accomodation Type', axis=1)
fraudInstanceData = fraudInstanceData.join(maritalStatuses)
fraudInstanceData = fraudInstanceData.join(accomodationTypes)

currencyToMoney = lambda c: Decimal(sub(r'[^\d.]', '', c))
fraudInstanceData['Claim Amount'] = fraudInstanceData["Claim Amount"].apply(
    currencyToMoney)

y = fraudInstanceData.iloc[:, 1]
X = fraudInstanceData.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=23)

pipeline = Pipeline([('feature_selection',
                      SelectFromModel(LogisticRegression(penalty="l1"))),
                     ('regression', LogisticRegression())])
grid_cv = GridSearchCV(pipeline, {}, cv=10)
grid_cv.fit(X_train, y_train)

selected_feature = grid_cv.transform(X_train.co)

y_pred = grid_cv.predict(X_test)
print(grid_cv.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
    def build_model(self, fname_structlib):

        #MODIFIED BY JIM (this way don't have to remember to close the file...)
        with open(fname_structlib, 'rb') as f_structlib:
            structs = pickle.load(f_structlib)

        n_structs = 0
        for struct in structs:
            if not struct.metricpredicted:
                n_structs += 1
        metrics = np.zeros(n_structs)

        n_features = 0
        for prop in self.properties:
            if prop.useful:
                n_features += 1
        features = np.zeros((n_structs, n_features))

        count_structs = 0
        for struct in structs:
            if not struct.metricpredicted:
                props = self.calc_properties(struct)
                count_features = 0
                for prop in self.properties:  # make sure this happens in the same order each time
                    if prop.useful:
                        #Need to prune properties we don't need (i.e. smaller rdf, etc.)
                        try:
                            features[count_structs,
                                     count_features] = props[prop.label]
                            count_features += 1
                        except KeyError:
                            #Remove this property so don't have to do this again
                            prop.useful = False
                metrics[count_structs] = struct.metric
                count_structs += 1

        # cross-validation etc. etc. and change property.useful's
        # need to make sure that property.useful status is consistent with the model (has same number of features)
        # make new model to test with
        test_model = clone(self.model)
        test_scaler = clone(self.scaler)
        # split data into testing and training sets
        features_train, features_test, metrics_train, metrics_test = train_test_split(
            features, metrics, test_size=0.25, shuffle=True)
        # using training set, perform feature selection by selecting from fitted LASSO model
        features_train_scaled = test_scaler.fit_transform(features_train)
        features_test_scaled = test_scaler.transform(features_test)
        selector = SelectFromModel(test_model,
                                   threshold=1e-4)  # HARD CODED NUMBER HERE
        selector.fit(features_train_scaled, metrics_train)
        print('number of features selected',
              np.sum(selector.get_support().astype(int)))
        features_train_reduced_unscaled = selector.transform(features_train)
        features_test_reduced_unscaled = selector.transform(features_test)

        # using training set, perform recursive feature elimination with cross-validation
        #     selector = RFECV(test_model, step=1, scoring='neg_mean_squared_error')
        #     features_train_new = selector.fit_transform(features_train, metrics_train)
        #     print('number of features selected after cross-validation', selector.n_features_)
        #     features_test_new = selector.transform(features_test)
        #     features_new = selector.transform(features)

        # fit with reduced number of features
        features_train_reduced_scaled = test_scaler.fit_transform(
            features_train_reduced_unscaled)
        features_test_reduced_scaled = test_scaler.transform(
            features_test_reduced_unscaled)
        test_model.fit(features_train_reduced_scaled, metrics_train)

        # compute RMSE of test set
        # should also compute for training set??
        mse_test = mean_squared_error(
            metrics_test, test_model.predict(features_test_reduced_scaled))
        # Below switching to using coefficient of determination, not RMSE, but still calling it RMSE
        # This normalizes things to the variance in the data, so now want to be bigger and close to 1
        # A good cutoff is probably 0.8 or 0.9
        #rmse_norm_new = np.sqrt(mse_test)/np.mean(metrics)
        rmse_norm_new = (np.var(metrics) - mse_test) / np.var(metrics)
        print('rmse_norm_new', rmse_norm_new)
        print('self.rmse_norm', self.rmse_norm)
        #if rmse_norm_new < self.rmse_norm: # should we do something fancier than this?
        # copy model (or should we maybe refit it to all the data?? not sure if this would violate something machine learning)
        self.scaler = clone(test_scaler)
        features_train_reduced_scaled = self.scaler.fit_transform(
            features_train_reduced_unscaled)
        self.model = clone(test_model)
        self.model.fit(features_train_reduced_scaled, metrics_train)
        # change useful labels on properties
        count_features = 0
        selector_support = selector.get_support()
        for prop in self.properties:
            if prop.useful:
                prop.useful = selector_support[count_features]
                count_features += 1

        if rmse_norm_new > self.rmse_norm:  # should we do something fancier than this?
            self.rmse_norm = rmse_norm_new
            return True
        else:
            return False
Example #35
forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]  # extended slice -> reverse the array

print('Features ranked:')
for f in range(X_train.shape[1]):
    print(f'{f+1}) {feat_labels[indices[f]]:<30} {importances[indices[f]]}')

import matplotlib.pyplot as plt

plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')

plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])

plt.tight_layout()
plt.show()

# Note: highly correlated features may not all be ranked high

from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print(f'Number of features that meet this threshold: {X_selected.shape[1]}')

for f in range(X_selected.shape[1]):
    print(f'{f+1}) {feat_labels[indices[f]]:<30} {importances[indices[f]]:3f}')
Example #36
dataframe['label'].value_counts()

x_matrix = dataframe.copy()
x_matrix.drop(['label'], axis=1, inplace=True)
y_vector = dataframe['label']

sc = StandardScaler()
sc.fit(x_matrix)
x_matrix = sc.transform(x_matrix)

############feature selection

classif = ExtraTreesClassifier(n_estimators=100)
classif = classif.fit(x_matrix, y_vector)
classif.feature_importances_  
selected = SelectFromModel(classif, prefit=True)
x_matrix_new = selected.transform(x_matrix)
x_matrix_new.shape

X_train, X_test, y_train, y_test = train_test_split(x_matrix_new, y_vector, test_size=0.33, random_state=42,shuffle=True)

#############



classif_LR = LogisticRegression()
classif_KN = KNeighborsClassifier()
classif_RF = RandomForestClassifier()

###############Log
    
Example #37
    print("%2d) %-*s %f" % (f + 1, 30,
    feat_labels[indices[f]],
    importances[indices[f]]))

#feature selection threshold - deprecated
X.shape    
X_selected = clf.transform(X, threshold=0.02)
X_selected.shape 

#SelectModel method
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

X.shape
lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

   

yhat = clf.predict_proba(X)  #which is which?
clf.classes_
yhat.shape
X.shape

yhat[:,0].shape

X0['yhats_A'] = yhat[:,0]
X0['yhats_D'] = yhat[:,1]
X0['yhats_H'] = yhat[:,2]
Example #38
class LinearSVM(SemevalModel):
    def __init__(self):
        SemevalModel.__init__(self)

    def __transform__(self, q1, q2):
        if type(q1) == list: q1 = ' '.join(q1)
        if type(q2) == list: q2 = ' '.join(q2)

        lcs = features.lcs(re.split('(\W)', q1), re.split('(\W)', q2))
        lcs1 = len(lcs[1].split())
        lcs2 = lcs[0]
        lcsub = features.lcsub(q1, q2)[0]
        jaccard = features.jaccard(q1, q2)
        containment_similarity = features.containment_similarities(q1, q2)
        # greedy_tiling = features.greedy_string_tiling(q1, q2)

        X = [lcs1, lcsub, jaccard, containment_similarity]

        # ngram features
        for n in range(2, 5):
            ngram1 = ' '
            for gram in nltk.ngrams(q1.split(), n):
                ngram1 += 'x'.join(gram) + ' '

            ngram2 = ' '
            for gram in nltk.ngrams(q2.split(), n):
                ngram2 += 'x'.join(gram) + ' '

            lcs = features.lcs(re.split('(\W)', ngram1),
                               re.split('(\W)', ngram2))
            X.append(len(lcs[1].split()))
            # X.append(lcs[0])
            X.append(features.lcsub(ngram1, ngram2)[0])
            X.append(features.jaccard(ngram1, ngram2))
            X.append(features.containment_similarities(ngram1, ngram2))

        return X

    def get_features(self, q1id, q1, q2id, q2, set='train'):
        X = []
        if set == 'train':
            q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
            q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
        else:
            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))

        q1_w2v = features.encode(q1, self.word2vec)
        q1_elmo_bottom = [
            np.concatenate([q1_w2v[i], q1_elmo[0][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_middle = [
            np.concatenate([q1_w2v[i], q1_elmo[1][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_top = [
            np.concatenate([q1_w2v[i], q1_elmo[2][i]])
            for i in range(len(q1_w2v))
        ]

        q2_w2v = features.encode(q2, self.word2vec)
        q2_elmo_bottom = [
            np.concatenate([q2_w2v[i], q2_elmo[0][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_middle = [
            np.concatenate([q2_w2v[i], q2_elmo[1][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_top = [
            np.concatenate([q2_w2v[i], q2_elmo[2][i]])
            for i in range(len(q2_w2v))
        ]

        # X.append(self.simbow.score(q1, q1_w2v, q2, q2_w2v))
        X.append(self.simbow.score(q1, q1_elmo_bottom, q2, q2_elmo_bottom))
        X.append(self.simbow.score(q1, q1_elmo_middle, q2, q2_elmo_middle))
        X.append(self.simbow.score(q1, q1_elmo_top, q2, q2_elmo_top))
        return X

    def train(self):
        logging.info('Training svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                print('Preparing traindata: ',
                      percentage,
                      i + 1,
                      sep='\t',
                      end='\r')
                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']
                # x = self.get_features(q1id, q1, q2id, q2)
                x = []
                # x = self.__transform__(q1, q2)
                #
                # # elmo and word2vec embeddings
                q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
                q1_w2v = features.encode(q1, self.word2vec)
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # x.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # x.append(bm25_score)
                #
                # # cosine
                # q1_lemma = query_question['q1_lemmas']
                # q1_pos = query_question['q1_pos']
                # q2_lemma = query_question['q2_lemmas']
                # q2_pos = query_question['q2_pos']
                # for n in range(1,5):
                #     try:
                #         x.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         x.append(0.0)
                #
                # # tree kernels
                # q1_token2lemma = dict(zip(query_question['q1_full'], query_question['q1_lemmas']))
                # q2_token2lemma = dict(zip(query_question['q2_full'], query_question['q2_lemmas']))
                # q1_tree, q2_tree = utils.parse_tree(query_question['q1_tree'], q1_token2lemma), utils.parse_tree(query_question['q2_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # x.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # x.append(features.frobenius_norm(q1_emb, q2_emb))
                #
                # # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                x.append(simbow)

                for comment in query_question['comments']:
                    q3id = comment['id']
                    q3 = comment['tokens']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # x.extend(self.get_features(q1id, q1, q3id, q3))
                        q3_elmo = self.trainelmo.get(str(self.trainidx[q3id]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)

                    # x.append(trlmprob)
                    # x.append(bm25_score)
                    x.append(simbow_q1q3)
                    # x.append(simbow_q2q3)

                X.append(x)
                y.append(query_question['label'])

            p.dump(list(zip(X, y)), open(FEATURE_PATH, 'wb'))
        else:
            f = p.load(open(FEATURE_PATH, 'rb'))
            X = list(map(lambda x: x[0], f))
            y = list(map(lambda x: x[1], f))

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='search',
                                    gamma='search',
                                    degree='search',
                                    jobs=4)
        # self.model = self.train_regression(trainvectors=X, labels=y, c='search', penalty='search', tol='search')
        logging.info('Finishing to train svm.')

    def validate(self):
        logging.info('Validating svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            # q1_lemma = query['lemmas']
            # q1_pos = query['pos']
            # q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
            # q1_tree = utils.parse_tree(query['subj_tree'], q1_token2lemma)

            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q1_w2v = features.encode(q1, self.word2vec)
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[i]])
                for i in range(len(q1_w2v))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                q2 = rel_question['tokens_proc']
                # X = self.get_features(q1id, q1, q2id, q2, set='dev')
                # X = self.__transform__(q1, q2)
                X = []

                q2_elmo = self.develmo.get(str(self.devidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # X.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # X.append(bm25_score)
                #
                # # cosine
                # q2_lemma = rel_question['lemmas']
                # q2_pos = rel_question['pos']
                # for n in range(1,5):
                #     try:
                #         X.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         X.append(0.0)
                #
                # # tree kernel
                # q2_token2lemma = dict(zip(rel_question['tokens'], rel_question['lemmas']))
                # q2_tree = utils.parse_tree(rel_question['subj_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # X.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # X.append(features.frobenius_norm(q1_emb, q2_emb))

                # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                X.append(simbow)

                for comment in duplicate['rel_comments']:
                    q3id = comment['id']
                    q3 = comment['tokens_proc']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # X.extend(self.get_features(q1id, q1, q3id, q3, set='dev'))
                        q3_elmo = self.develmo.get(
                            str(self.devidx[comment['id']]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                    # X.append(trlmprob)
                    # X.append(bm25_score)
                    X.append(simbow_q1q3)
                    # X.append(simbow_q2q3)

                # scale
                X = self.scaler.transform([X])
                # feature selection
                X = self.feat_selector.transform(X)

                score = self.model.decision_function(X)[0]
                pred_label = self.model.predict(X)[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/ranking.txt', 'w') as f:
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finishing to validate svm.', extra=d)
        return ranking, y_real, y_pred
Example #39
                       step=.1,
                       cv=5,
                       scoring='roc_auc')

        for i in range(0, len(skpipes)):
            skpipes[i].append(('rfe_rf' + str(i), cv_rfc))

    if fs_type == 2:
        #Wrapper Select via model
        clf = RandomForestClassifier(n_estimators=200,
                                     max_depth=None,
                                     min_samples_split=3,
                                     criterion='entropy',
                                     random_state=None)
        sel = SelectFromModel(
            clf, prefit=False, threshold='mean', max_features=None
        )  #to select only based on max_features, set to integer value and set threshold=-np.inf

        for i in range(0, len(skpipes)):
            skpipes[i].append(('wrapper_rf' + str(i), sel))

    if fs_type == 3:  ###### Only works if the target is binned ###########
        #Univariate Feature Selection - Chi-squared
        #will throw error if any negative values in features, so turn off feature normalization, or switch to mutual_info_classif
        print('Univariate Feature Selection - Chi2: ')
        sel = SelectKBest(chi2, k=k_cnt)

        for i in range(0, len(skpipes)):
            skpipes[i].append(('ufs' + str(i), sel))

# %%
Exemplo n.º 40
0
    def train(self):
        logging.info('Training svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                print('Preparing traindata: ',
                      percentage,
                      i + 1,
                      sep='\t',
                      end='\r')
                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']
                # x = self.get_features(q1id, q1, q2id, q2)
                x = []
                # x = self.__transform__(q1, q2)
                #
                # # elmo and word2vec embeddings
                q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
                q1_w2v = features.encode(q1, self.word2vec)
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # x.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # x.append(bm25_score)
                #
                # # cosine
                # q1_lemma = query_question['q1_lemmas']
                # q1_pos = query_question['q1_pos']
                # q2_lemma = query_question['q2_lemmas']
                # q2_pos = query_question['q2_pos']
                # for n in range(1,5):
                #     try:
                #         x.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         x.append(0.0)
                #
                # # tree kernels
                # q1_token2lemma = dict(zip(query_question['q1_full'], query_question['q1_lemmas']))
                # q2_token2lemma = dict(zip(query_question['q2_full'], query_question['q2_lemmas']))
                # q1_tree, q2_tree = utils.parse_tree(query_question['q1_tree'], q1_token2lemma), utils.parse_tree(query_question['q2_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # x.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # x.append(features.frobenius_norm(q1_emb, q2_emb))
                #
                # # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                x.append(simbow)

                for comment in query_question['comments']:
                    q3id = comment['id']
                    q3 = comment['tokens']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # x.extend(self.get_features(q1id, q1, q3id, q3))
                        q3_elmo = self.trainelmo.get(str(self.trainidx[q3id]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)

                    # x.append(trlmprob)
                    # x.append(bm25_score)
                    x.append(simbow_q1q3)
                    # x.append(simbow_q2q3)

                X.append(x)
                y.append(query_question['label'])

            p.dump(list(zip(X, y)), open(FEATURE_PATH, 'wb'))
        else:
            f = p.load(open(FEATURE_PATH, 'rb'))
            X = list(map(lambda x: x[0], f))
            y = list(map(lambda x: x[1], f))

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='search',
                                    gamma='search',
                                    degree='search',
                                    jobs=4)
        # self.model = self.train_regression(trainvectors=X, labels=y, c='search', penalty='search', tol='search')
        logging.info('Finished training the SVM.')
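# A minimal prediction-time sketch (an assumption, not part of the original class):
# the MinMaxScaler and the LassoCV-based SelectFromModel fitted above must be applied,
# in the same order, to any new feature vector before the trained SVM sees it.
def predict_pair(scaler, feat_selector, model, raw_x):
    # raw_x: one feature list built the same way as the rows of X above
    x = scaler.transform([raw_x])        # scale with the already-fitted MinMaxScaler
    x = feat_selector.transform(x)       # keep only the LassoCV-selected features
    return model.predict(x)[0]           # label predicted by the trained SVM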
Exemplo n.º 41
    Y = np.concatenate((Positive_y, Negitive_y))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.33,
                                                        random_state=7)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    thresholds = np.sort(model.feature_importances_)

    for thresh in thresholds:
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train)
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)

        print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" %
              (thresh, select_X_train.shape[1], accuracy * 100.0))

    b = sorted(enumerate(model.feature_importances_),
               key=lambda x: x[1],
               reverse=True)
    a = np.array(b)[:, 0][0:MAX_LEN].astype(np.uint8)
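# A short illustrative follow-up (an assumption, not in the original snippet): the index
# array `a` holds the column positions of the MAX_LEN most important features, so it can
# slice the matrices directly. The uint8 cast above is only safe while there are fewer
# than 256 columns; a plain int dtype would be more robust.
X_train_top = X_train[:, a]
X_test_top = X_test[:, a]
top_model = XGBClassifier()
top_model.fit(X_train_top, y_train)
top_acc = accuracy_score(y_test, top_model.predict(X_test_top))
print("Accuracy on top-%d features: %.2f%%" % (len(a), top_acc * 100.0))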
Exemplo n.º 42
# Example output (5-fold cross-validation scores and their mean):
# [0.82122905 0.83240223 0.81460674 0.8258427  0.85875706]
# 0.8305675570530682




bagging_clf = BaggingRegressor(lr, n_estimators=10, max_samples=0.8, max_features=1.0, n_jobs=-1) 
# bootstrap=True (the default) samples with replacement; set bootstrap=False to sample without replacement
evaluate_model(bagging_clf)


from sklearn.feature_selection import SelectFromModel
lr = LogisticRegression(C=20, penalty='l2', tol=1e-8)

selector = SelectFromModel(lr, threshold='1.25*median')
selector.fit(train_x, train_y)

train_x2 = selector.transform(train_x)
print(train_x.columns[selector.get_support()])
lr.fit(train_x2, train_y)
print(lr.score(train_x2, train_y))
print(lr.score(selector.transform(test_x),test_y))
cvs = cross_val_score(lr, selector.transform(train_x), train_y, cv=5)
print(cvs)
print(np.mean(cvs), np.std(cvs))
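# A leak-free alternative (an assumption, not in the original notebook; it reuses the
# LogisticRegression, SelectFromModel and cross_val_score already imported above):
# wrapping the selector and the classifier in one Pipeline lets cross_val_score refit
# the feature selection inside every fold instead of selecting on the full training set.
from sklearn.pipeline import Pipeline

select_lr = Pipeline([
    ('select', SelectFromModel(LogisticRegression(C=20, penalty='l2', tol=1e-8),
                               threshold='1.25*median')),
    ('clf', LogisticRegression(C=20, penalty='l2', tol=1e-8)),
])
cvs_pipe = cross_val_score(select_lr, train_x, train_y, cv=5)
print(np.mean(cvs_pipe), np.std(cvs_pipe))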


# Example output (score of the model above):
# 0.8475120385232745
Exemplo n.º 43
 scaler = MinMaxScaler()
 data1 = scaler.fit_transform(data[numeric_feature])
 
 #encode the categorical features
 encoder = OneHotEncoder(categories = 'auto', sparse = False)
 data2 = encoder.fit_transform(data[categorical_feature])
 
 #merge preprocessed features
 x = np.append(data1, data2, axis = 1)
 print('number of features after preprocessing: %d' % len(x[0]))
 print("")
 
 #use extra trees for feature selection
 clf = ExtraTreesClassifier(n_jobs=-1, random_state=0)
 clf = clf.fit(x,y)
 model = SelectFromModel(clf, prefit=True)
 x = model.transform(x)
 print('number of features after feature selection: %d' % len(x[0]))
 print("")
 
 #calculate feature importance and print the 30 most important features
 categorical_name = encoder.get_feature_names(categorical_feature)
 feature_name = np.append(numeric_feature, categorical_name)
 feature_importance = clf.feature_importances_
 print('average feature importance: %f' % feature_importance.mean())
 print("")
 importance = dict(zip(feature_name, feature_importance))
 importance_sorted = sorted(importance.items(), key = lambda x: x[1], reverse=True)
 print("top 30 features with high importance:")
 print(importance_sorted[0:30])
 print("")
Exemplo n.º 44
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(solver='saga', penalty="l1", max_iter=1000), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
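# A possible next step (an assumption, not shown in the original notebook; it assumes all
# four support masks are aligned with X.columns): combine the masks computed above into
# one vote count per feature, so features chosen by several methods can be ranked first.
import pandas as pd

feature_selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Chi-2': chi_support,
    'RFE': rfe_support,
    'Logistic (L1)': embeded_lr_support,
    'Random Forest': embeded_rf_support,
})
feature_selection_df['Total'] = feature_selection_df.iloc[:, 1:].sum(axis=1)
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
print(feature_selection_df.head(num_feats))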
Exemplo n.º 45
    optimizers = ['rmsprop', 'adam', 'adadelta']
    return {
        'deep__batch_size': batches,
        'deep__epochs': epochs,
        'deep__act': activation,
        'deep__drop': dropout,
        'deep__optimizer': optimizers
    }


for i in range(len(multi_XGB.estimators_)):
    threshold = np.sort(multi_XGB.estimators_[i].feature_importances_)

    for thres in threshold:
        selection = SelectFromModel(multi_XGB.estimators_[i],
                                    threshold=thres,
                                    prefit=True)

        select_x_train = selection.transform(x_train)
        select_x_test = selection.transform(x_test)
        select_x_pred = selection.transform(x_pred)

        def build_model(drop=0.5, optimizer='adam', act='relu'):
            if act == 'leaky':
                act = leaky
            inputs = Input(shape=(select_x_train.shape[1], ))
            x = Dense(51, activation=act)(inputs)
            x = Dropout(drop)(x)
            x = Dense(150, activation=act)(x)
            x = Dropout(drop)(x)
            x = Dense(300, activation=act)(x)
Exemplo n.º 46
@Description: embedded feature selection
@author: Jian
"""
import time
import pickle
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

t_start = time.time()
"""读取特征"""
features_path = './data_tfidf_100000.pkl'  #tfidf特征的路径
fp = open(features_path, 'rb')
x_train, y_train, x_test = pickle.load(fp)
fp.close()
"""进行特征选择"""
alo_name = 'lsvc_l2'
lsvc = LinearSVC(penalty='l2', C=1.0, dual=True).fit(x_train, y_train)
slt = SelectFromModel(lsvc, prefit=True)
x_train_s = slt.transform(x_train)
x_test_s = slt.transform(x_test)
"""保存选择后的特征至本地"""
num_features = x_train_s.shape[1]
data_path = './' + features_path.split(
    '.')[-2] + '_select_' + alo_name + '_' + str(num_features) + '.pkl'
data_f = open(data_path, 'wb')
pickle.dump((x_train_s, y_train, x_test_s), data_f)
data_f.close()

t_end = time.time()
print("特征选择完成,选择{}个特征,共耗时{}min".format(num_features, (t_end - t_start) / 60))
Exemplo n.º 47
    def run_grid_pipeline(self, features, labels, standardization_colms,
                          parameters, estimator,
                          feature_selection_threshold_type):

        # Preprocessing for numerical data
        numerical_transformer = StandardScaler()

        # Preprocessing for categorical data
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        # Bundle preprocessing for numerical and categorical data
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, standardization_colms),
                # ('cat', categorical_transformer, self.onehot_colms)
                # ], n_jobs = self.n_jobs)
            ],
            n_jobs=self.n_jobs,
            remainder='passthrough')

        feature_selection_clf = RandomForestClassifier(
            random_state=self.random_state, n_jobs=self.n_jobs)
        feature_selection_model = SelectFromModel(
            feature_selection_clf, threshold=feature_selection_threshold_type)

        grid = GridSearchCV(estimator=estimator,
                            param_grid=parameters,
                            cv=5,
                            scoring='accuracy',
                            refit=True,
                            n_jobs=-1)

        pipeline = Pipeline(steps=[(
            'preprocessor',
            preprocessor), ('feature_selection',
                            feature_selection_model), ('grid_search', grid)])

        pipeline.fit(features, labels)

        def print_results(results):
            print('BEST PARAMS: {}\n'.format(results.best_params_))

            means = results.cv_results_['mean_test_score']
            stds = results.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         results.cv_results_['params']):
                print('{} (+/-{}) for {}'.format(round(mean, 3),
                                                 round(std * 2, 3), params))

        print_results(pipeline['grid_search'])

        # print(features.columns)
        feature_selection_model = pipeline['feature_selection']
        selected_features = feature_selection_model.transform(features)
        selected_features = pd.DataFrame(
            feature_selection_model.inverse_transform(selected_features),
            index=features.index,
            columns=features.columns)
        self.selected_columns = selected_features.columns[
            selected_features.var() != 0]
        print(
            '\nColumns selected for {0} threshold'.format(
                feature_selection_threshold_type), self.selected_columns)

        # print('\nBest estimator:\n')
        # print(pipeline['grid_search'].best_estimator_)
        # print(pipeline['grid_search'].best_score_)
        # print(pipeline['grid_search'].best_params_)
        # print(pipeline['grid_search'].scorer_)

        return pipeline
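# A hypothetical call (assumptions: `clf_runner` is an instance of the surrounding class,
# and `features`, `labels` and `numeric_colms` are prepared elsewhere) showing how this
# method might be invoked with a random-forest estimator and a 'median' selection threshold.
from sklearn.ensemble import RandomForestClassifier

pipeline = clf_runner.run_grid_pipeline(
    features=features,
    labels=labels,
    standardization_colms=numeric_colms,
    parameters={'n_estimators': [100, 300], 'max_depth': [None, 10]},
    estimator=RandomForestClassifier(random_state=0),
    feature_selection_threshold_type='median')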
Exemplo n.º 48
                                                    y,
                                                    test_size=0.10)

vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 2))

X_train_rf = vectorizer.fit_transform(X_train)

rf = RandomForestClassifier(n_estimators=400,
                            verbose=100,
                            n_jobs=-1,
                            random_state=0,
                            max_samples=5000)

rf.fit(X_train_rf, y_train)

feature_selector = SelectFromModel(rf, prefit=True, max_features=100000)

svc_set = pd.concat([X_train, y_train], axis=1)
svc_set = svc_set.sample(100000, random_state=0)

svc_X = svc_set['review']
svc_y = svc_set['label']

svc_X = vectorizer.transform(svc_X)
svc_X = feature_selector.transform(svc_X)

svc = SVC(cache_size=1000, random_state=0)

svc.fit(svc_X, svc_y)

final_pipe = make_pipeline(vectorizer, feature_selector, svc)
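# A short evaluation sketch (an assumption: X_test and y_test from the split above are
# still available). Every step of final_pipe was fitted beforehand, so the composed
# pipeline can score the raw held-out reviews directly.
from sklearn.metrics import accuracy_score

test_pred = final_pipe.predict(X_test)
print("held-out accuracy: %.4f" % accuracy_score(y_test, test_pred))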
Exemplo n.º 49
    def feature_selection(self):

        onehot_features = self.original_features
        onehot_labels = self.original_labels

        onehot_encoder = OneHotEncoder(handle_unknown='error', sparse=False)
        onehot_encoder.fit(onehot_features[self.onehot_colms])
        onehot_transformed_colms = onehot_encoder.get_feature_names(
            self.onehot_colms)
        onehot_transformed_features = onehot_encoder.transform(
            onehot_features[self.onehot_colms])
        onehot_features = onehot_features.join(pd.DataFrame(
            onehot_transformed_features,
            index=onehot_features.index,
            columns=onehot_transformed_colms),
                                               how='inner')
        # print(onehot_features.info())
        # print(onehot_transformed_colms)
        onehot_features = onehot_features.drop(columns=self.onehot_colms)
        # print(onehot_features.info())
        # print(self.original_features.loc[0:5,'Region'])
        # print(onehot_features.loc[0:5, ['Region_1', 'Region_2', 'Region_3', 'Region_4', 'Region_5', 'Region_6', 'Region_7', 'Region_8', 'Region_9'] ] )

        sss = StratifiedShuffleSplit(n_splits=1,
                                     train_size=self.train_ratio,
                                     random_state=self.random_state)
        for train_indx, test_indx in sss.split(onehot_features, onehot_labels):
            # print(len(train_indx)/len(features), len(test_indx)/len(features))
            # print('% Survived:', labels[test_indx].mean())

            # Using RandomForestClassifier gives non-linear decision boundary
            clf = RandomForestClassifier(random_state=self.random_state,
                                         n_jobs=self.n_jobs)

            # Using LogisticRegression (default L2 penalty) gives a linear decision boundary
            # clf = LogisticRegression()

            clf.fit(onehot_features.iloc[train_indx],
                    onehot_labels.iloc[train_indx])

            # Using mean threshold in SelectFromModel
            feature_selection_model = SelectFromModel(clf,
                                                      prefit=True,
                                                      threshold='mean')
            selected_features = feature_selection_model.transform(
                onehot_features.iloc[train_indx])
            selected_features = pd.DataFrame(
                feature_selection_model.inverse_transform(selected_features),
                index=onehot_features.iloc[train_indx].index,
                columns=onehot_features.iloc[train_indx].columns)
            self.selected_columns_mean = selected_features.columns[
                selected_features.var() != 0]
            print('Mean threshold:', self.selected_columns_mean)

            # Using Median threshold for SelectFromModel
            feature_selection_model = SelectFromModel(clf,
                                                      prefit=True,
                                                      threshold='median')
            selected_features = feature_selection_model.transform(
                onehot_features.iloc[train_indx])
            selected_features = pd.DataFrame(
                feature_selection_model.inverse_transform(selected_features),
                index=onehot_features.iloc[train_indx].index,
                columns=onehot_features.iloc[train_indx].columns)
            self.selected_columns_median = selected_features.columns[
                selected_features.var() != 0]
            print('Median threshold', self.selected_columns_median)
Exemplo n.º 50
def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = GradientBoostingClassifier(n_estimators=200, random_state=100)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have a non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1],
                                       reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/A_gbdt_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(
            str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' +
            str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which features were actually kept
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/A_gbdt_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Find the names of the features that were not used
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # Generate a chromosome bit string (e.g. '01011100') marking which features were kept
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
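# A hypothetical usage sketch (assumptions: `feature_names`, `X_matrix` and `y_labels`
# are prepared elsewhere). The function returns the reduced matrix, the names of the
# discarded features, and the number of features that were kept.
matrix_selected, unused_names, n_kept = gbdt_feature_selection(
    feature_names, X_matrix, y_labels, th='mean')
print(matrix_selected.shape, n_kept)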
"""


#Load the libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.feature_selection import SelectFromModel
import numpy as np

#Load the data
iris = datasets.load_iris()
features = iris.data
target = iris.target
#Create a random forest classifier object
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
#Create a selector that keeps features whose importance is greater than or equal to the threshold
selector = SelectFromModel(randomforest, threshold=0.3)
#Use the selector to create the new feature matrix
features_important = selector.fit_transform(features, target)
#Train a random forest model on the important features
model = randomforest.fit(features_important, target)
#Compute the feature importances
importances = model.feature_importances_
#Inspect the importance of each feature in the model
print(importances)
#Sort the feature importances in descending order
indices = np.argsort(importances)[::-1]
#Reorder the feature names by importance
names = [iris.feature_names[i] for i in indices]
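#A short follow-up sketch (an assumption, not in the original example): print each
#reordered feature name next to its sorted importance, from most to least important.
for name, importance in zip(names, importances[indices]):
    print("%s: %.3f" % (name, importance))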
        fit_mod = sel.fit(data_np, target_np)
        print(sel.ranking_)
        sel_idx = fit_mod.get_support()

    if fs_type == 2:
        #Wrapper Select via model
        if binning == 0:
            clf = DecisionTreeClassifier(criterion='gini',
                                         splitter='best',
                                         max_depth=None,
                                         min_samples_split=3,
                                         min_samples_leaf=1,
                                         max_features=None,
                                         random_state=rand_st)
            sel = SelectFromModel(
                clf, prefit=False, threshold='mean', max_features=None
            )  #to select only based on max_features, set to integer value and set threshold=-np.inf
            print('Wrapper Select: ')
        if binning == 1:
            rgr = '''Unused in this homework'''
            sel = SelectFromModel(rgr,
                                  prefit=False,
                                  threshold='mean',
                                  max_features=None)
            print('Wrapper Select: ')

        fit_mod = sel.fit(data_np, target_np)
        sel_idx = fit_mod.get_support()

    if fs_type == 3:
        if binning == 1:  ###### Only works if the target is binned ######
Exemplo n.º 53
df = df.set_index('trace:id')

df_join = df2['status']

df_all = df.join(df_join, how='inner')
df_all = df_all.dropna()
y = df_all.pop('status')
df_all = df_all.drop('qr', axis=1)

print(y)
print(len(df_all))
# plot high dim data
plot_data(df_all, y, 'TSNE', 2)
#print(df_all.columns)
pipe = make_pipeline(
    SelectFromModel(estimator=RandomForestClassifier(
        n_estimators=100, max_depth=2, random_state=0)),
    LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=5000))
lr = LogisticRegression(solver='lbfgs',
                        multi_class='auto',
                        max_iter=5000,
                        class_weight='balanced')
#rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf = RandomForestClassifier(n_estimators=500,
                            max_depth=5,
                            random_state=0,
                            class_weight={
                                'nok': 4,
                                'ok': 1
                            })
print(cross_val_score(lr, df_all, y, scoring='accuracy', cv=5).mean())
print(cross_val_score(rf, df_all, y, scoring='accuracy', cv=5).mean())
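# For comparison (an assumption, a natural extension of the two scores above), the
# SelectFromModel + LogisticRegression pipeline defined earlier can be scored with the
# same 5-fold cross-validation to show the effect of the embedded feature selection.
print(cross_val_score(pipe, df_all, y, scoring='accuracy', cv=5).mean())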
Exemplo n.º 54
    error_train = mean_squared_error(y_tr, y_tr_pred)
    error_test = mean_squared_error(y_ts, y_ts_pred)
    error_std_train = mean_squared_error(y_std_tr, y_std_tr_pred)
    error_std_test = mean_squared_error(y_std_ts, y_std_ts_pred)

    print("---------------------------------------")
    print("# Mean Squared Error:")
    print(regressor_name + " MSE train: %.3f, test: %.3f" % (error_train, error_test))
    print(regressor_name + " STD MSE train: %.3f, test: %.3f" % (error_std_train, error_std_test))

# Performance improvement
print("\n\n\n======================")
print("PERFORMANCE IMPROVEMENT")
clf = LassoCV(cv=5)
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(x, y)
n_features = sfm.transform(x).shape[1]
while n_features > 4:
    sfm.threshold += 0.1
    x_new = sfm.transform(x)
    n_features = x_new.shape[1]

# Standardizing
sc_x = StandardScaler()
x_std_new = sc_x.fit_transform(x_new)

sc_y = StandardScaler()
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

# Splitting train and test data
Exemplo n.º 55
             drop=False, nan=False),
   'clipper': OutliersClipper(columns=['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']),
   'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
   'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
   'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)],
             default_k=5),
   'poly': PolynomialsAdder(powers_per_column={'LotFrontage': [2], 'LotArea': [2], 'MasVnrArea': [2], 'BsmtFinSF1': [2], 'BsmtFinSF2': [2], 'BsmtUnfSF': [2], 'TotalBsmtSF': [2], '1stFlrSF': [2], '2ndFlrSF': [2], 'LowQualFinSF': [2], 'GrLivArea': [2], 'GarageArea': [2], 'WoodDeckSF': [2], 'OpenPorchSF': [2], 'EnclosedPorch': [2], '3SsnPorch': [2], 'ScreenPorch': [2], 'PoolArea': [2], 'MiscVal': [2]}),
   'predictor': DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
   'reduce_dim': SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
              oob_score=False, random_state=None, verbose=0, warm_start=False),
           norm_order=1, prefit=False, threshold=None),
   'simple_imputer': FillNaTransformer(from_dict={},
            mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
            nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], zero=[])},
  'score': 37866.96889728187,
  'std': 5359.25597193946},
 'Lasso': {'params': {'binner': None,
   'binner2': CustomBinaryBinner(configuration={'LotFrontage': {'values': [182.0]}, 'LotArea': {'values': [215245]}, 'MasVnrArea': {'values': [1378.0]}, 'BsmtFinSF1': {'values': [2188]}, 'BsmtFinSF2': {'values': [1120]}, 'BsmtUnfSF': {'values': [2336]}, 'TotalBsmtSF': {'values': [3206]}, '1stFlrSF': {'values': [3228]}, '2ndFlrSF': ... [2010.0]}, 'GarageCars': {'values': [4]}, 'MoSold': {'values': [12]}, 'YrSold': {'values': [2010]}},
             drop=False, nan=False),
   'clipper': None,
   'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
   'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
   'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)],
Exemplo n.º 56
train['Trap'] = lbl.transform(train['Trap'].values)

lbl.fit(list(train['CodeSum_x'].values))  # + list(test['CodeSum_x'].values))
train['CodeSum_x'] = lbl.transform(train['CodeSum_x'].values)

lbl.fit(list(train['CodeSum_y'].values))  # + list(test['CodeSum_y'].values))
train['CodeSum_y'] = lbl.transform(train['CodeSum_y'].values)

########################################################################################################################
train = train.astype(float)

#train = train.loc[:,(train != -1).any(axis=0)]

label = train.WnvPresent
train = train.drop('WnvPresent', axis=1)
sfm = SelectFromModel(LinearSVC(penalty='l1', loss='squared_hinge',
                                dual=False))
data = sfm.fit_transform(train, label)
data = preprocessing.scale(data)
#data = preprocessing.scale(train)
transformer = FunctionTransformer(np.log1p, validate=True)
transformer.transform(data)  # NOTE: the result is not assigned, so this call has no effect as written
data = preprocessing.normalize(data, norm='l2')

feature_cols = train.columns
databackup = data
data = pd.DataFrame(sfm.inverse_transform(data),
                    index=train.index,
                    columns=feature_cols)
selCols = data.columns[data.var() != 0]
data = data[selCols]
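# A minimal evaluation sketch (an assumption, the original training step is not shown
# here): score the reduced feature table against the WnvPresent label with a
# cross-validated logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

print(cross_val_score(LogisticRegression(max_iter=1000), data, label,
                      scoring='roc_auc', cv=5).mean())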
def main():

    # Input datasets as pandas dataframes
    df_original = pd.read_csv('Predictive Modelling Train.txt',
                              decimal=",",
                              sep="|")
    df_original_predict = pd.read_csv('Predictive Modelling Test.txt',
                                      decimal=",",
                                      sep="|")

    #---------------------------------------------------------------------------#
    #                           3- DATA MANIPULATION                            #
    #---------------------------------------------------------------------------#

    # Train - Drop ID and remove duplicates in train set
    df_original = df_original.drop(['ID'], 1)
    df_original = df_original.drop_duplicates()

    # Predict - Save ID of dataframe to predict
    test_id = df_original_predict.ID

    # Predict - Drop ID and convert to numpy array
    test_preprocessed = df_original_predict.drop(["ID"], 1)
    X_predict = np.array(test_preprocessed)

    # Train - Separate class and features
    X = np.array(df_original.drop(['TARGET'], axis=1))
    y = np.array(df_original.TARGET.values)

    # Split train dataset
    X_to_balance, X_real_test, y_to_balance, y_real_test = train_test_split(
        X, y, test_size=test_size_value, random_state=random_state_value)

    # Oversample data (TARGET=1) to balance
    sm = SMOTE(kind='regular')
    X_balanced, y_balanced = sm.fit_sample(X_to_balance, y_to_balance)

    # Create new features
    df_real = feature_engineering_df(X_real_test, df_original)
    df_balanced = feature_engineering_df(X_balanced, df_original)
    df_test_processed = feature_engineering_df(X_predict, df_original)

    # Convert pandas dataframes to numpy arrays
    X_balanced = np.array(df_balanced)
    X_real_test = np.array(df_real)
    X_predict = np.array(df_test_processed)

    #---------------------------------------------------------------------------#
    #                           4- FEATURE SELECTION                            #
    #---------------------------------------------------------------------------#

    # Define classifier for feature importance and selection
    clf1 = ExtraTreesClassifier(n_jobs=-1, random_state=random_state_value)

    selector = clf1.fit(X_balanced, y_balanced)

    # Choose best features
    fs = SelectFromModel(selector, prefit=True)

    # Discard non selected features
    X_real_test = fs.transform(X_real_test)
    X_balanced = fs.transform(X_balanced)
    X_predict_final = fs.transform(X_predict)

    #---------------------------------------------------------------------------#
    #                           5- MODEL TRAIN + FIT                            #
    #---------------------------------------------------------------------------#

    # Define prediction classifier and fit
    clf2 = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)

    clf2.fit(X_balanced, y_balanced)

    #---------------------------------------------------------------------------#
    #                           6- MODEL EVALUATION                             #
    #---------------------------------------------------------------------------#

    # !!! IMPORTANT: COMMENT OUT THIS WHOLE BLOCK WHEN DOING THE REAL TRAINING AND PREDICTION (test_size_value = 0)

    # Print used classifiers and their parameters
    print("Feature selection classifier: ", clf1, "\n")
    print("Model classifier: ", clf2, "\n")

    # Calculate predictions
    y_pred = clf2.predict_proba(X_real_test)[:, 1]
    y_pred_int = clf2.predict(X_real_test)

    # Evaluate model
    print("Roc AUC: ", roc_auc_score(y_real_test, y_pred, average='macro'))
    accuracy = clf2.score(X_real_test, y_real_test)
    print("Accuracy: ", accuracy)
    print("f1 Score: ", f1_score(y_real_test, y_pred_int, average='macro'))

    # Hardcoded benchmark of filling prediction with most common class (0)
    zeros_benchmark = 1 - 7477 / 459992
    print("Filling with 0's benchmark:    ", zeros_benchmark)

    # Fixed accuracy with zeros_benchmark
    print("Fixed accuracy with benchmark: ",
          (accuracy - zeros_benchmark) / (1 - zeros_benchmark))

    # Confusion matrix
    conf_matrix = confusion_matrix(y_real_test, y_pred_int)
    print("\n[Confusion Matrix]: \n", conf_matrix)

    print(
        "\n------------------------------------------------------------------\n\n"
    )

    #---------------------------------------------------------------------------#
    #                       7- PREDICTION AND SUBMISSION                        #
    #---------------------------------------------------------------------------#

    # Make prediction
    predict_submission = clf2.predict(X_predict_final)

    # Save in csv
    submission = pd.DataFrame({"ID": test_id, "TARGET": predict_submission})
    submission.to_csv("submission.csv", index=False, sep="|")
Exemplo n.º 58
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(data[:train_objs_num], y)


features = pd.DataFrame()
features['feature'] = data.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)

features.plot(kind='barh', figsize=(25, 25))

from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(data[:train_objs_num])
test_reduced = model.transform(data[train_objs_num:])
print(train_reduced.shape,test_reduced.shape)


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV


logreg = LogisticRegression()
Exemplo n.º 59
                            idx.append(k)
                    # Compute the mean coefficient for this group of features
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self
import scipy.io as sio
image_path = r'./mat/simple.mat'
image_D = sio.loadmat(image_path)
X = image_D['dataset']
y = image_D['label2']
'''
iris = load_iris()
X, y = iris.data, iris.target
y.resize((150,1))
y = np.hstack((y,np.zeros((150,1))))
'''
from sklearn.ensemble import  ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X2 = np.delete(X,range(16,32,1),1)

model = ExtraTreesClassifier()
model.fit(X2, y)
print(model.feature_importances_)
#Feature selection with a logistic regression base model that combines L1 and L2 penalties
#The threshold parameter is the threshold on the difference between the coefficient weights
a = SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(X, y)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
a = SelectFromModel(lsvc,prefit=True)
a = a.transform(X)
print("X_new 共有 %s 个特征"%a.shape[1])
Exemplo n.º 60
madelonY = madelon['Class'].copy().values

adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
    adultX, adultY, test_size=0.3, random_state=0, stratify=adultY)
madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
    madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

pipeA = Pipeline([('Scale', StandardScaler()),
                  ('MLP',
                   MLPClassifier(max_iter=2000,
                                 early_stopping=True,
                                 random_state=55))])

pipeM = Pipeline([('Scale', StandardScaler()),
                  ('Cull1',
                   SelectFromModel(RandomForestClassifier(random_state=1),
                                   threshold='median')),
                  ('Cull2',
                   SelectFromModel(RandomForestClassifier(random_state=2),
                                   threshold='median')),
                  ('Cull3',
                   SelectFromModel(RandomForestClassifier(random_state=3),
                                   threshold='median')),
                  ('Cull4',
                   SelectFromModel(RandomForestClassifier(random_state=4),
                                   threshold='median')),
                  ('MLP',
                   MLPClassifier(max_iter=2000,
                                 early_stopping=True,
                                 random_state=55))])

d = adultX.shape[1]