Example No. 1
def test_select_percentile_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the percentile heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
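
A brief aside on the expected mask (an illustrative sketch, not part of the original test): with shuffle=False the 3 informative and 2 redundant columns occupy the first five positions, and percentile=25 keeps roughly the top 25% of the 20 scored features, i.e. exactly those five columns.

# sketch: expected number of selected features under the settings above
n_features, percentile = 20, 25
n_selected = int(n_features * percentile / 100)  # == 5
assert n_selected == 3 + 2  # informative + redundant columns, which lead because shuffle=False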
Example No. 2
def test_select_percentile_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the percentile heuristic
    """
    X, Y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example No. 3
def test_select_percentile_regression_full():
    """
    Test whether the relative univariate feature selection
    selects all features when '100%' is asked.
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="percentile", param=100).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example No. 4
def test_select_percentile_regression_full():
    """
    Test whether the relative univariate feature selection
    selects all features when '100%' is asked.
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                    param=100).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example No. 5
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="percentile", param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
Example No. 6
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                    param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
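
For reference, a small sketch (reusing the names from the test above) of what the final inverse_transform assertion guarantees: the reduced matrix is mapped back to the original 20-column layout, the selected columns are preserved, and the dropped columns are zero-filled.

X_back = univariate_filter.inverse_transform(X_r)
assert X_back.shape == X.shape                          # back to (200, 20)
assert_array_equal(X_back[:, support], X[:, support])   # kept columns unchanged
assert not X_back[:, np.logical_not(support)].any()     # dropped columns are all zeros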
Example No. 7
def selected_features(pair_data, labels, features):
    # Filter out low-variance features
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)

    # In this experiment no feature actually gets filtered out; the step is just an illustration

    print 'vt_sel.get_support()====', vt_sel.get_support()
    sel_features1 = features[vt_sel.get_support()]

    sel_pair_data1 = pair_data[:, vt_sel.get_support()]
    print 'Low-variance filtering removed %d features' % (features.shape[0] - sel_features1.shape[0])

    print 'features.shape[0]====', features.shape[0], '======', features.shape
    print 'sel_features1.shape[0]====', sel_features1.shape[0], '=========', sel_features1.shape

    # 2. Select features by univariate statistical analysis
    # Keep the top 95% most important features
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)

    sel_features2 = sel_features1[sp_sel.get_support()]

    sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()]
    print 'Univariate statistical analysis removed %d features' % (sel_features1.shape[0] - sel_features2.shape[0])

    # Plot a bar chart of the feature scores
    # (index=features only lines up with sp_sel.scores_ because step 1 removed nothing;
    #  otherwise sel_features1 would be the correct index)
    feat_ser = pd.Series(data=sp_sel.scores_, index=features)
    sort_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sort_feat_ser.plot(kind='bar')
    plt.savefig('../feat_importance.png')
    plt.show()
    return sel_pair_data2, sel_features2
Example No. 8
def select_features(pair_data, labels, features):
    """
    Perform feature selection.
    """
    print 'Selecting features...'

    # 1. Filter out "low-variance" feature columns
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)

    # In this experiment no feature actually gets filtered out; the step is just an illustration
    sel_features1 = features[vt_sel.get_support()]
    sel_pair_data1 = pair_data[:, vt_sel.get_support()]
    print '"Low-variance" filtering removed %d features' % (features.shape[0] - sel_features1.shape[0])

    # 2. Select features by "univariate statistical analysis"
    # Keep the top 95% most important features
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)

    sel_features2 = sel_features1[sp_sel.get_support()]
    sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()]
    print '"Univariate statistical analysis" removed %d features' % (sel_features1.shape[0] -
                                                                     sel_features2.shape[0])

    # Plot a bar chart of the feature scores
    feat_ser = pd.Series(data=sp_sel.scores_, index=features)
    sorted_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sorted_feat_ser.plot(kind='bar')
    plt.savefig('./feat_importance.png')
    plt.show()

    return sel_pair_data2, sel_features2
Example No. 9
def select_features(pair_data, labels, features):
    # 1. Filter out "low-variance" feature columns
    vt_sel = VarianceThreshold(threshold=(0.9 * (1 - 0.9)))
    vt_sel.fit(pair_data)
    # print(vt_sel.get_support())

    # Drop the noisy (near-constant) features
    features = features[vt_sel.get_support()]
    pair_data = pair_data[:, vt_sel.get_support()]
    # print(pair_data)

    # 2. Keep the most important 95% of the features
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(pair_data, labels)
    features = features[sp_sel.get_support()]
    pair_data_1 = pair_data[:, sp_sel.get_support()]
    # print(pair_data_1)
    return pair_data_1, features
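
The same two-stage filter can also be expressed as a single scikit-learn Pipeline; a minimal sketch (assuming the same pair_data and labels arrays as above, and noting that the column-name bookkeeping done with features is not handled by the pipeline itself):

from sklearn.feature_selection import SelectPercentile, VarianceThreshold
from sklearn.pipeline import Pipeline

selector = Pipeline([
    ('variance', VarianceThreshold(threshold=0.9 * (1 - 0.9))),  # drop near-constant columns
    ('percentile', SelectPercentile(percentile=95)),             # keep the top-scoring 95%
])
reduced = selector.fit_transform(pair_data, labels)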
Example No. 10
        Xn = csr_matrix(np.array((0, 0)))
        yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag +
                               '_train.npy')
        print Counter(yn)

        filepath = 'MANUAL'
        print load_numpy_matrix(feature_set_path + 'featureArray' + tag +
                                '_train.npy').shape
        print load_numpy_matrix(feature_set_path + 'socialVector' + tag +
                                '_train.npy').shape
        Xn = np.hstack((load_numpy_matrix(feature_set_path + 'featureArray' +
                                          tag + '_train.npy'),
                        load_numpy_matrix(feature_set_path + 'socialVector' +
                                          tag + '_train.npy')))

        Xn = SelectPercentile(score_func=f_classif,
                              percentile=perc).fit_transform(Xn, yn)

        if split:
            sss = StratifiedShuffleSplit(yn,
                                         1,
                                         test_size=0.85,
                                         random_state=42)
            for train, test in sss:
                Xn, yn = Xn[train], yn[train]

        parameter_tuning(Xn, yn, scale=1)
        print "DONE WITH MANUAL"

    if sparse_tests:
        filepaths = list()
        #         filepaths.append(feature_set_path + 'binaryWordData' + tag + '_train.npz')
Example No. 11
from sklearn.externals import joblib


#get the data_txt from DB
numDimensions = 22
numFolds = 5

X_train = uux_data.getUUXSentences(numDimensions)
y_train = uux_data.getUUXSentenceDimension(numDimensions)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)

target_names = uux_data.getUUXDimensions(numDimensions)


#data_txt preprocessing - tokenization, then chi2 selection of the best-scoring features
vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
X_train_features = vectorizer.fit_transform(X_train)
X_train_features_names = vectorizer.fit(X_train).vocabulary_

ch2 = SelectPercentile(chi2, percentile=16)
X_train_features = ch2.fit_transform(X_train_features, y_train_binary)
selected_features_names = np.asarray(vectorizer.get_feature_names())[ch2.get_support()]
print str(len(selected_features_names))

classifier = Pipeline([
    ('tfidf', vectorizer),
    ('chi2', ch2),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, y_train_binary)
joblib.dump(classifier, 'classifier/uux_classifier.pkl')
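
Note that the separate fit_transform calls above are only needed to inspect the names of the surviving features; Pipeline.fit re-fits the vectorizer and the chi2 selector on the raw sentences anyway. A small sketch (reusing the names defined above) of reading the selected vocabulary back from the fitted pipeline instead:

selected = np.asarray(classifier.named_steps['tfidf'].get_feature_names())[
    classifier.named_steps['chi2'].get_support()]
print len(selected)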
Example No. 12
from sklearn.feature_extraction.text import TfidfTransformer

f = open('../sentistrength/data_txt/combined/truth_dataset_3_scale.txt')
lines = f.readlines()
f.close()

sentences = []
sentiments = []

for line in lines:
    row = []
    elements = line.rstrip('\r\n').split('\t')
    sentences.append(elements[1])
    sentiments.append(int(elements[0]))

ch2 = SelectPercentile(chi2, percentile=96)

# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {'tfidf__use_idf': (True, False)}

# K-Fold cross-validation strategy
skf = cross_validation.StratifiedKFold(sentiments, n_folds=5)

mnb_grid = GridSearchCV(pipeline,
Example No. 13
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
import numpy as np

X_train = uux_data.getUUXSentences(22)
y_train = uux_data.getUUXSentenceDimension(22)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)
target_names = uux_data.getUUXDimensions(22)

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('feature_selection', SelectPercentile()),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2), (2, 2)),  # unigrams or bigrams
    'vect__lowercase': (True, False),
    'tfidf__use_idf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'feature_selection__score_func': (chi2, f_classif),
    'feature_selection__percentile': (25, 90)
Example No. 14
        'Lemma',
        #                          'Form',
        #                          'LemmaFormDiff_Back',   # class
        #                          'LemmaFormDiff_Front',  # not used, yet
        'LemmaSuff_1',
        'LemmaSuff_2',
        'LemmaSuff_3',
        'LemmaSuff_4',  #'LemmaSuff_5', 'LemmaSuff_6',
        #'LemmaSuff_7', 'LemmaSuff_8',
        'Tag_POS',
        'Tag_CPOS',
        'NEIGHBOR-1_Tag_POS',
        'NEIGHBOR-1_Tag_CPOS',
        'NEIGHBOR-1_Lemma'
    ],
    #          'filter_attr': lambda key, val: False if key.startswith('Tag') and val in ['.', '-'] else True,
    'vectorizer':
    DictVectorizer(),
    'feature_filter':
    SelectPercentile(percentile=20),
    'classifier_class':
    LogisticRegression,
    'classifier_params': {
        'penalty': ['l1'],
        'C': [1, 10, 100, 1000],
        'tol': [0.01, 0.001, 0.0001]
    },
    'unfold_pattern':
    '^(penalty|C|tol)$'
}
Example No. 15
# Support Vector Machine
for perc in range(1, 100, 2):

    vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize,
                                 ngram_range=(1, 1),
                                 max_df=0.5,
                                 lowercase=False)
    tfidfTans = TfidfTransformer(use_idf=True,
                                 sublinear_tf=True,
                                 smooth_idf=False,
                                 norm='l2')

    classifier = Pipeline([
        ('vect', vectorizer),
        ('tfidf', tfidfTans),
        ('feature_selection', SelectPercentile(chi2, percentile=perc)),
        ('clf', LinearSVC(C=0.1, multi_class='ovr')),
    ])

    scores = cross_validation.cross_val_score(classifier,
                                              sentences,
                                              sentiments,
                                              cv=5,
                                              scoring='precision')
    results = np.append(results, scores.mean())
    skf = cross_validation.StratifiedKFold(sentiments, n_folds=10)
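
As an aside, the manual percentile sweep above can also be run as a grid search over the pipeline's feature-selection step; a minimal sketch (assuming the same sentences and sentiments lists and a pipeline built as above, and using the newer sklearn.model_selection API rather than the deprecated cross_validation module):

from sklearn.model_selection import GridSearchCV

param_grid = {'feature_selection__percentile': list(range(1, 100, 2))}
search = GridSearchCV(classifier, param_grid, cv=5, scoring='precision')  # mirrors the scoring above
search.fit(sentences, sentiments)
print(search.best_params_)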

# Multinomial Naive Bayes
# for perc in range(1, 100, 2):
#
#     vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize, ngram_range=(1, 2), max_df=0.5, lowercase=True)
Example No. 16
y_train = uux_data.getUUXSentenceDimension(numDimensions)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)

target_names = uux_data.getUUXDimensions(numDimensions)

x_train_folds, x_test_folds, y_train_folds, y_test_folds = uux_labelset_stratification.kFoldStratify(
    numFolds)
target_names = uux_data.getUUXDimensions(numDimensions)

percentiles = range(1, 100, 5)
results = []

for perc in range(1, 100, 5):

    p = np.empty([numFolds])
    ch2 = SelectPercentile(chi2, percentile=perc)

    # perform 5-fold cross-validation
    for i in range(0, numFolds):

        #data_txt preprocessing - tokenization, then chi2 selection of the best features at the current percentile
        vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
        X_train_features = vectorizer.fit_transform(x_train_folds[i])
        X_train_features_names = vectorizer.fit(x_train_folds[i]).vocabulary_

        X_train_features = ch2.fit_transform(X_train_features,
                                             y_train_folds[i])
        selected_features_names = np.asarray(
            vectorizer.get_feature_names())[ch2.get_support()]

        classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2),
Example No. 17
			'RandomizedLogisticRegression':RandomizedLogisticRegression(),
			'RandomizedPCA':RandomizedPCA(),
			'Ridge':Ridge(),
			'RidgeCV':RidgeCV(),
			'RidgeClassifier':RidgeClassifier(),
			'RidgeClassifierCV':RidgeClassifierCV(),
			'RobustScaler':RobustScaler(),
			'SGDClassifier':SGDClassifier(),
			'SGDRegressor':SGDRegressor(),
			'SVC':SVC(),
			'SVR':SVR(),
			'SelectFdr':SelectFdr(),
			'SelectFpr':SelectFpr(),
			'SelectFwe':SelectFwe(),
			'SelectKBest':SelectKBest(),
			'SelectPercentile':SelectPercentile(),
			'ShrunkCovariance':ShrunkCovariance(),
			'SkewedChi2Sampler':SkewedChi2Sampler(),
			'SparsePCA':SparsePCA(),
			'SparseRandomProjection':SparseRandomProjection(),
			'SpectralBiclustering':SpectralBiclustering(),
			'SpectralClustering':SpectralClustering(),
			'SpectralCoclustering':SpectralCoclustering(),
			'SpectralEmbedding':SpectralEmbedding(),
			'StandardScaler':StandardScaler(),
			'TSNE':TSNE(),
			'TheilSenRegressor':TheilSenRegressor(),
			'VBGMM':VBGMM(),
			'VarianceThreshold':VarianceThreshold(),}

    
Example No. 18
# Y=labelProp.transduction_
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
#normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, 1000)

# FEATURE EXTRACTION
#rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()
Example No. 19
        'LemmaSuff_5', 'LemmaSuff_6', 'LemmaSuff_7', 'LemmaSuff_8', 'Tag_POS',
        'Tag_SubPOS', 'Tag_Gen', 'Tag_Num', 'Tag_Cas', 'Tag_PGe', 'Tag_PNu',
        'Tag_Per', 'Tag_Ten', 'Tag_Gra', 'Tag_Neg', 'Tag_Voi', 'Tag_Var'
    ],

    # This filters out some feature values (here 'Tag_*' values equal to '.' or '-').
    # You can use an arbitrary lambda function here (or None if you don't want it).
    'filter_attr':
    lambda key, val: False
    if key.startswith('Tag') and val in ['.', '-'] else True,
    'vectorizer':
    DictVectorizer(),

    # Feature filtering using ANOVA (recommended)
    'feature_filter':
    SelectPercentile(percentile=10),

    # You can use any Scikit-Learn classifier here
    'classifier_class':
    LogisticRegression,

    # Classifier parameter settings (see Scikit-Learn documentation for the list of parameters).
    # If you use lists instead of single values and specify the unfold_pattern, all the values
    # in the lists will be tried in parallel on a cluster using qsub.
    # Do not use lists of values and the unfold_pattern setting if you don't have access to
    # cluster/qsub.
    'classifier_params': {
        'penalty': ['l1', 'l2'],
        'C': [1, 10, 100, 1000],
        'tol': [0.01, 0.001, 0.0001]
    },
Example No. 20
labels_test_gold = data['labels']
labels_test_gold.shape = (labels_test_gold.shape[1], )
# with open('../feature_names.pickle', 'r') as pickled:
#    feature_names = pickle.load(pickled)

print "Loaded data; testing classifier..."

features_train, labels_train = ClassBalancingClassifierWrapper.rebalance(
    features_train, labels_train, ratio=2)

results = []
for i in range(15):
    print 'Round', i
    classifier = DecisionTreeClassifier()
    classifier = SKLPipeline([('feature_selection',
                               SelectPercentile(f_classif, 1)),
                              ('classification', classifier)])
    classifier.fit(features_train, labels_train)

    labels_test_predicted = classifier.predict(features_test)
    results.append(diff_binary_vectors(labels_test_predicted,
                                       labels_test_gold))

# support = classifier.steps[0][1].get_support(True)
# print 'Selected', len(support), 'features:'
# for index in support:
#    print '   ', feature_names[index]

print 'Results:'
print ClassificationMetrics.average(results, False)