예제 #1
0
파일: test_base.py 프로젝트: yueyuep/TCNN
def test_function_sampler_reject_sparse():
    """A sampler built with ``accept_sparse=False`` must refuse CSR input."""
    expected_message = "A sparse matrix was passed, but dense data is required"
    csr_features = sparse.csr_matrix(X)
    dense_only_sampler = FunctionSampler(accept_sparse=False)
    # The error is expected from ``fit_resample`` itself, not from the setup.
    with pytest.raises(TypeError, match=expected_message):
        dense_only_sampler.fit_resample(csr_features, y)
예제 #2
0
def test_function_sampler_reject_sparse():
    """Check that sparse input raises ``TypeError`` when sparse is disallowed."""
    sparse_input = sparse.csr_matrix(X)
    strict_sampler = FunctionSampler(accept_sparse=False)
    error_pattern = "A sparse matrix was passed, but dense data is required"

    with pytest.raises(TypeError, match=error_pattern):
        strict_sampler.fit_resample(sparse_input, y)
예제 #3
0
def test_function_resampler_fit():
    """Check that validation is bypassed when calling ``fit``.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782
    """
    def keep_first_sample(features, target):
        return features[:1], target[:1]

    # The data deliberately contains NaN and inf entries.
    data = np.array([[1, np.nan], [2, 3], [np.inf, 4]])
    labels = np.array([0, 1, 1])

    unchecked_sampler = FunctionSampler(func=keep_first_sample, validate=False)
    # Both entry points must run without raising on the non-finite values.
    unchecked_sampler.fit(data, labels)
    unchecked_sampler.fit_resample(data, labels)
예제 #4
0
def test_function_sampler_func(X, y):
    """The sampler output must match what the user callable returns."""
    n_kept = 10

    def take_head(features, target):
        # Keep only the leading ``n_kept`` samples.
        return features[:n_kept], target[:n_kept]

    head_sampler = FunctionSampler(func=take_head)
    X_res, y_res = head_sampler.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X[:n_kept])
    assert_array_equal(y_res, y[:n_kept])
예제 #5
0
def test_function_sampler_func(X, y):
    """Resampling through a custom callable returns the callable's output."""
    first_ten = slice(None, 10)

    def keep_first_ten(data, labels):
        return data[first_ten], labels[first_ten]

    X_res, y_res = FunctionSampler(func=keep_first_ten).fit_resample(X, y)

    # Compare against the same slice taken directly from the inputs.
    assert_allclose_dense_sparse(X_res, X[first_ten])
    assert_array_equal(y_res, y[first_ten])
예제 #6
0
def test_function_sampler_func_kwargs(X, y):
    """Compare a callable configured via ``kw_args`` with a direct call.

    The callable wraps ``RandomUnderSampler``; its result must equal the one
    obtained from ``RandomUnderSampler(random_state=0)`` used directly.
    """
    def undersample(X, y, sampling_strategy, random_state):
        under_sampler = RandomUnderSampler(
            sampling_strategy=sampling_strategy, random_state=random_state)
        return under_sampler.fit_resample(X, y)

    forwarded_kwargs = {'sampling_strategy': 'auto', 'random_state': 0}
    sampler = FunctionSampler(func=undersample, kw_args=forwarded_kwargs)
    X_res, y_res = sampler.fit_resample(X, y)

    # Reference result computed without going through FunctionSampler.
    reference = RandomUnderSampler(random_state=0)
    X_res_2, y_res_2 = reference.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
예제 #7
0
def test_function_sampler_func_kwargs(X, y):
    """Check the ``kw_args`` route against a plain ``RandomUnderSampler``."""
    def random_undersample(X, y, sampling_strategy, random_state):
        return RandomUnderSampler(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
        ).fit_resample(X, y)

    # Expected output: the under-sampler applied directly with the same seed.
    X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y)

    wrapped = FunctionSampler(
        func=random_undersample,
        kw_args={"sampling_strategy": "auto", "random_state": 0},
    )
    X_res, y_res = wrapped.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
예제 #8
0
def test_function_sampler_identity(X, y):
    """With no callable supplied, resampling must return the data unchanged."""
    resampled_X, resampled_y = FunctionSampler().fit_resample(X, y)
    assert_allclose_dense_sparse(resampled_X, X)
    assert_array_equal(resampled_y, y)
# ``fit_resample``.


def outlier_rejection(X, y):
    """Drop the samples that an isolation forest flags as outliers.

    This is the function used to resample the dataset.

    Parameters
    ----------
    X : array supporting boolean-mask indexing
        Feature matrix; the isolation forest is fitted on it.
    y : array supporting boolean-mask indexing
        Targets aligned with the rows of ``X``.

    Returns
    -------
    X_inliers, y_inliers
        The rows of ``X`` and the entries of ``y`` for which the fitted
        forest predicts ``1``.
    """
    # ``behaviour='new'`` is intentionally not passed: scikit-learn deprecated
    # the parameter in 0.22 and removed it in 0.24, where supplying it raises
    # a ``TypeError``.
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    # Build the mask once and reuse it so X and y are filtered consistently.
    inlier_mask = model.predict(X) == 1
    return X[inlier_mask], y[inlier_mask]


# Wrap the rejection function in a sampler, filter the training set with it
# and plot what is left.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(
    X_train,
    y_train,
)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

pipe = make_pipeline(
    FunctionSampler(func=outlier_rejection),
    LogisticRegression(
        solver='lbfgs',
        multi_class='auto',
        random_state=rng,
    ),
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))
def plot_scatter(X, y, title):
    """Draw the first two columns of ``X`` as a scatter plot, one colour per class.

    Samples with ``y == 1`` are drawn in red and samples with ``y == 0`` in
    blue, on a new 16x16 figure with a legend and the given ``title``.
    """
    plt.figure(figsize=(16, 16))
    # (class value, colour, legend label), drawn in this order.
    class_styles = (
        (1, 'r', 'Class #1 - Fraud'),
        (0, 'b', 'Class #0 - Non-Fraud'),
    )
    for class_value, colour, legend_label in class_styles:
        plt.scatter(X[y == class_value, 0],
                    X[y == class_value, 1],
                    c=colour,
                    label=legend_label)
    plt.legend()
    plt.title(title)


# Run the outlier filter on the arrays behind ``X`` and ``y`` and plot the
# samples that survive.
X_vals, y_vals = X.values, y.values
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

# Report how many samples were dropped and the sizes that remain.
print("Total outliers removed: {:}".format(len(X_vals) - len(X_inliers)))
print("New lenght of X: {} ; new length of y {}".format(
    *map(len, (X_inliers, y_inliers))))


def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):
    """Set up a pipeline run on ``data``; the rest of the body is not visible.

    NOTE(review): this excerpt is truncated after the first statement of the
    ``clean`` branch, so only the setup below is documented.  Judging by the
    name, the function presumably evaluates models without any resampling
    step -- confirm against the full source.

    Parameters
    ----------
    data : object with a ``drop`` method, default=[]
        Input table containing a ``'Class'`` column (presumably a pandas
        DataFrame -- confirm).  NOTE(review): the default is a mutable list,
        which has no ``drop`` method, so calling with ``clean=True`` and the
        default ``data`` would fail.
    verbose, clean, plot : bool, default=False
        Flags; only ``clean`` is used in the visible part of the body.
    """

    # Accumulators and a fixed seed; their use lies outside this excerpt.
    results_table = []
    results = []
    rand_state = 42

    if clean:
        # X is ``data`` with the 'Class' column dropped.
        X = data.drop('Class', axis=1)
예제 #11
0
def test_function_sampler_identity(X, y):
    """A default ``FunctionSampler`` must return its input untouched."""
    identity_sampler = FunctionSampler()
    outputs = identity_sampler.fit_resample(X, y)
    X_res, y_res = outputs
    assert_allclose_dense_sparse(X_res, X)
    assert_array_equal(y_res, y)
#print(blind_reviews.shape,blind_sentiments.shape)

#OVERSAMPLING of Data
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn import FunctionSampler

# Hold out 30% of the reviews for testing; the fixed seed keeps the split
# reproducible.
(train_reviews,
 test_reviews,
 train_sentiments,
 test_sentiments) = train_test_split(
    reviews_data['reviews_text'],
    reviews_data['sentiment'],
    test_size=0.3,
    random_state=0,
)

def resample(X, y):
    """Return ``(X, y)`` as resampled by a default ``RandomOverSampler``."""
    oversampler = RandomOverSampler()
    return oversampler.fit_resample(X, y)

# Validation is switched off on the sampler (presumably because the features
# are raw review strings rather than numbers -- confirm).
sampler = FunctionSampler(func=resample, validate=False)
# RandomOverSampler expects arrays, so the reviews are passed as a
# single-column 2-D numpy array and the sentiments as a 1-D numpy array.
# ``Series.ravel`` is deprecated since pandas 2.2; ``to_numpy()`` is the
# documented replacement for getting the underlying 1-D array.
train_reviews, train_sentiments = sampler.fit_resample(
    train_reviews.values.reshape(-1, 1), train_sentiments.to_numpy())

# Flatten the single review column back to 1-D before rebuilding the Series.
train_reviews = train_reviews.reshape(train_reviews.size,)

train_reviews = pd.Series(train_reviews)
train_sentiments = pd.Series(train_sentiments)
# Class counts after oversampling; the result is not stored, so it is only
# visible when run interactively.
train_sentiments.value_counts()

# CountVectorizer implements both tokenization and occurrence counting in a single class. Read more here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# You can also reuse the from-scratch code we learnt in the previous class.
# TfidfVectorizer converts a collection of raw documents to a matrix of TF-IDF features. It is equivalent to CountVectorizer followed by TfidfTransformer.
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Count thresholds for the vectorizer:
#   lower_count_thr -- rare words/tokens
#   upper_count_thr -- frequent/common tokens
lower_count_thr, upper_count_thr = 50, 200
# ``fit_resample``.


def outlier_rejection(X, y):
    """Keep only the samples that an isolation forest predicts as inliers.

    This is the function used to resample the dataset.

    Parameters
    ----------
    X : array supporting boolean-mask indexing
        Feature matrix used to fit the isolation forest.
    y : array supporting boolean-mask indexing
        Targets aligned with the rows of ``X``.

    Returns
    -------
    X_inliers, y_inliers
        ``X`` and ``y`` restricted to the samples whose prediction is ``1``.
    """
    # The ``behaviour`` argument is dropped on purpose: it was deprecated in
    # scikit-learn 0.22 and removed in 0.24, so passing it now raises a
    # ``TypeError``.
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    # A single mask keeps the filtering of X and y in lockstep.
    keep = model.predict(X) == 1
    return X[keep], y[keep]


# Apply the rejection function through a sampler and plot the remaining
# training samples.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(
    X_inliers,
    y_inliers,
    'Training data without outliers',
)

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

pipe = make_pipeline(
    FunctionSampler(func=outlier_rejection),
    LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng),
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))