def test_function_sampler_reject_sparse():
    """Sparse input must raise ``TypeError`` when ``accept_sparse=False``."""
    # Regex handed to ``pytest.raises``; kept as the same two adjacent
    # literals so the resulting pattern is unchanged.
    expected_message = (
        "A sparse matrix was passed, "
        "but dense data is required"
    )
    csr_input = sparse.csr_matrix(X)
    dense_only_sampler = FunctionSampler(accept_sparse=False)
    with pytest.raises(TypeError, match=expected_message):
        dense_only_sampler.fit_resample(csr_input, y)
def test_function_sampler_reject_sparse():
    """Check that a CSR matrix is refused by a dense-only ``FunctionSampler``."""
    sparse_features = sparse.csr_matrix(X)
    strict_sampler = FunctionSampler(accept_sparse=False)
    error_pattern = "A sparse matrix was passed, " "but dense data is required"
    # Build the context manager first so the ``with`` line stays short.
    expect_type_error = pytest.raises(TypeError, match=error_pattern)
    with expect_type_error:
        strict_sampler.fit_resample(sparse_features, y)
def test_function_resampler_fit():
    """Check that validation is bypassed when calling ``fit``.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/782
    """

    def func(X, y):
        # Keep only the first sample of each array.
        return X[:1], y[:1]

    # The feature matrix deliberately contains NaN and inf entries.
    features = np.array([[1, np.nan], [2, 3], [np.inf, 4]])
    labels = np.array([0, 1, 1])

    unvalidated_sampler = FunctionSampler(func=func, validate=False)
    # Both entry points are exercised; the test passes if neither raises.
    unvalidated_sampler.fit(features, labels)
    unvalidated_sampler.fit_resample(features, labels)
def test_function_sampler_func(X, y):
    """``fit_resample`` output must match what the supplied ``func`` returns."""
    n_kept = 10

    def func(X, y):
        # Truncate both arrays to their first ``n_kept`` entries.
        return X[:n_kept], y[:n_kept]

    truncating_sampler = FunctionSampler(func=func)
    resampled = truncating_sampler.fit_resample(X, y)
    X_res, y_res = resampled

    assert_allclose_dense_sparse(X_res, X[:n_kept])
    assert_array_equal(y_res, y[:n_kept])
def test_function_sampler_func_kwargs(X, y):
    """Compare a ``kw_args``-configured ``FunctionSampler`` with a direct call.

    The wrapped function runs a ``RandomUnderSampler`` built from the values
    supplied through ``kw_args``; its output is checked against a
    ``RandomUnderSampler(random_state=0)`` applied directly to the same data.
    """

    def func(X, y, sampling_strategy, random_state):
        under_sampler = RandomUnderSampler(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
        )
        return under_sampler.fit_resample(X, y)

    extra_arguments = {'sampling_strategy': 'auto', 'random_state': 0}
    sampler = FunctionSampler(func=func, kw_args=extra_arguments)
    X_res, y_res = sampler.fit_resample(X, y)

    reference_sampler = RandomUnderSampler(random_state=0)
    X_res_2, y_res_2 = reference_sampler.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
def test_function_sampler_func_kwargs(X, y):
    """Check ``FunctionSampler`` with ``kw_args`` against ``RandomUnderSampler``.

    Expected result: the data resampled through the wrapped function equals
    the data produced by ``RandomUnderSampler(random_state=0)`` directly.
    """

    def func(X, y, sampling_strategy, random_state):
        # Delegate to a RandomUnderSampler built from the received settings.
        return RandomUnderSampler(
            sampling_strategy=sampling_strategy, random_state=random_state
        ).fit_resample(X, y)

    # Expected output, computed without going through FunctionSampler.
    # The two resampling calls are independent, so their order is free.
    expected = RandomUnderSampler(random_state=0).fit_resample(X, y)
    X_res_2, y_res_2 = expected

    wrapped = FunctionSampler(
        func=func, kw_args={"sampling_strategy": "auto", "random_state": 0}
    )
    X_res, y_res = wrapped.fit_resample(X, y)

    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
def test_function_sampler_identity(X, y):
    """A ``FunctionSampler`` built without arguments must return its input."""
    identity_sampler = FunctionSampler()
    resampled_X, resampled_y = identity_sampler.fit_resample(X, y)

    # Features and targets are both compared against the untouched input.
    assert_allclose_dense_sparse(resampled_X, X)
    assert_array_equal(resampled_y, y)
# ``fit_resample``.


def outlier_rejection(X, y):
    """Resampling function: keep the rows an isolation forest predicts as 1."""
    # NOTE(review): ``behaviour`` is a version-sensitive IsolationForest
    # argument and newer scikit-learn releases may not accept it -- confirm
    # against the pinned scikit-learn version before touching it.
    forest = IsolationForest(
        max_samples=100,
        contamination=0.4,
        random_state=rng,
        behaviour='new',
    )
    forest.fit(X)
    # Build the boolean mask once and reuse it for both arrays.
    keep_mask = forest.predict(X) == 1
    return X[keep_mask], y[keep_mask]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

outlier_filter = FunctionSampler(func=outlier_rejection)
classifier = LogisticRegression(
    solver='lbfgs', multi_class='auto', random_state=rng
)
pipe = make_pipeline(outlier_filter, classifier)
# The original chains ``predict`` onto the return value of ``fit``; this
# split assumes ``fit`` returns the pipeline itself, as scikit-learn
# estimators conventionally do.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))
def plot_scatter(X, y, title):
    """Scatter-plot the first two columns of ``X``, coloured by class in ``y``.

    Rows where ``y == 1`` are drawn in red and rows where ``y == 0`` in blue,
    on a new 16x16 figure with a legend and the given ``title``.
    """
    plt.figure(figsize=(16, 16))
    # (class value, colour, legend label), drawn in this order.
    class_styles = (
        (1, 'r', 'Class #1 - Fraud'),
        (0, 'b', 'Class #0 - Non-Fraud'),
    )
    for class_value, colour, legend_label in class_styles:
        class_mask = y == class_value
        plt.scatter(
            X[class_mask, 0], X[class_mask, 1], c=colour, label=legend_label
        )
    plt.legend()
    plt.title(title)


reject_sampler = FunctionSampler(func=outlier_rejection)
# Hand the underlying ``.values`` to the sampler rather than X / y themselves.
X_vals, y_vals = X.values, y.values
X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

n_outliers_removed = len(X_vals) - len(X_inliers)
print("Total outliers removed: {:}".format(n_outliers_removed))
print(
    "New lenght of X: {} ; new length of y {}".format(
        len(X_inliers), len(y_inliers)
    )
)


# NOTE(review): only the beginning of this function is visible in the
# reviewed excerpt, so the visible statements are reproduced unchanged.
def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):
    results_table = []
    results = []
    rand_state = 42
    if clean:
        X = data.drop('Class', axis=1)
#print(blind_reviews.shape,blind_sentiments.shape)

# Oversampling of the training data
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn import FunctionSampler

# 70/30 split of review text and sentiment labels, with a fixed seed.
(
    train_reviews,
    test_reviews,
    train_sentiments,
    test_sentiments,
) = train_test_split(
    reviews_data['reviews_text'],
    reviews_data['sentiment'],
    test_size=0.3,
    random_state=0,
)


def resample(X, y):
    """Oversample ``(X, y)`` with a freshly constructed ``RandomOverSampler``."""
    over_sampler = RandomOverSampler()
    return over_sampler.fit_resample(X, y)


sampler = FunctionSampler(func=resample, validate=False)

# reshape and ravel convert the pandas objects to numpy arrays, which is the
# form handed to RandomOverSampler here.
review_column = train_reviews.values.reshape(-1, 1)
sentiment_vector = train_sentiments.ravel()
train_reviews, train_sentiments = sampler.fit_resample(
    review_column, sentiment_vector
)

# Flatten the single-column result back to 1-D before wrapping it in a Series.
train_reviews = train_reviews.reshape(train_reviews.size,)
train_reviews = pd.Series(train_reviews)
train_sentiments = pd.Series(train_sentiments)
train_sentiments.value_counts()

# CountVectorizer implements both tokenization and occurrence counting in a
# single class. Read more here
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# You can also reuse the from-scratch code we learnt in the previous class.
# TfidfVectorizer converts a collection of raw documents to a matrix of TF-IDF
# features. Equivalent to CountVectorizer followed by TfidfTransformer.
#
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Count thresholds for the vectorizer
lower_count_thr = 50  # rare words/tokens
upper_count_thr = 200  # frequent/common tokens
# ``fit_resample``.


def outlier_rejection(X, y):
    """Return the subset of ``(X, y)`` that an isolation forest predicts as 1.

    An ``IsolationForest`` is fitted on ``X`` and used to predict on the same
    data; only the rows whose prediction equals 1 are kept in both arrays.
    """
    # NOTE(review): ``behaviour='new'`` is tied to specific scikit-learn
    # versions -- verify that the installed release still accepts it.
    forest_settings = {
        'max_samples': 100,
        'contamination': 0.4,
        'random_state': rng,
        'behaviour': 'new',
    }
    detector = IsolationForest(**forest_settings)
    detector.fit(X)
    predictions = detector.predict(X)
    is_kept = predictions == 1
    return X[is_kept], y[is_kept]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before the training, the classifier will be less
# affected during the prediction.

pipe = make_pipeline(
    FunctionSampler(func=outlier_rejection),
    LogisticRegression(
        solver='lbfgs',
        multi_class='auto',
        random_state=rng,
    ),
)
fitted_pipe = pipe.fit(X_train, y_train)
y_pred = fitted_pipe.predict(X_test)
print(classification_report(y_test, y_pred))