from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset
iris = load_iris()

# Make the dataset imbalanced: keep only the second half of the first class.
# Fix: the original slice ``25:-1`` also discarded the very last sample.
iris.data = iris.data[25:, :]
iris.target = iris.target[25:]

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=RANDOM_STATE)

# Create a pipeline: NearMiss-2 under-sampling followed by a linear SVM.
# NearMiss is a deterministic heuristic, so it takes no ``random_state``
# (the parameter was deprecated and removed in imbalanced-learn >= 0.6).
pipeline = make_pipeline(NearMiss(version=2),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
# NOTE(review): this chunk begins mid-function — the enclosing TF 1.x
# training-loop function starts outside this chunk.
X_batch, y_batch = next(training_generator)
sess.run(
    [train_op, loss],
    feed_dict={
        data: X_batch,
        targets: y_batch
    },
)
# For each epoch, run accuracy on train and test
predicts_train = sess.run(predict, feed_dict={data: X})
print("epoch: {} train accuracy: {:.3f}".format(
    e, accuracy(y, predicts_train)))


@pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()])
def test_balanced_batch_generator(data, sampler):
    # Dispatch the check on the installed TensorFlow major version.
    if LooseVersion(tf.__version__) < '2':
        check_balanced_batch_generator_tf_1_X_X(data, sampler)
    else:
        check_balanced_batch_generator_tf_2_X_X_compat_1_X_X(data, sampler)


@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_function_sparse(data, keep_sparse):
    # NOTE(review): truncated here — the balanced_batch_generator(...) call
    # is cut off at the end of this chunk.
    X, y = data
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, keep_sparse=keep_sparse,
# heuristic rules in order to select samples. NearMiss-1 selects samples from # the majority class for which the average distance of the :math:`k`` nearest # samples of the minority class is the smallest. NearMiss-2 selects the samples # from the majority class for which the average distance to the farthest # samples of the negative class is the smallest. NearMiss-3 is a 2-step # algorithm: first, for each minority sample, their :math:`m` # nearest-neighbors will be kept; then, the majority samples selected are the # on for which the average distance to the :math:`k` nearest neighbors is the # largest. # %% from imblearn.under_sampling import NearMiss X, y = create_dataset(n_samples=1000, weights=(0.05, 0.15, 0.8), class_sep=1.5) samplers = [NearMiss(version=1), NearMiss(version=2), NearMiss(version=3)] fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25)) for ax, sampler in zip(axs, samplers): model = make_pipeline(sampler, clf).fit(X, y) plot_decision_function( X, y, model, ax[0], title=f"Decision function for {sampler.__class__.__name__}-{sampler.version}", ) plot_resampling( X, y, sampler,
print('Accuracy : ', accuracy_score(y, predictions), end='\n')
print('Classification Report : ', end='\n')
print(classification_report(y, predictions))

# # Class Imbalance - UnderSampling

# In[96]:

####################### Class Imbalance - Undersampling ######################
# Separate the features from the target column.
X_train = df_train.iloc[:, df_train.columns != 'renewal_status'].values
y_train = df_train['renewal_status'].values

from imblearn.under_sampling import NearMiss

# Under-sample the majority class with NearMiss so both classes are balanced.
nr = NearMiss()
# ``fit_resample`` replaces the deprecated ``fit_sample`` (removed in
# imbalanced-learn 0.6).
X_train, y_train = nr.fit_resample(X_train, y_train)

# In[97]:

# Per-class sample counts after resampling (sanity check for balance).
np.bincount(y_train)

# In[98]:

# import the ML algorithm
from xgboost import XGBClassifier

# Instantiate the classifier
xgbClassifier = XGBClassifier(random_state=1, learning_rate=0.01)

# Train classifier
labels = []
index = 0
for line in contain:  # read the data file line by line
    line = line.strip()  # strip whitespace from both ends of the line
    # split the line on any of space, comma, semicolon, colon or tab
    listFormLine = re.split(r'[ ,;:\t]+', line)
    # put all but the last column of listFormLine into the feature matrix
    features[index:] = listFormLine[0:len(listFormLine) - 1]
    labels.append(listFormLine[-1])  # the last column is the class label
    index += 1
# ``features`` is the feature matrix, ``labels`` the list of class labels
labels=np.array([int(x) for x in labels])
file.close()
X=features
y=labels

# Apply Nearmiss (version 2: keeps majority samples closest, on average, to
# the farthest minority samples).
# NOTE(review): ``fit_sample`` is the pre-0.6 imbalanced-learn API; newer
# releases spell it ``fit_resample``.
nm=NearMiss(version=2)
X_resampled = []
y_resampled = []
X_res,y_res=nm.fit_sample(X,y)
X_resampled.append(X_res)
y_resampled.append(y_res)
y_resampled=y_resampled[0]
X_resampled=X_resampled[0]
# Add a trailing axis so labels can be stacked as the last CSV column.
y_resampled=y_resampled[:,np.newaxis]
resampled=np.hstack((X_resampled,y_resampled)).tolist()
# NOTE(review): truncated here — the write loop is cut off mid-body at the
# end of this chunk (the non-final-column branch only).
f=open("re_NearMiss2.csv",'w')
for i in range(len(resampled)):
    for j in range(len(resampled[i])):
        if j<len(resampled[i])-1:
            f.write(str(resampled[i][j])+',')
def sample_data(self, sampling_method: str, X_train, Y_train, base_file_name,
                target_column="star_rating"):
    """Resample ``X_train``/``Y_train`` according to ``sampling_method``.

    The class distribution after resampling is also written to a CSV file
    derived from ``base_file_name``.

    :param sampling_method: one of ``smote``, ``adasyn``,
        ``random_over_sampling``, ``random_under_sampling``, ``nearmiss2``
    :param X_train: original features (pandas DataFrame)
    :param Y_train: original labels (pandas Series/DataFrame)
    :param base_file_name: base file name to save the final distribution csv
    :param target_column: name of the label column
    :return: tuple ``(X_train, Y_train)`` holding the resampled data
    :raises Exception: if ``sampling_method`` is not supported
    """
    ## if we want to over sample or under sample
    log.debug(f'Y_train {Y_train.shape}')
    log.debug(f'Y_train {Y_train.head()}')
    grouped_df = Y_train.reset_index().groupby(target_column).count()
    log.info(
        f'Distribution before sampling with {sampling_method}\n{grouped_df}'
    )
    log.debug(f'grouped type: {type(grouped_df)}')
    log.debug(f'grouped: {grouped_df.head()}')
    log.debug(f'grouped: {grouped_df.shape}')

    if sampling_method == "smote":
        sampler = SMOTE(random_state=RSTATE,
                        sampling_strategy='not majority',
                        n_jobs=self.n_jobs)
    elif sampling_method == "adasyn":
        sampler = ADASYN(random_state=RSTATE,
                         sampling_strategy='not majority',
                         n_jobs=self.n_jobs)
    elif sampling_method == "random_over_sampling":
        sampler = RandomOverSampler(random_state=RSTATE,
                                    sampling_strategy='not majority')
    elif sampling_method == "random_under_sampling":
        sampler = RandomUnderSampler(random_state=RSTATE, replacement=True)
    elif sampling_method == "nearmiss2":
        # NearMiss is deterministic: ``random_state`` was deprecated and then
        # removed from it in imbalanced-learn >= 0.6, so it is not passed.
        sampler = NearMiss(sampling_strategy='not minority',
                           version=2,
                           n_jobs=self.n_jobs)
    else:
        raise Exception(
            f"Sampling method not supported: {sampling_method}")

    X_train_res, Y_train_res = sampler.fit_resample(
        X_train, Y_train.ravel())
    # Rebuild pandas containers so downstream code keeps column names.
    X_train = pd.DataFrame(X_train_res, columns=X_train.columns)
    Y_train = pd.DataFrame(Y_train_res, columns=[target_column])

    # Log and persist the distribution of samples after resampling.
    dist = Y_train.reset_index().groupby(target_column).count()
    log.info(f'Distribution after sampling with {sampling_method}\n{dist}')
    log.debug(dist.head())
    dist.to_csv(
        f'{REPORT_DIR}/{base_file_name}-histogram-{sampling_method}.csv')
    return X_train, Y_train
def crossvalidate(directory_name, splits, data, X, y, baseline=-1, model_num=None, resample=0, feature_set=None, feature_importance=0, average_method='macro', path=None):
    """
    Store the results calculated according to the arguments and store them in a file.

    Arguments:
    directory_name (str): the directory under which the files should be stored
    splits (int): number of folds
    data (dataframe): the whole dataset
    X (dataframe): examples
    y (dataframe): target/label
    baseline (int): -1 for no baseline, 1 for all predictions as 1, 0 for all predictions as 0
    model_num (int): classification model 1: 2: 3: 4: 5: 6:
    resample (int): -1 for undersampling, 1 for oversampling and 0 for no resampling
    feature_set (list): list of features to be considered
    feature_importance (int): 0 for absent, 1 for present
    average_method: macro by default
    path: the path to the directory where the recordings should be stored
    """
    #prepare the dictionary to be written to the file
    data_dict = dict()
    metrics_dict = dict()
    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)
    #create a directory for each split
    for fold in range(1, splits + 1):
        os.mkdir(dir_name + str(fold))
        print(dir_name + str(fold))
    #open the config file for writing
    config_file = open(dir_name + 'config.json', 'w')
    #open the metrics file for writing
    metrics_file = open(dir_name + 'metrics.json', 'w')
    data_dict = {'model_num': model_num}
    # NOTE(review): the next line REBINDS data_dict, discarding 'model_num'
    # just stored above — presumably this was meant to be .update({...}).
    data_dict = {'baseline': baseline}
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    # NOTE(review): ``n_features`` is not defined anywhere in this function or
    # its parameters — this line raises NameError unless it is a global.
    data_dict.update({'n_features': n_features})
    data_dict.update({'feature_importance': feature_importance})
    # One list of per-fold scores per metric; averaged at the end.
    metrics_dict = dict()
    metrics_dict['f1_macro'] = list()
    metrics_dict['tpr'] = list()
    metrics_dict['tnr'] = list()
    metrics_dict['fpr'] = list()
    metrics_dict['precision'] = list()
    metrics_dict['recall'] = list()
    metrics_dict['accuracy'] = list()
    metrics_dict['f1'] = list()
    model = get_model(model_num)
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    #if model_num == 3:
    #kfold = ShuffleSplit(n_splits=splits, test_size=0.2, random_state=0)
    # Plot the learning curve once, before cross-validation.
    plot_lc(model=model, cv=kfold, X=X, y=y, resample=resample)
    #linearity
    test_for_linearity(X, y)
    i = 0
    for train_index, test_index in kfold.split(X, y):
        #create train-test splits
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        '''
        #create test set labels for the baseline if applicable
        if baseline == 0:
            y_test = y_test.replace(1,0)
        elif baseline == 1:
            y_test = y_test.replace(0,1)
        '''
        #resample the training set (if applicable)
        if resample == -1:
            #undersample
            '''NearMiss 3 . NearMiss-3 is a 2-step algorithm: first, for each minority sample, their :m nearest-neighbors will be kept; then, the majority samples selected are the on for which the average distance to the k nearest neighbors is the largest.'''
            nm = NearMiss(version=3)
            print(str(sorted(Counter(y_train).items())))
            X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_train).items()))
        elif resample == 1:
            #oversample
            X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_resampled).items()))
        #write the training dataset class distribution to the file
        file = open(dir_name + str(i + 1) + '/train_val_dist.csv', 'a')
        file.write(str(sorted(Counter(y_train).items())))
        file.write('\n')
        file.close()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # NOTE(review): model.predict usually returns a numpy array, which has
        # no .replace method — these baseline branches look pandas-specific;
        # confirm the model wrapper returns a Series here.
        if baseline == 0:
            y_pred = y_pred.replace(1, 0)
        elif baseline == 1:
            y_pred = y_pred.replace(0, 1)
        metrics = get_metrics(y_test, y_pred)
        for key, value in metrics.items():
            metrics_dict[key].append(value)
        #homoscedasticity
        test_for_homoscedasticity(X_train, y_train, X_test, y_test)
        #correlation (recomputed every fold on the full dataset)
        correlation(data)
        if feature_importance == 1:
            if model_num == 1:
                feat_importances = pd.Series(model.feature_importances_,
                                             index=X.columns)
            elif model_num == 3:
                # NOTE(review): ``svm`` is not defined in this function —
                # presumably this should read model.coef_; verify.
                feat_importances = pd.Series(abs(svm.coef_[0]), index=X.columns)
            if model_num != 2:
                print('Feat. Imp.: ', feat_importances)
                feat_importances.nlargest(20).plot(kind='barh')
                #plot_importance(model)
                plt.show()
                #write the feature importance values to the file
                file = open(dir_name + str(i + 1) + '/feature_importances.csv', 'a')
                for ind in range(0, len(feature_set)):
                    file.write(feature_set[ind] + ',' + str(feat_importances[ind]) + '\n')
                file.close()
            # Permutation importance on the fitted model for this fold.
            perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)
            print('PERM: ', perm.feature_importances_)
            display(
                eli5.show_weights(perm, feature_names=X_train.columns.tolist()))
            #write the permutation feature importance decrease in error values to the file
            file = open(
                dir_name + str(i + 1) + '/permutation_feature_importances.csv', 'a')
            for ind in range(0, len(feature_set)):
                file.write(feature_set[ind] + ',' + str(perm.feature_importances_[ind]) + '\n')
            file.write('\n')
            file.close()
        i += 1
    # Average the per-fold scores into a single value per metric.
    for key, values in metrics_dict.items():
        metrics_dict[key] = sum(values) / len(values)
    #write the scores to the file
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()
    #write the configuration values to the file
    json.dump(data_dict, config_file)
    config_file.close()
def undersample_data(self, X, y):
    """Under-sample the majority class of ``(X, y)`` with NearMiss.

    :param X: feature matrix; the result is flattened to 1-D, so this
        assumes a single feature column (TODO confirm with callers)
    :param y: class labels
    :return: tuple ``(heart_signal_res, labels_res)`` where the resampled
        signal has shape ``(n_samples,)``
    """
    # Pass the strategy by keyword and use fit_resample: the positional
    # first argument and ``fit_sample`` are the pre-0.6 imbalanced-learn API.
    under_sampler = NearMiss(sampling_strategy='majority', n_jobs=2)
    heart_signal_res, labels_res = under_sampler.fit_resample(X, y)
    # NearMiss returns a 2-D array; flatten back to a 1-D signal.
    heart_signal_res = np.reshape(heart_signal_res,
                                  (heart_signal_res.shape[0],))
    return heart_signal_res, labels_res
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# reading data
data = pd.read_csv("creditcard.csv")

# splitting data: first 30 columns are features, last column is the label
read = data.columns.tolist()
x = data.iloc[:, 0:30]
y = data.iloc[:, -1]

# analyzing data
print(x.shape)
print(y.shape)
print(data.isnull().values.any())  # looking for null values

# Per-class counts to show the imbalance.
dif = count_classes = pd.value_counts(data["Class"], sort=True)
print(dif)

# plotting the imbalanced data
dif.plot(kind="bar", rot=0)
plt.title("Fraud vs Normal transactions")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

from imblearn.under_sampling import NearMiss

# Under-sample the majority (normal) class so both classes are balanced.
# fit_resample replaces the deprecated fit_sample (removed in
# imbalanced-learn 0.6).
nm = NearMiss()
x_new, y_new = nm.fit_resample(x, y)
print(x_new.shape)
print(y_new.shape)

from collections import Counter
print("Original Data", format(Counter(y)))
print("resample Data", format(Counter(y_new)))
from imblearn.under_sampling import NearMiss

# Build a dataframe with the fertility data.
# ``header=None`` replaces ``header=-1``, which was deprecated and removed
# from pandas.read_csv.
fertility_df = pandas.read_csv('fertility_Diagnosis.txt', header=None)
fertility_df.columns = ['Season','Age','Childish diseases','Accident or serious trauma','Surgical intervention',
                        'High fevers in the last year','Frequency of alcohol consumption','Smoking habit',
                        'Number of hours spent sitting per day ene-16','Output']

# Map the output column of the data to numeric values.
fertility_df['Output'] = fertility_df['Output'].map({'N': 0, 'O': 1}).astype(int)

# Split the output labels off the samples.
fertility_df_output = fertility_df['Output']
del fertility_df['Output']

# Balance the data, based on the imbalanced outputs of the dataset.
# NearMiss is deterministic, so no random_state is needed (the parameter was
# removed in imbalanced-learn >= 0.6); fit_resample replaces fit_sample.
nm = NearMiss()
fertility_df_balanced, fertility_output_balanced = nm.fit_resample(fertility_df, fertility_df_output)
# fertility_df_balanced = fertility_df.as_matrix()
# fertility_output_balanced = fertility_df_output.tolist()

# Split the data: 70% train / 30% test.
training_data, test_data, training_output, test_output = train_test_split(
    fertility_df_balanced, fertility_output_balanced, test_size=0.3,
    random_state=42)

# Train the MLP.
quantidade_features = training_data.shape[1]
mlp = MultiLayerPerceptron(
    numero_de_entradas=quantidade_features,
    neuronios_por_camada=[quantidade_features, 1],
    taxa_aprendizagem=0.5,
    epocas=5000,
    precisao=0,
    debug_training=False,
    plot=False
)
# Evaluate each random under/over-sampler with an SVM.
for key in random_samplers:
    print("######################## %s ########################" % (key))
    rus = random_samplers.get(key)
    model = logistic_regression.Module(X_train.shape[1], 2)
    X_res, y_res = rus.fit_sample(X_train, y_train)
    print(X_train.shape)
    print(X_res.shape, y_res.shape)
    print(np.sum(y_res))
    clf = SVC(probability=True)
    clf.fit(X_res, y_res)
    score = clf.predict_proba(X_test)
    evaluate(y_test, score)

# near miss
near_miss_models = {
    'near miss1': NearMiss(random_state=0, version=1),
    'near miss2': NearMiss(random_state=0, version=2),
    'near miss3': NearMiss(random_state=0, version=3)
}
for key in near_miss_models:
    print("######################## %s ########################" % (key))
    nm = near_miss_models.get(key)
    model = logistic_regression.Module(X_train.shape[1], 2)
    # Fix: resample with THIS iteration's NearMiss sampler; the original
    # called ``rus.fit_sample`` (the last sampler of the previous loop) by
    # copy-paste mistake, so the NearMiss variants were never actually used.
    X_res, y_res = nm.fit_sample(X_train, y_train)
    print(X_train.shape)
    print(X_res.shape, y_res.shape)
    print(np.sum(y_res))
    clf = SVC(probability=True)
    clf.fit(X_res, y_res)
    score = clf.predict_proba(X_test)
    evaluate(y_test, score)
def near_miss(X, y):
    """Balance ``(X, y)`` by NearMiss under-sampling of the majority class."""
    sampler = NearMiss()
    return sampler.fit_resample(X, y)
def test_nearmiss_wrong_version():
    """Check that fitting with an unknown NearMiss version raises ValueError."""
    bad_sampler = NearMiss(version=1000, random_state=RND_SEED)
    assert_raises(ValueError, bad_sampler.fit_sample, X, Y)
def plot_lc(model, X, y, cv, resample=0): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) #resample the training set (if applicable) if resample == -1: #undersample '''NearMiss 3 . NearMiss-3 is a 2-step algorithm: first, for each minority sample, their :m nearest-neighbors will be kept; then, the majority samples selected are the on for which the average distance to the k nearest neighbors is the largest.''' nm = NearMiss(version=3) #print(str(sorted(Counter(y_train).items()))) X_resampled, y_resampled = nm.fit_resample(X_train, y_train) X_train = X_resampled y_train = y_resampled #print(sorted(Counter(y_train).items())) elif resample == 1: #oversample X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train) X_train = X_resampled y_train = y_resampled print(sorted(Counter(y_resampled).items())) train_sizes, train_scores, test_scores = learning_curve( estimator=model, X=X, y=y, train_sizes=np.linspace(0.01, 1.0, 50), cv=cv, scoring='f1_macro') # Create means and standard deviations of training set scores train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) # Create means and standard deviations of test set scores test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) # Draw lines plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score") plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score") # Draw bands plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD") plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD") # Create plot plt.title("Learning Curve") plt.xlabel("Training Set Size"), plt.ylabel("Macro-F1 Score"), plt.legend( loc="best") plt.tight_layout() plt.show()
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GroupKFold
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

train_data = pd.read_csv(root_dir + "train.csv")
# Features start at 'mean_x'; labels and grouping key are separate columns.
X = train_data.loc[:, 'mean_x':].values
y = train_data.loc[:, 'activity_id'].values
groups = train_data.loc[:, 'user_id'].values

#%%-------------------------------------------------------------------------
start_time = time.time()
# NOTE(review): ``ratio=`` and NearMiss ``random_state=`` belong to the
# pre-0.4 imbalanced-learn API; current releases use ``sampling_strategy=``
# and NearMiss no longer accepts ``random_state``.
nm = NearMiss(random_state=31416, ratio='auto', n_jobs=-1)
sm1 = SMOTE(random_state=31416, ratio='auto', k_neighbors=5, n_jobs=-1)
sm3 = SMOTE(random_state=31416, ratio='auto', k_neighbors=5, n_jobs=-1)
ros = RandomOverSampler(random_state=31416)

# Grouped CV: samples of a given user never appear in both train and test.
gkf = GroupKFold(n_splits=4)
scores_test = []
scores_train = []
for train_index, test_index in gkf.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # NOTE(review): truncated here — the triple-quoted block below is cut
    # off at the end of this chunk.
    '''
    activity_filter = (y_train==1) | (y_train==5)
    X_res = X_train[activity_filter]
def test_nearmiss_error(nearmiss_params, err_msg):
    """A misconfigured NearMiss must raise ValueError matching ``err_msg``."""
    sampler = NearMiss(**nearmiss_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
def underSampling(X, Y):
    """Under-sample the majority class of ``(X, Y)`` with NearMiss-1."""
    sampler = NearMiss(version=1)
    return sampler.fit_resample(X, Y)
# NOTE(review): this chunk begins mid-call — these are the trailing keyword
# arguments of a make_classification(...) invocation whose opening is outside
# this chunk.
                           weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Nearmiss 3
# NOTE(review): ``fit_sample`` is the pre-0.6 imbalanced-learn API; newer
# releases spell it ``fit_resample``.
nm3 = NearMiss(version=3)
X_resampled, y_resampled = nm3.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
# NOTE(review): truncated here — this scatter call is cut off at the end of
# this chunk.
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
def main():
    """Grid over (sampler x outlier-detector) combinations, training and
    permutation-testing a model for each pair and printing the results."""
    # Candidate under-samplers; None means "no resampling, use weights".
    # NOTE(review): NearMiss ``random_state`` was removed in imbalanced-learn
    # >= 0.6 (NearMiss is deterministic) — this targets an older release.
    samplers = [
        None,
        InstanceHardnessThreshold(sampling_strategy='majority',
                                  random_state=123,
                                  n_jobs=-1),
        NearMiss(version=1, sampling_strategy='majority', random_state=123,
                 n_jobs=-1),
        NearMiss(version=3, sampling_strategy='majority', random_state=123,
                 n_jobs=-1),
        RandomUnderSampler(sampling_strategy='majority', random_state=123)
    ]
    # Candidate outlier removers; None means "keep all samples".
    # NOTE(review): IsolationForest ``behaviour='new'`` is deprecated/removed
    # in recent scikit-learn.
    outliers = [
        None,
        IsolationForest(random_state=123, behaviour='new', contamination=0.1),
        LocalOutlierFactor(n_neighbors=27, contamination=0.1)
    ]
    for sampler in samplers:
        for out in outliers:
            # Globals consumed elsewhere (e.g. for result labelling).
            global sampler_str, out_str, perm_str
            # NOTE(review): when sampler/out is None these names become
            # "NoneType" — confirm downstream reporting expects that.
            sampler_str = sampler.__class__.__name__
            out_str = out.__class__.__name__
            print(f"\nsampler={sampler_str}, outlier={out_str}")
            X, y, X_valid, y_valid = Dataset.read_all()
            X, y, X_valid, y_valid = Modification.apply_standartization(
                X, y, X_valid, y_valid)
            print(X.shape)
            if out is not None:
                X, y = Modification.apply_outliers(X, y, out)
                print(X.shape)
            if sampler is None:
                # No resampling: compensate imbalance with sample weights.
                weights, weight_valid = Modification.make_weights_column(
                    X, y, X_valid, y_valid)
            else:
                weights, weight_valid = None, None
                # NOTE(review): indentation reconstructed — confirm the two
                # statements below are only reached when sampler is not None.
                X, y = Modification.apply_samplers(X, y, sampler)
                if "Instance" in sampler_str:
                    X, y = Modification.apply_samplers(
                        X, y,
                        RandomUnderSampler(sampling_strategy='majority',
                                           random_state=123))
            # Three rounds: baseline fit, then two permutation passes.
            print("0st perm:")
            perm_str = "0st"
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)
            print("1st perm:")
            perm_str = "1st"
            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)
            print("2nd perm:")
            perm_str = "2nd"
            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            Model.train(X, y, X_valid, y_valid, weights, weight_valid)
    print(results)
    analyze_results()
print('Total time - Without Undersampling: ', end - start, ' seconds\n')
print(metrics.classification_report(y_validation, validation_result))
print()
print('Without Undersampling - Pipeline Score {}'.format(
    multiC.fit(X_train, y_train).score(X_validation, y_validation)))
print()
print_results("Without Undersampling - Validation set: ", true_validation,
              validation_result)
print('===============================Without Undersampling Ends===============================\n')

print('================================With Undersampling Starts===============================\n')
start = time.time()

# build model with undersampling
# NearMiss is deterministic: random_state was deprecated and removed from it
# in imbalanced-learn >= 0.6, so it is no longer passed.
nearmiss_pipeline = make_pipeline_imb(NearMiss(), multiC)
nearmiss_model = nearmiss_pipeline.fit(X_train, y_train)
nearmiss_prediction = nearmiss_model.predict(X_validation)

# Print the distribution of labels for both models
print()
print("Without Undersampling - data distribution: {}".format(Counter(y_train)))
# fit_resample replaces the deprecated fit_sample (removed in 0.6).
X_nearmiss, y_nearmiss = NearMiss().fit_resample(X_train, y_train)
print("With Undersampling - data distribution: {}".format(Counter(y_nearmiss)))
print()

end = time.time()

# Here comes the result with Undersampling
print('Total time - With Undersampling: ', end - start, ' seconds\n')
print(classification_report_imbalanced(y_validation, nearmiss_prediction))
plt.title('Data Distribution')
plt.xlabel('Label')
plt.ylabel('Count')

# Split the data by class to inspect the imbalance.
label_0 = data[data['SepsisLabel'] == 0]
label_1 = data[data['SepsisLabel'] == 1]
print(label_0.shape, label_1.shape)

X = data.drop('SepsisLabel', axis=1).values
y = data['SepsisLabel'].values
# Drop the first column — presumably an index/id column; verify upstream.
X = X[:, 1:]
print(X)

from imblearn.under_sampling import NearMiss

# Balance the classes by under-sampling the majority class.
# NOTE(review): ``fit_sample`` is the pre-0.6 imbalanced-learn API; newer
# releases spell it ``fit_resample``.
nm = NearMiss()
X_res, y_res = nm.fit_sample(X, y)
print(X_res.shape, y_res.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,
                                                    test_size=0.2,
                                                    random_state=0)

import pickle
pickle.dump(X_test, open('X_test.pkl', 'wb'))

from xgboost import XGBClassifier
# NOTE(review): truncated here — the XGBClassifier(...) call is cut off at
# the end of this chunk.
model = XGBClassifier(min_child_weight=3,
# NOTE(review): this chunk begins mid-call — these are the trailing keyword
# arguments of a make_classification(...) invocation whose opening is outside
# this chunk.
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instanciate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Nearmiss, one sampler per heuristic version.
# NOTE(review): ``return_indices=True`` and ``fit_sample`` are the pre-0.6
# imbalanced-learn API; newer releases drop ``return_indices`` and use
# ``fit_resample``.
version = [1, 2, 3]
nm = [NearMiss(version=v, return_indices=True) for v in version]

X_resampled = []
y_resampled = []
X_res_vis = []
idx_samples_removed = []
for method in nm:
    X_res, y_res, idx_res = method.fit_sample(X, y)
    X_resampled.append(X_res)
    y_resampled.append(y_res)
    X_res_vis.append(pca.transform(X_res))
    # Indices of the samples this method removed (rebound every iteration,
    # so only the last method's removals survive the loop — confirm intent).
    idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                       idx_res)

# Two subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
ax_res = [ax2, ax3, ax4]
# NOTE(review): the closing triple-quote further down suggests everything up
# to it is INSIDE a triple-quoted (disabled) block whose opening quote lies
# outside this chunk — the ClassifierTesting calls below would then never
# run. Their text is therefore kept byte-for-byte. Comments (translated):
# first block = NearMiss algorithm, then SMOTE, then ADASYN, each tried with
# SVC / random forest / k-NN / logistic regression.
ClassifierTesting('Случайный лес',RandomForestClassifier(),
                  rf_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(),
                  knn_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(),
                  lr_prop, NearMiss(version=1,sampling_strategy='majority',n_jobs=-1))
# Алгоритм SMOTE.
ClassifierTesting('Метод опорных векторов',SVC(),
                  svc_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Случайный лес',RandomForestClassifier(),
                  rf_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(),
                  knn_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(),
                  lr_prop, SMOTE(sampling_strategy='minority',n_jobs=-1))
# Алгоритм ADASYN.
ClassifierTesting('Метод опорных векторов',SVC(),
                  svc_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Случайный лес',RandomForestClassifier(),
                  rf_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('K-ближайших соседей',KNeighborsClassifier(),
                  knn_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
ClassifierTesting('Логистическая регрессия',LogisticRegression(),
                  lr_prop, ADASYN(sampling_strategy='minority',n_jobs=-1))
"""
# Testing the neural network with each sampler.
# NOTE(review): NearMiss ``random_state`` was removed in imbalanced-learn
# >= 0.6 (NearMiss is deterministic) — this targets an older release.
NEURO('RandomUnderSampler',
      RandomUnderSampler(sampling_strategy='majority', random_state=36))
NEURO(
    'NearMiss',
    NearMiss(sampling_strategy='majority',
             version=1,
             random_state=36,
             n_jobs=-1))
NEURO('SMOTE', SMOTE(sampling_strategy='minority', random_state=36, n_jobs=-1))
NEURO('ADASYN',
      ADASYN(sampling_strategy='minority', random_state=36, n_jobs=-1))
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

# Evaluate the classifier trained on the (imbalanced) data.
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Under Sampling
from collections import Counter
Counter(y_train)

from collections import Counter
from imblearn.under_sampling import NearMiss

# Under-sample the majority class down to a 0.8 minority/majority ratio.
# Pass the strategy by keyword and use fit_resample: the positional argument
# and ``fit_sample`` belong to the pre-0.6 imbalanced-learn API.
ns = NearMiss(sampling_strategy=0.8)
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Over Sampling
from imblearn.over_sampling import RandomOverSampler
# NOTE(review): ``os`` shadows the stdlib module name; kept because later
# (unseen) code may reference it.
os = RandomOverSampler(sampling_strategy=0.75)
X_train_ns, y_train_ns = os.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()