def test_validate_estimator_deprecation():
    """Check that the old-style constructor parameters are still processed.

    ``SMOTEENN`` is built once with ``n_jobs`` and once with ``k``; both
    runs are compared against the same reference arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    # Same expectations for each legacy keyword, exercised one after another.
    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_array_almost_equal(X_out, expected_X)
        assert_array_equal(y_out, expected_y)
def test_error_wrong_object():
    """Check that a plain string passed as ``smote`` or ``enn`` is rejected.

    Each case must raise ``ValueError`` with the matching message when
    ``fit_sample`` is called.
    """
    not_an_estimator = 'rnd'

    sampler = SMOTEENN(smote=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        sampler.fit_sample(X, Y)

    sampler = SMOTEENN(enn=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        sampler.fit_sample(X, Y)
def test_validate_estimator_deprecation():
    """Check the ``n_jobs`` and ``k`` constructor keywords against one reference.

    Both configurations are expected to produce the same resampled arrays,
    compared with relative tolerance ``R_TOL``.
    """
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_allclose(X_out, expected_X, rtol=R_TOL)
        assert_array_equal(y_out, expected_y)
def test_sample_regular():
    """Check ``SMOTEENN`` with default settings against reference output."""
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    sampler = SMOTEENN(random_state=RND_SEED)
    # The separate fit() call before fit_sample() is kept on purpose.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_validate_estimator_init():
    """Check that pre-built estimator objects are accepted at construction.

    A ``SMOTE`` and an ``EditedNearestNeighbours`` instance are handed to
    ``SMOTEENN`` and the resampled output is compared to reference arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    # Build the over-sampler and the cleaning step explicitly.
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler, enn=cleaner, random_state=RND_SEED)
    X_out, y_out = combined.fit_sample(X, Y)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def resample(X, Y, nb_class):
    """Rebalance multi-dimensional samples with SMOTEENN.

    Each sample (everything after the first axis of ``X``) is flattened to a
    feature vector, resampled, and reshaped back to the original per-sample
    shape. Any number of trailing dimensions is supported.

    Parameters
    ----------
    X : ndarray
        Samples; the first axis indexes samples.
    Y : ndarray
        Labels, cast to ``int`` before counting.
    nb_class : int
        Expected length of ``np.bincount`` over the labels.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)``. When the bincount length differs
        from ``nb_class`` the inputs ``X`` and ``Y`` are returned untouched.
    """
    print("original shape: ", X.shape)
    labels = Y.astype(int)
    counts = np.bincount(labels)
    if len(counts) != nb_class:
        # Not every expected label value is represented; skip resampling.
        print("there is no samples to interpolate! skip this fold.")
        return X, Y
    print("original dist: ", counts / float(counts.sum()))

    # Flatten every sample to 1-D, whatever its rank.
    sample_shape = X.shape[1:]
    n_features = int(np.prod(sample_shape))
    flat_X = X.reshape((X.shape[0], n_features))

    sampler = SMOTEENN(random_state=0)
    X_resampled, Y_resampled = sampler.fit_sample(flat_X, labels)

    # Restore the per-sample shape; the sample count may have changed.
    X_resampled = X_resampled.reshape((X_resampled.shape[0],) + sample_shape)
    print("sampled shape: ", X_resampled.shape)

    Y_resampled = Y_resampled.astype(int)
    new_counts = np.bincount(Y_resampled)
    print("after SMOTEENN dist: ", new_counts / float(new_counts.sum()))
    return X_resampled, Y_resampled
def runtree(data, target):
    """Cross-validate a decision tree trained on SMOTEENN-resampled folds.

    Labels are encoded with ``LabelEncoder``. For each fold count, the
    training split is resampled with SMOTEENN, a fresh clone of the tree is
    fitted, and predictions and class-1 probabilities on the untouched test
    split are collected. Scores are printed; nothing is returned.

    Parameters
    ----------
    data : array-like
        Feature matrix, indexed with the fold index arrays.
    target : array-like
        Raw labels.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(target)
    target1 = encoder.transform(target)

    sm = SMOTEENN()
    clf = tree.DecisionTreeClassifier()
    folds = [3]

    print("------------ TREE ------------")
    for fold in folds:
        # No random_state here: without shuffle=True it has no effect, and
        # recent scikit-learn raises ValueError for that combination.
        skf = StratifiedKFold(n_splits=fold)
        test_target = []
        test_predict = []
        test_proba = []
        test_proba_target = []
        for train_index, test_index in skf.split(data, target1):
            clf_ = clone(clf)
            # Resample the training split only; the test split is left as-is.
            X_resampled, y_resampled = sm.fit_sample(data[train_index],
                                                     target1[train_index])
            clf_.fit(X_resampled, y_resampled)
            test_predict.append(clf_.predict(data[test_index]))
            test_target.append(target1[test_index])
            test_proba_target.extend(target1[test_index])
            test_proba.extend(clf_.predict_proba(data[test_index])[:, 1])
        print_scores(test_predict, test_target)
        print(roc_auc_score(y_true=test_proba_target, y_score=test_proba))
def smot2(train_x, train_y, feature_columns):
    """Rebalance a training set with SMOTEENN and return pandas objects.

    Class counts are printed before and after resampling.

    Parameters
    ----------
    train_x, train_y
        Training features and labels passed to the sampler.
    feature_columns
        Column names for the rebuilt feature DataFrame.

    Returns
    -------
    tuple
        ``(DataFrame, Series)`` holding the resampled features and labels.
    """
    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import TomekLinks
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import ADASYN
    from sklearn.svm import SVC
    from imblearn.under_sampling import CondensedNearestNeighbour

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))

    # Other samplers tried here previously: ADASYN, plain SMOTE,
    # CondensedNearestNeighbour.
    sampler = SMOTEENN(ratio='minority', n_jobs=3, random_state=42,
                       n_neighbors=50, smote=SMOTE())

    log.traceLogInfo("\nFIT DE SMOT2 ...equilibrage")
    resampled_x, resampled_y = sampler.fit_sample(train_x, train_y)
    print('\nResampled dataset shape {}'.format(Counter(resampled_y)))

    # Rebuild pandas containers from the sampler output.
    frame = pd.DataFrame(resampled_x, columns=feature_columns)
    labels = pd.Series(resampled_y)
    return frame, labels
def smpote_test():
    """Resample the first KPI series with SMOTEENN and plot before/after.

    Reads the HDF file, keeps the rows of the first ``KPI ID`` value,
    resamples them, and shows two plots: the original series in green and
    the resampled series (sorted by timestamp) in red.
    """
    # Read the data from the test dataset.
    truth_df = pd.read_hdf('D:\\kpi\\1.hdf')
    first_kpi = truth_df['KPI ID'].values[0]
    truth = truth_df[truth_df["KPI ID"] == first_kpi]

    y = truth['label']
    X = truth.drop(columns=['label', 'KPI ID'])

    sampler = SMOTEENN()
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    resampled_features = pd.DataFrame(X_resampled,
                                      columns=['timestamp', 'value'])
    resampled_labels = pd.DataFrame(y_resampled, columns=['label'])

    plt.plot(np.array(X['timestamp']), np.array(X['value']),
             color='green', label='training accuracy')
    plt.legend()  # show the legend
    plt.show()

    merged = resampled_features.join(resampled_labels)
    merged = merged.sort_values(by="timestamp", ascending=True)
    plt.plot(np.array(merged['timestamp']), np.array(merged['value']),
             color='red', label='training accuracy')
    plt.legend()  # show the legend
    plt.show()
def resampling(X_train, y_train):
    """Resample a training set with default SMOTEENN.

    Prints the class counts before and after, and returns the resampled
    ``(X_train, y_train)`` pair.
    """
    from imblearn.combine import SMOTEENN

    sampler = SMOTEENN()
    print('dataset shape {}'.format(Counter(y_train)))
    X_balanced, y_balanced = sampler.fit_sample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(y_balanced)))
    return X_balanced, y_balanced
def test_sample_regular_half():
    """Check ``SMOTEENN`` with ``ratio=0.8`` against reference output."""
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.36784496, -0.1953161],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    sampler = SMOTEENN(ratio=0.8, random_state=RND_SEED)
    # The separate fit() call before fit_sample() is kept on purpose.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def balanced_train(data, features):
    """Resample ``data[features]`` against ``data['label']`` with SMOTEENN.

    Returns the ``(X_resampled, y_resampled)`` pair produced by the sampler,
    which is seeded with ``random_state=42``.
    """
    from imblearn.combine import SMOTEENN

    feature_frame = data[features]
    labels = data['label']
    sampler = SMOTEENN(random_state=42)
    X_resampled, y_resampled = sampler.fit_sample(feature_frame, labels)
    return X_resampled, y_resampled
def balance(x, y, randomstate=None, **kwargs):
    """Resample ``(x, y)`` with SMOTEENN and report the class counts.

    Parameters
    ----------
    x, y
        Features and labels passed to the sampler.
    randomstate
        Forwarded as ``random_state``.
    **kwargs
        Must contain ``neighbors``; it is forwarded as ``n_neighbors``.
        A missing key raises ``KeyError``.

    Returns
    -------
    tuple
        The resampled ``(x, y)`` pair.
    """
    sampler = SMOTEENN(
        random_state=randomstate,
        n_jobs=3,
        n_neighbors=kwargs['neighbors'],
    )
    print('dataset shape {}'.format(Counter(y)))
    print('Resampling...')
    resampled_x, resampled_y = sampler.fit_sample(x, y)
    print('Resampled dataset shape {}'.format(Counter(resampled_y)))
    return resampled_x, resampled_y
def SMOTEENN_oversampling(x, y):
    """Resample ``(x, y)`` with SMOTEENN (``random_state=42``).

    Prints the class counts before and after and returns the resampled pair.
    """
    print('Original dataset shape {}'.format(Counter(y)))
    sampler = SMOTEENN(random_state=42)
    resampled = sampler.fit_sample(x, y)
    x_sampled, y_sampled = resampled
    print('With SMOTEENN sampled dataset shape {}'.format(Counter(y_sampled)))
    return x_sampled, y_sampled
def over_sampling(data):
    """Resample a labelled DataFrame with SMOTEENN and write it to CSV.

    The ``aid`` and ``uid`` columns are dropped, the remaining columns are
    resampled against ``label``, and the result is written to
    ``./data/train_all_after_overSamlping.csv`` without the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``aid``, ``uid`` and ``label``.
    """
    data = data.drop(['aid', 'uid'], axis=1)
    y = data['label']
    X = data.drop('label', axis=1)

    sme = SMOTEENN()
    X_res, y_res = sme.fit_sample(X, y)

    # The sampler may hand back plain NumPy arrays, which pd.concat rejects.
    # Wrapping is a no-op when the outputs are already pandas objects.
    X_res = pd.DataFrame(X_res, columns=X.columns)
    y_res = pd.Series(y_res, name='label')

    data_res = pd.concat([X_res, y_res], axis=1)
    data_res.to_csv('./data/train_all_after_overSamlping.csv', index=False)
def balance_train_data(data):
    """Resample a ``(features, labels)`` pair with SMOTEENN.

    Prints the resulting class counts and the elapsed wall-clock time, then
    returns the resampled ``(features, labels)`` tuple.
    """
    print("Start balancing...")
    features, labels = data
    started = time.time()

    sampler = SMOTEENN(random_state=42)
    features, labels = sampler.fit_sample(features, labels)

    print("Balanced dataset:", sorted(Counter(labels).items()))
    print("Balancing time:", time.time() - started)
    return (features, labels)
def test_sample_regular_half():
    """Check ``SMOTEENN`` with a float ``ratio`` of 0.8 against a reference."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 1, 1, 1])

    sampler = SMOTEENN(ratio=0.8, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_allclose(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def SMOTE_ENN_method(sm, combined, Cols, nn, ks):
    """Split the data, resample the training part with SMOTEENN, and evaluate.

    Parameters
    ----------
    sm
        Object passed to ``SMOTEENN`` as its ``smote`` argument.
    combined, Cols
        Forwarded unchanged to ``train_test_split``.
    nn
        ``n_neighbors`` for ``EditedNearestNeighbours``.
    ks
        ``kind_sel`` for ``EditedNearestNeighbours``.
    """
    # NOTE(review): this unpack order (X_train, y_train, X_test, y_test)
    # differs from scikit-learn's train_test_split return order; presumably
    # a project-local helper with the same name -- confirm.
    X_train, y_train, X_test, y_test = train_test_split(combined, Cols)

    cleaner = EditedNearestNeighbours(n_neighbors=nn, kind_sel=ks)
    sampler = SMOTEENN(random_state=33, smote=sm, enn=cleaner)
    X_train, y_train = sampler.fit_sample(X_train, y_train)

    classifier_and_metrics(X_train, y_train, X_test, y_test)
def test_sample_regular():
    """Check default ``SMOTEENN`` output against reference arrays."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    sampler = SMOTEENN(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def test_sample_regular_half():
    """Check ``SMOTEENN`` with a dict ``ratio`` against reference arrays."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 1, 1, 1])

    # Per-class targets handed to the sampler.
    class_targets = {0: 10, 1: 12}
    sampler = SMOTEENN(ratio=class_targets, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_allclose(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def over_under_sampling(data):
    """Resample a DataFrame with SMOTEENN, using ``Tumor`` as the label.

    Every column except the last is treated as a feature. The resampled
    features are rebuilt into a DataFrame with the original feature column
    names, the resampled labels are attached as ``Tumor``, and the frame is
    logged and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one final column; must contain ``Tumor``.

    Returns
    -------
    pandas.DataFrame
        The resampled data set.
    """
    feature_columns = data.columns[:-1]
    sampler = SMOTEENN(ratio='auto')
    # .values instead of .as_matrix(): the latter was removed in pandas 1.0.
    features, label = sampler.fit_sample(data[feature_columns],
                                         data['Tumor'].values)

    resampled = pd.DataFrame(features)
    resampled.columns = feature_columns
    resampled['Tumor'] = label
    logger.info(resampled)
    return resampled
def test_validate_estimator_init():
    """Check ``SMOTEENN`` built from explicit ``SMOTE`` and ENN objects."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED, ratio='all')
    combined = SMOTEENN(smote=over_sampler, enn=cleaner, random_state=RND_SEED)
    X_out, y_out = combined.fit_sample(X, Y)

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def fit(self, x, y):
    """Fit one base estimator on SMOTEENN-resampled data and persist models.

    The resampled data is passed to ``self._fit_base_estimator`` and the
    result is appended to ``self.estimators_``. Every estimator in that list
    is then dumped with joblib. Returns ``self``.
    """
    # Resample with SMOTEENN before fitting.
    sampler = SMOTEENN()
    x_train, y_train = sampler.fit_sample(x, y)
    self.estimators_.append(self._fit_base_estimator(x_train, y_train))

    # NOTE(review): the path does not depend on the estimator, so each dump
    # overwrites the previous one -- confirm this is intended.
    for estimator in self.estimators_:
        joblib.dump(
            estimator,
            "model/card/SMOTEENN_" + self.model_name + "_" + str(self.cnt) + "_model.pkl")
    return self
def get_data(self):
    """Load features and labels from ``shuffled_processed_data.csv``.

    Reads at most ``self.nrows`` rows from the file under ``self.data_dir``.
    Columns 1-6 are the features and column 7 is the label. When
    ``self.SMOTENN`` is truthy the pair is resampled with SMOTEENN first.

    Returns
    -------
    tuple
        ``(X, y)``.
    """
    csv_path = os.path.join(self.data_dir, 'shuffled_processed_data.csv')
    with open(csv_path, 'r') as handle:
        frame = pd.read_csv(handle, nrows=self.nrows)

    X = frame.iloc[:, 1:7]
    y = frame.iloc[:, 7]
    if not self.SMOTENN:
        return (X, y)

    sampler = SMOTEENN(random_state=0)
    X, y = sampler.fit_sample(X, y)
    return (X, y)
def resample(x, y, sampling_type=None):
    """Optionally resample ``(x, y)`` and print class counts before/after.

    Parameters
    ----------
    x, y
        Features and labels.
    sampling_type : str or None
        ``"smoteenn"`` applies ``SMOTEENN``; ``"enn"`` applies
        ``EditedNearestNeighbours``; any other value leaves the data as-is.

    Returns
    -------
    tuple
        ``(x_out, y_out)``; the inputs themselves when no sampler applies.
    """
    if sampling_type == "smoteenn":
        x_out, y_out = SMOTEENN(random_state=1).fit_sample(x, y)
    elif sampling_type == "enn":
        x_out, y_out = EditedNearestNeighbours(random_state=1).fit_sample(x, y)
    else:
        x_out, y_out = x, y

    print("Before resampling:", sorted(Counter(y).items()))
    print("After resampling:", sorted(Counter(y_out).items()))
    return x_out, y_out
def smote_en_resampling(data_X, data_y, k_neighbors=5, n_jobs=8):
    """Over- and under-sample with SMOTE followed by Edited Nearest Neighbours.

    Parameters
    ----------
    data_X, data_y
        Features and labels to resample.
    k_neighbors : int, default 5
        Neighbourhood size for both ``SMOTE`` and ``EditedNearestNeighbours``.
    n_jobs : int, default 8
        Parallelism forwarded to all three samplers.

    Returns
    -------
    tuple
        ``(resamp_X, resamp_y)``.
    """
    smote = SMOTE(sampling_strategy='minority', k_neighbors=k_neighbors,
                  n_jobs=n_jobs)
    enn = EditedNearestNeighbours(n_neighbors=k_neighbors, n_jobs=n_jobs)
    smoteen = SMOTEENN(sampling_strategy="minority", smote=smote, enn=enn,
                       n_jobs=n_jobs)
    # fit_resample is available wherever sampling_strategy is accepted;
    # the older fit_sample alias was later removed from imbalanced-learn.
    resamp_X, resamp_y = smoteen.fit_resample(data_X, data_y)
    return resamp_X, resamp_y
def imbalanceProcess(self, X, y):
    """Handle class imbalance by resampling with default SMOTEENN.

    Args:
        X: feature samples to process
        y: label samples to process

    Returns:
        X: feature samples after resampling
        y: label samples after resampling
    """
    sampler = SMOTEENN()
    resampled_X, resampled_y = sampler.fit_sample(X, y)
    return resampled_X, resampled_y
def data_smot():
    """Resample the module-level ``test_data`` with SMOTEENN and save to CSV.

    Columns 3 onward of ``test_data`` are the features and column 2 is the
    label. The counts of labels equal to 1 and 0 are printed, and the
    resampled features plus label column are written to
    ``data/new_data_test.csv`` using column names taken from ``data``.
    """
    sampler = SMOTEENN()
    x_res, y_res = sampler.fit_sample(test_data[:, 3:], test_data[:, 2])
    print(len(y_res[y_res == 1]))
    print(len(y_res[y_res == 0]))

    # Append the labels as a trailing column.
    label_column = np.reshape(y_res, [-1, 1])
    stacked = np.hstack((x_res, label_column))

    column_names = list(data.columns[3:]) + [data.columns[2]]
    frame = pd.DataFrame(data=stacked, columns=column_names)
    frame.to_csv("data/new_data_test.csv", index=False)
    print("over")
def smoteenn(X_train, y_train):
    """Resample 3-D samples with SMOTEENN and restore their layout.

    ``X_train`` is flattened to one row per sample, resampled, and reshaped
    back to ``(n_resampled, n_levels, n_variables)``.

    Parameters
    ----------
    X_train : ndarray
        Indexed as ``(sample, level, variable)``.
    y_train : array-like
        One label per sample.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` after resampling.
    """
    sampler = SMOTEENN(random_state=42)
    n_samples, n_levels, n_variables = X_train.shape[:3]

    flat = X_train.reshape((n_samples, -1), order='F')
    flat, y_train = sampler.fit_sample(flat, y_train)

    # The inverse reshape must use the same memory order as the flattening
    # above; restoring in C order would scramble (level, variable) positions.
    X_train = np.reshape(flat, (-1, n_levels, n_variables), order='F')
    return X_train, y_train
def test_validate_estimator_default():
    """Check ``SMOTEENN`` with no explicit sub-estimators against a reference."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    sampler = SMOTEENN(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def create_synthetic_balanced_data_set(args, data_set, selected_class, ratio='auto'):
    """
    Creates a balanced data set by resampling the selected class with SMOTEENN.

    :param args: Program arguments; ``args["max_sequence_length"]`` is used as the padding length.
    :param data_set: The data set to balance. Its ``x``/``y`` and class balance are modified in place.
    :param selected_class: The class to be balanced with synthetic samples.
    :param ratio: Resampling ratio forwarded to SMOTEENN.
    :return: The result of merging ``data_set`` with the synthetic samples, with ``x`` and ``y`` as arrays.
    """
    from imblearn.combine import SMOTEENN

    num_classes = data_set.get_num_classes()
    seq_copy = data_set.to_one_vs_k(selected_class)

    # X is padded to a fixed length and y reduced to class indices before
    # being handed to the sampler.
    padded_x = pad_sequences(seq_copy.x, maxlen=args["max_sequence_length"],
                             padding="post", truncating="post", dtype="float32")
    binary_y = np.argmax(seq_copy.y, axis=-1)
    sm = SMOTEENN(n_jobs=4, ratio=ratio)
    new_x, new_y = sm.fit_sample(padded_x, binary_y)

    # Transform the data back to the application format.
    synthetic_data_set = data_set.__class__(new_x, new_y)
    synthetic_data_set.to_categorical(num_classes)
    synthetic_data_set = synthetic_data_set.single_class_data_set(0)
    synthetic_data_set.y = np.ones((len(synthetic_data_set.x), 1)) * selected_class
    synthetic_data_set.to_categorical(num_classes)
    # Materialise a real list: on Python 3 a bare map() is a one-shot iterator.
    synthetic_data_set.x = list(synthetic_data_set.x)
    synthetic_data_set.y = synthetic_data_set.y.tolist()

    # Remove the samples used to generate synthetic samples.
    balance = data_set.get_class_balance()
    balance[selected_class] = 0
    data_set.set_class_balance(balance)
    data_set.x = data_set.x.tolist()
    data_set.y = data_set.y.tolist()

    # Merge sets.
    return_set = data_set.merged(synthetic_data_set)
    return_set.x = np.asarray(return_set.x)
    return_set.y = np.asarray(return_set.y)
    return return_set
def use_OSSSMOTEENN(self):
    """Apply One-Sided Selection then SMOTEENN, plot, and export to CSV.

    Data is loaded with ``preparation(self.path)``. A bar chart of label
    counts is shown for the original data, after One-Sided Selection, and
    after SMOTEENN. The final data is written next to the input file with
    the suffix ``_OSSSMOTEENN_Final_Test.csv`` and a 2-component PCA
    projection is plotted.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    X, y = preparation(self.path)
    out_path = self.path.replace('.csv', '_OSSSMOTEENN_Final_Test.csv')

    # Label distribution of the original data.
    dy = pd.DataFrame(y)
    dy.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    # Step 1: under-sample the majority class.
    # fit_resample is used throughout: sampling_strategy already implies an
    # imbalanced-learn version that provides it, and the fit_sample alias was
    # later removed.
    oss = OneSidedSelection(random_state=42, n_jobs=-1,
                            sampling_strategy="majority")
    X_res, y_res = oss.fit_resample(X, y)
    dy_res = pd.DataFrame(y_res)
    dy_res.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    # Step 2: SMOTE + ENN on the reduced data.
    sme = SMOTEENN(random_state=42, n_jobs=-1)
    X_sme, y_sme = sme.fit_resample(X_res, y_res)

    # Draw the bar chart of the final label counts.
    dy_sme = pd.DataFrame(y_sme)
    dy_sme.value_counts().plot(kind='bar', title='Count(label)')
    plt.show()

    # Generate the CSV. It is written without a header row, so a reader that
    # assumes a header will consume the first data row.
    df = pd.concat([X_sme, pd.DataFrame(y_sme)], axis=1)
    df.to_csv(out_path, index=None, header=None, float_format='%.4f')

    # Draw the PCA projection.
    pca = PCA(n_components=2)
    X_sme = pca.fit_transform(X_sme)
    plot_2d_space(X_sme, y_sme, 'SMOTE + ENN')
    return out_path

# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
def test_sample_regular():
    """Check default ``SMOTEENN`` output against the stored ``.npy`` references.

    The reference arrays are loaded from the ``data`` directory next to this
    file and compared for exact equality.
    """
    sampler = SMOTEENN(random_state=RND_SEED)
    # The separate fit() call before fit_sample() is kept on purpose.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'smote_enn_reg_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'smote_enn_reg_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_sample_regular_pass_smote_enn():
    """Check ``SMOTEENN`` with explicitly configured ``SMOTE`` and ENN objects."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    over_sampler = SMOTE(ratio='auto', random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(ratio='all', random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler, enn=cleaner, random_state=RND_SEED)
    X_out, y_out = combined.fit_sample(X, Y)

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def SMOTE(self, bug_rate, X, Y):
    """
    Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.

    The resampled features and labels are stacked column-wise (labels last)
    and stored on ``self.new_list_SMOTE``; nothing is returned.

    :param bug_rate: value passed to SMOTEENN as ``ratio``
    :param X: the data set without the label column
    :param Y: the label information
    """
    import numpy as np
    from collections import Counter
    from imblearn.combine import SMOTEENN

    sampler = SMOTEENN(ratio=bug_rate)
    x_res, y_res = sampler.fit_sample(X, Y)
    self.new_list_SMOTE = np.column_stack((x_res, y_res))
class Undersampler:
    """Thin wrapper that selects and runs one resampling strategy.

    Parameters
    ----------
    kind
        Member of ``Undersampling`` choosing the strategy
        (``ClusterCentroids`` or ``SMOTEENN``).
    data, target
        Samples and labels; they must have the same length.
    verbose : bool, default False
        Print the chosen strategy and forward the flag to the sampler.
    ratio : default 'auto'
        Forwarded to the sampler.

    Raises
    ------
    ValueError
        If ``kind`` is not a supported strategy.
    """

    def __init__(self, kind, data, target, verbose=False, ratio='auto'):
        assert len(data) == len(target)
        self.data = data
        self.target = target

        if kind == Undersampling.ClusterCentroids:
            if verbose:
                print('> CLUSTER CENTROIDS')
            # Under-sampling by cluster centroids.
            self.undersampler = ClusterCentroids(verbose=verbose, ratio=ratio)
        elif kind == Undersampling.SMOTEENN:
            if verbose:
                print('> SMOTEENN')
            # Resampling by SMOTEENN.
            self.undersampler = SMOTEENN(verbose=verbose, ratio=ratio)
        else:
            # Raising a bare string is itself a TypeError on Python 3, which
            # hid the intended message; raise a real exception instead.
            raise ValueError("Nonexistent undersampling type: "
                             + str(getattr(kind, 'name', kind)))

    def balance(self):
        """Run the selected sampler and return its ``fit_sample`` result."""
        return self.undersampler.fit_sample(self.data, self.target)
return (data[i - 1] + data[i])/2 start = time() n_iter = 100 ## Number of evaluations (SMAC) n_validations = 7 ## Number of Monte-Carlo Cross-Validations for each model's accuracy evaluated ## Dataset 11 url11 = "https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt" dataset11 = np.genfromtxt(urllib.urlopen(url11)) X = dataset11[:,0:85] Y = dataset11[:,85] sm = SMOTEENN() X, Y = sm.fit_sample(X, Y) # We fit the MLP with the hyperparameters given and return the model's median accuracy from 7 trials def mlp(number_layers, number_neurons_1, number_neurons_2, number_neurons_3, number_neurons_4, dropout_rate): layers = [] number_neurons = [] number_neurons.append(number_neurons_1) number_neurons.append(number_neurons_2) number_neurons.append(number_neurons_3) number_neurons.append(number_neurons_4) for i in np.arange(number_layers): layers.append(Layer("Sigmoid", units=number_neurons[i], dropout = dropout_rate))
from imblearn.combine import SMOTEENN # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply SMOTE + ENN sm = SMOTEENN() X_resampled, y_resampled = sm.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #define X y X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #smoteen sme = SMOTEENN(random_state=42) os_X,os_y = sme.fit_sample(X_train,y_train) #QDA clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True) clf_QDA.fit(os_X, os_y) y_true, y_pred = y_test, clf_QDA.predict(X_test) #F1_score, precision, recall, specifity, G score print "F1_score : %.4g" % metrics.f1_score(y_true, y_pred) print "Recall : %.4g" % metrics.recall_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) print "Precision : %.4g" % metrics.precision_score(y_true, y_pred) #Compute confusion matrix cnf_matrix = confusion_matrix(y_test,y_pred) np.set_printoptions(precision=2)