def test_sample_regular_wrong_svm():
    kind = 'svm'
    nn_k = NearestNeighbors(n_neighbors=6)
    svm = 'rnd'
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
def test_fit_sample_nn_obj():
    """Test sample with NN object provided."""
    # Create the object
    kind = 'borderline1'
    nn_m = NearestNeighbors(n_neighbors=11)
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_with_nn_svm():
    """Test sample function with regular SMOTE with a NN object."""
    # Create the object
    kind = 'svm'
    nn_k = NearestNeighbors(n_neighbors=6)
    svm = SVC(random_state=RND_SEED)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [1.44015515, -1.30621303]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def fit(self, X, y=None):
    # 'Random under-sampling'
    # CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    # Accuracy: 0.939693267481
    # Precision: 0.238095238095
    # Recall: 0.897435897436
    # Accuracy: 0.962568234988
    # Precision: 0.324468085106
    # Recall: 0.782051282051
    # SMOTE(ratio=ratio, kind='borderline1')
    # Accuracy: 0.971146347803
    # Precision: 0.372093023256
    # Recall: 0.615384615385
    # SMOTE(ratio=ratio, kind='borderline2')
    # Accuracy: 0.965427605927
    # Precision: 0.333333333333
    # Recall: 0.705128205128
    # svm_args = {'class_weight': 'auto'}
    # svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
    # Accuracy: 0.972186119054
    # Precision: 0.395683453237
    # Recall: 0.705128205128
    smote = SMOTE(ratio='auto', kind='regular')
    X, y = smote.fit_sample(X, y)
    # weights = np.array([1/y.mean() if i == 1 else 1 for i in y])
    return super(RandomForestClassifier, self).fit(X, y)  # , sample_weight=weights
def train(addr_train, clf, sampling, add_estimators):
    with open(os.path.join(addr_train, "day_samp_bin.npy"), "r") as file_in:
        X = smio.load_sparse_csr(file_in)
    width = np.size(X, 1)
    X_train = X[:, :width-1]
    y_train = X[:, width-1]

    if sampling == "Over":
        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)
    elif sampling == "Under":
        X_train, y_train = US.undersample(X, 0.01)

    print "Fitting Model......"
    clf.n_estimators += add_estimators
    clf.fit(X_train, y_train)
    print "Done"

    if __SAVE_MODEL:
        model_name = "RF_" + onoff_line + "_" + sampling + "_Model.p"
        dir_out = os.path.join(addr_train, "Random_Forest_Models")
        if not os.path.isdir(dir_out):
            os.mkdir(dir_out)
        path_out = os.path.join(dir_out, model_name)
        with open(path_out, "w") as file_out:
            pickle.dump(clf, file_out)
    return clf
def test_sample_borderline2():
    """Test sample function with borderline 2 SMOTE."""
    # Create the object
    kind = 'borderline2'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [0.33339622, 0.49870937]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def get_data(month, day, hour=-1, mode="normal"):
    if hour != -1:
        if hour == 24:
            hour = 0
            day += 1
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               str(hour).rjust(2, "0"),
                               "output_bin.npy")
    else:
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               "day_samp_newer_bin.npy")

    with open(addr_in, "r") as file_in:
        loader = np.load(file_in)
    data = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape']).toarray()
    X = data[:, :-1]
    y = data[:, -1]
    if mode == "over":
        sm = SMOTE(ratio=0.99, verbose=0)
        X, y = sm.fit_sample(X, y)
    return X, y
def resample_data(X, y, categorical_lst):
    '''
    up-samples minority class
    '''
    sm = SMOTE(kind='regular')
    X_train_re, y_train_re = sm.fit_sample(X, y)
    # rounding categorical variables
    X_train_re[:, categorical_lst] = np.round(X_train_re[:, categorical_lst])
    return X_train_re, y_train_re
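A minimal usage sketch for `resample_data` above. The toy arrays, the `rng` seed, and the categorical column index are made-up illustrations (not from the original code), and it assumes an older imblearn release where `SMOTE(kind=...)` and `fit_sample` are still available, as the snippet itself does:

import numpy as np

rng = np.random.RandomState(0)
# Hypothetical toy data: 12 majority and 6 minority samples; column 2 is a binary flag.
X_maj = np.hstack([rng.normal(0, 1, size=(12, 2)), rng.randint(0, 2, size=(12, 1))])
X_min = np.hstack([rng.normal(3, 1, size=(6, 2)), rng.randint(0, 2, size=(6, 1))])
X_demo = np.vstack([X_maj, X_min])
y_demo = np.array([0] * 12 + [1] * 6)

X_res, y_res = resample_data(X_demo, y_demo, categorical_lst=[2])
# Rounding keeps the interpolated categorical column inside the valid set {0.0, 1.0}.
print(np.unique(X_res[:, 2]))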
def Input_Preparing(Scaled_Input_Data, Surgery_Outcome, N_Feat):
    # Feature Selection
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI', verbose=2, n_features=N_Feat)
    MIFS.fit(Scaled_Input_Data, Surgery_Outcome)
    Selected_Input_Data = Scaled_Input_Data.loc[:, MIFS.support_]

    # Balancing using SMOTE (resample the selected features, not undefined X, y)
    sm = SMOTE(kind='regular')
    Prep_Train_Data, Prep_Surgery_Outcome = sm.fit_sample(Selected_Input_Data, Surgery_Outcome)

    return (Prep_Train_Data, Prep_Surgery_Outcome, MIFS.support_)
def SMT(df, target):
    df1 = df.copy()
    y = df1.pop('anti_churn')
    X = df1
    Xcols = df1.columns
    sm = SMOTE(kind='regular', ratio=target)
    X_resampled, y_resampled = sm.fit_sample(X, y)
    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)
    X_resampled.columns = Xcols
    y_resampled.columns = ['anti_churn']
    return X_resampled, y_resampled
def transform(self, fp):
    fm, train_x, train_y = FeaturePool.to_train_arrays(fp)
    os = SMOTE(random_state=self.random_state)
    os_train_x, os_train_y = os.fit_sample(train_x, train_y[:, 0])
    os_train_y = os_train_y.reshape((os_train_y.shape[0], 1))
    for f in FeaturePool.from_train_arrays(fm, os_train_x, os_train_y):
        yield Feature.apply_config(f, is_over_sampled=True)
    for f in fp:
        if f.split_type == SplitType.TEST:
            yield f
def oversample(self):
    """Balance class data based on outcome"""
    print('Current outcome sampling {}'.format(Counter(self.y)))
    # to use a random sampling seed at random:
    # ros = RandomOverSampler()
    ros = SMOTE()
    # ros = ADASYN()
    self.X, self.y = ros.fit_sample(self.X, self.y)
    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def oversample(X, y, bal_strategy):
    if(bal_strategy == "SMOTESVN" or bal_strategy == "ALL"):
        # Apply SMOTE SVM
        sm = SMOTE(kind='svm')
        X_sampled, y_sampled = sm.fit_sample(X, y)
        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape
    elif(bal_strategy == "SMOTE" or bal_strategy == "ALL"):
        # Apply regular SMOTE
        sm = SMOTE(kind='regular')
        X_sampled, y_sampled = sm.fit_sample(X, y)
        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape
    elif(bal_strategy == "ADASYN" or bal_strategy == "ALL"):
        # Apply the random over-sampling
        ada = ADASYN()
        X_sampled, y_sampled = ada.fit_sample(X, y)
        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape
    elif(bal_strategy == 'NONE'):
        X_sampled = X
        y_sampled = y
        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape
    else:
        print 'bal_strategy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE'
        sys.exit(1)

    return (X_sampled, y_sampled)
def test_sample_regular():
    """Test sample function with regular SMOTE."""
    # Create the object
    kind = 'regular'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def train(cutoffs): print "\n========== Start Training ==========" if __DATA_FROM == 2: list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1]) else: list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1]) clf = BernoulliNB(class_prior=[0.05, 0.95]) if __IF_TRAIN_WITHOUT_SAVE: print "Performing correlation explanation......" with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "r") as file_in: X = Sparse_Matrix_IO.load_sparse_csr(file_in) if len(cutoffs) > 0: X = discard_vars(X, cutoffs) layer = correlation_ex(X) for i in range(0, len(list_io_addr)): path_in = list_io_addr[i] print "\nGenerating training set from {}".format(path_in) with open(path_in, "r") as file_in: X = Sparse_Matrix_IO.load_sparse_csr(file_in) if len(cutoffs) > 0: X = discard_vars(X, cutoffs) vector_len = len(X[0]) X_train = X[:, 0:vector_len-1] y_train = X[:, vector_len-1] if __IF_TRAIN_WITHOUT_SAVE: print "Transforming training set according to CorEx......" X_train = corex_transform(layer, X_train) sm = SMOTE(ratio=0.95) X_train, y_train = sm.fit_sample(X_train, y_train) print "Fitting Model......" clf.partial_fit(X_train, y_train, classes=[0, 1]) print "Done" if __IF_TRAIN_WITHOUT_SAVE: return [clf, layer] else: with open(__ROOT_MODEL, "w") as file_out: pickle.dump(clf, file_out) return []
def get_data(ratio, sampling): list_io_addr = get_io_addr() data = [] for addr_in in list_io_addr: with open(addr_in, "r") as file_in: X = smio.load_sparse_csr(file_in) data.extend(X) data = np.array(data) n = 30000 if sampling == "Over": m = int(np.size(data, 1)) k = int(0.8*n) X = data[:n, :m-1] y = data[:n, m-1:] X_train = X[:k, :] y_train = y[:k] sm = SMOTE(ratio=ratio) X_train, y_train = sm.fit_sample(X_train, column_or_1d(y_train, warn=False)) X_test = X[k:, :] y_test = y[k:] elif sampling == "None": m = int(np.size(data, 1)) k = int(0.8*n) X = data[:n, :m-1] y = data[:n, m-1:].ravel() X_train = X[:k, :] y_train = y[:k] X_test = X[k:, :] y_test = y[k:] else: m = int(np.size(data, 1)) k = int(0.2*np.size(data, 0)) data_test = data[k:, :] data = data[:k, :] data = US.undersample(data, ratio) k = int(0.8*np.size(data, 0)) if np.size(data_test, 0) > k: data_test = data[:k, :] X_train = data[:, :m-1] y_train = data[:, m-1:].ravel() X_test = data_test[:, :m-1] y_test = data_test[:, m-1:].ravel() return X_train, y_train, X_test, y_test
def clf_extratree_predictor(item):
    (clf_args, idx, X, y, use_SMOTE) = item
    train_index, test_index = idx
    clf = sklearn.ensemble.ExtraTreesClassifier(**clf_args)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    if use_SMOTE:
        sampler = SMOTE(ratio='auto', kind='regular')
        X_train, y_train = sampler.fit_sample(X_train, y_train)

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    pred_proba = clf.predict_proba(X_test)
    return idx, pred, pred_proba
def train_and_test_dnn(args):
    for a in args:
        print(a)
    primitive = args[1]
    res = pickle.load(open(sys.argv[2], "rb"))
    notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3])
    # use 'avg_test_score' here so the column matches the assignment below
    dl_results = pd.DataFrame(
        columns=['primitive', 'avg_fit_time', 'avg_score_time', 'avg_test_score'])
    X = get_doc_term_matrix(res)
    y = notes_with_truth_labels_for_query_primitives.loc[:, primitive]
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(128, 5, 2), random_state=1)
    try:
        sm = SMOTE(random_state=357)
        X_sm, y_sm = sm.fit_sample(X, y)
    except ValueError:
        print("value error, smote")
        X_sm = X
        y_sm = y
    cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False)
    print(cv_results)
    dump(clf, './models/{}_trained_dnn.joblib'.format(primitive))
    dl_results.loc[0, 'primitive'] = primitive
    dl_results.loc[0, 'avg_fit_time'] = np.mean(cv_results['fit_time'])
    dl_results.loc[0, 'avg_score_time'] = np.mean(cv_results['score_time'])
    dl_results.loc[0, 'avg_test_score'] = np.mean(cv_results['test_score'])
    with open(args[4], 'a') as f:
        f.write("{}, {}, {}, {}\n".format(dl_results.loc[0, 'primitive'],
                                          dl_results.loc[0, 'avg_fit_time'],
                                          dl_results.loc[0, 'avg_score_time'],
                                          dl_results.loc[0, 'avg_test_score']))
        # f.write(dl_results.loc[0,:])
        # f.write("\n")
        f.close()
    print("DONE w/ {}".format(primitive))
def test_sample_regular_half():
    ratio = {0: 9, 1: 12}
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_borderline1():
    kind = 'borderline1'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_with_nn():
    kind = 'regular'
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def DataFormat(data):
    Data = smio.load_sparse_csr(data)
    m = int(np.size(Data, 1))
    n = int(np.size(Data, 0))
    X_train = Data[:50000, :m-1]
    y_train = Data[:50000, m-1]

    sm = SMOTE(ratio=0.95)
    X_train, y_train = sm.fit_sample(X_train, y_train)

    data_new = []
    for i in range(np.size(X_train, 0)):
        row = list(X_train[i].tolist())
        row.append(y_train[i])
        data_new.append(row)
    shuffle(data_new)
    data_new = np.array(data_new)

    m = int(np.size(data_new, 1))
    X_train = data_new[:, :m-1]
    y_train = data_new[:, m-1]
    K = np.count_nonzero(y_train)  # Number of good data points
    return X_train, y_train, n, K  # Training set plus some numbers useful for weighting
def get(addr_day, mode="normal", ratio=-1, sampling_method="None", bin=False): if "res" in mode: res_ratio = mode.split("-")[1] prefix = "day_samp_res" suffix = "_{}.npy".format(res_ratio) res = "Reservoir_Data" else: prefix = "day_samp_new" suffix = ".npy" res = "" if not ratio == -1: n = 100000 neg = int(n / (1+ratio)) pos = n - neg with open(os.path.join(addr_day, "PosNeg", res, prefix + "_neg" + suffix), "r") as file_neg: matrix_neg = smio.load_sparse_csr(file_neg) matrix_neg = matrix_neg[:neg, :] with open(os.path.join(addr_day, "PosNeg", res, prefix + "_pos" + suffix), "r") as file_pos: matrix_pos = smio.load_sparse_csr(file_pos) matrix_pos = matrix_pos[:pos, :] matrix = vstack((matrix_neg, matrix_pos)) np.random.shuffle(matrix) else: with open(os.path.join(addr_day, res, prefix + suffix), "r") as file_in: matrix = smio.load_sparse_csr(file_in) width = np.size(matrix, 1) X = matrix[:, :width-1] y = matrix[:, width-1] if "Over" in sampling_method: sm = SMOTE(ratio=0.95) X, y = sm.fit_sample(X, y) return X, y
def test_wrong_nn():
    kind = 'borderline1'
    nn_m = 'rnd'
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
    nn_k = 'rnd'
    nn_m = NearestNeighbors(n_neighbors=10)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
    kind = 'regular'
    nn_k = 'rnd'
    smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k)
    with raises(ValueError, match="has to be one of"):
        smote.fit_sample(X, Y)
Feature_test = np.concatenate( (Positive_Features_test, Negative_Features_test)) Label_test = np.concatenate( (Positive_Labels_test, Negative_Labels_test)) # print(Label_test) clf = xgb.XGBClassifier() if m == "xGBoost": Feature_train = Features_train_o Label_train = Labels_train_o clf.fit(Feature_train, Label_train) Label_predict = clf.predict(Feature_test) Label_score = clf.predict_proba(Feature_test) elif m == "SMOTE": sm = SMOTE() Feature_train, Label_train = sm.fit_sample( Features_train_o, Labels_train_o) clf.fit(Feature_train, Label_train) Label_predict = clf.predict(Feature_test) Label_score = clf.predict_proba(Feature_test) elif m == "Bayesian": bayes = BayesianNetwork.from_json(bayes_name) Negative_Features_train_prob = bayes.probability( Negative_Features_train) Positive_Features_train_prob = np.zeros( (Num_Positive_train, 1)) for k in range(Num_Positive_train): try: Positive_Features_train_prob[k] = bayes.probability( Positive_Features_train[k]) except KeyError: Positive_Features_train_prob[k] = 0
##test train sets train, test = train_test_split(full_normalized, test_size=0.2, random_state=123) x_train = np.array(train.drop( 'BK', axis=1)) #needed as arrays so that we can "ravel" y_train = np.array(train.loc[:, ['BK']]) x_test = np.array(test.drop('BK', axis=1)) y_test = np.array(test.loc[:, ['BK']]) #Smote Data sm = SMOTE(random_state=123) x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train.ravel()) ##-------------------------------------------------------------------------- #RandomForest rfc = RandomForestClassifier(random_state=123) #parameters param_grid = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy'] } #scoring scoring = {
# Read the data
data = load_svmlight_file(args.datapath)

# Set up the labels and the feature matrix
X, y = data[0], data[1]
print("\nDataset shape: ", X.shape, " Number of features: ", X.shape[1])

# Count the samples per class (based on the target column)
num_categories = np.unique(y).size
sum_y = np.asarray(np.unique(y.astype(int), return_counts=True))
df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
print('\n', df_sum_y)

# Apply SMOTE to generate synthetic samples
sm = SMOTE(k_neighbors=2)
x_resampled, y_resampled = sm.fit_sample(X, y)

# After over-sampling, count the samples per class again
np_resampled_y = np.asarray(
    np.unique(y_resampled.astype(int), return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after over sampling:\n{0}".format(df_resampled_y))

# Initialize the classifier
clf = SVC(kernel=args.kernel,
          gamma=args.gamma,
          C=args.c,
          max_iter=args.max_iter,
          random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())
def smote(x, y):
    print("----SMOTE----")
    sampler = SMOTE(random_state=42)
    X, y = sampler.fit_sample(x, y)
    return X, y
def fit(self, X, y=None):
    smote = SMOTE(ratio='auto', kind='regular')
    X, y = smote.fit_sample(X, y)
    # weights = np.array([1/y.mean() if i == 1 else 1 for i in y])
    return super(AdaBoostClassifier, self).fit(X, y)  # , sample_weight=weights
#%%
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import utils

#%%
# prepare data
x, y = utils.load_data_as_df('dataset/train.data')
smote = SMOTE(sampling_strategy=0.5, random_state=100)
x, y = smote.fit_sample(x, y)

#%%
# tuning parameters
# C scales the loss term rather than the regularization term, so a larger C
# means a weaker penalty on the coefficients and a model more prone to overfitting.
params = {'C': [0.1, 1, 5, 10, 13, 15, 20, 25, 30],
          'solver': ['liblinear', 'sag', 'lbfgs', 'newton-cg']}
k = 5
clf = GridSearchCV(LogisticRegression(random_state=10, max_iter=1000),
                   params, scoring='f1', n_jobs=-1, cv=k)
search = clf.fit(x, y)
results = search.cv_results_
print('best params', search.best_params_)
cbar=False) plt.xlabel("true label") plt.ylabel("predicted label") bottom, top = ax.get_ylim() ax.set_ylim(bottom + 0.5, top - 0.5) plot_conf_mat(y_test, y_preds) #Plot ROC curve and calculate Auc metric plot_roc_curve(svm_model, test_matrix, y_test) # applying SMOTE to balance the data from imblearn.over_sampling import SMOTE sm = SMOTE(random_state=42) X_train1, y_train1 = sm.fit_sample(train_matrix, y_train) y_train1.value_counts() y_train.value_counts() svm_model = LinearSVC() svm_model.fit(X_train1, y_train1) train_pred_svc = svm_model.predict(X_train1) accuracy_train_svc = np.mean(train_pred_svc == y_train1) y_preds = svm_model.predict(test_matrix) accuracy_test_svc = np.mean(y_preds == y_test) print(classification_report(y_test, y_preds)) pd.crosstab(y_test, y_preds) # Applying k-Fold Cross Validation
'solver': ['lbfgs', 'sag'], 'class_weight': ['balanced'], 'penalty': ['l2'], 'C': [.1, .01, .001, .001, .2, .02, .002], 'multi_class': ['multinomial', 'auto'] } gs = GridSearchCV(model, param_grid=lr_params) gs.fit(train_data_tfid, y_train) gs.best_params_ #Modeling (Smote and Regularization) sm = SMOTE() x_reb, y_reb = sm.fit_sample(train_data_tfid, y_train) model = LogisticRegression(C=.001, multi_class='multinomial', penalty='l2', solver='sag') model.fit(x_reb, y_reb) print(model.score(x_reb, y_reb)) print(model.score(test_data_tfid, y_test)) predictions = model.predict(test_data_tfid) pd.DataFrame(predictions, y_test) # Creating pipeline and pickleing
from preprocess.load_data.data_loader import load_production
production_tb = load_production()

# The book's example starts from the line below

# Load the SMOTE class from the library
from imblearn.over_sampling import SMOTE

# Configure SMOTE
# ratio sets how far the minority class is grown relative to the majority class
# ('auto' grows it to the same count; 0.5 grows it to 50% of the majority class)
# k_neighbors is SMOTE's k parameter
# random_state is the random seed (the source of the random number pattern)
sm = SMOTE(ratio='auto', k_neighbors=5, random_state=71)

# Run the over-sampling
balance_data, balance_target = \
    sm.fit_sample(production_tb[['length', 'thickness']],
                  production_tb['fault_flg'])
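A quick way to confirm the effect of the over-sampling above is to compare class counts before and after. This is a minimal sketch assuming the same `production_tb` table and the same older imblearn release that still accepts `ratio` and `fit_sample`:

from collections import Counter

# Class distribution of the original fault flag column.
print('before:', Counter(production_tb['fault_flg']))
# With ratio='auto' the minority class is grown to match the majority class.
print('after:', Counter(balance_target))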
needImpute=True, dropOrNot=True) X_test, y_test = load_data(ROOT_PATH + APS_TEST, skip_first_row=21, y_column_index=0, assignedColumnNames=APS_FULL_COLUMNS, missingSymbol='na', needImpute=True, dropOrNot=True) y_test = to_binary_numeric(y_test, classNeg="neg") y_train = to_binary_numeric(y_train, classNeg="neg") smote = SMOTE(random_state=2333) smote_train_fit = smote.fit_sample(X_train, y_train) smote_test_fit = smote.fit_sample(X_test, y_test) X_train_smote = pd.DataFrame(smote_train_fit[0]) y_train_smote = pd.DataFrame(smote_train_fit[1], columns=['class']) X_test_smote = pd.DataFrame(smote_test_fit[0]) y_test_smote = pd.DataFrame(smote_test_fit[1], columns=['class']) print("-----------\"After Using SMOTE: (Train)\"-------------") print(y_train_smote['class'].value_counts()) print("-----------\"After Using SMOTE: (Test)\"-------------") print(y_test_smote['class'].value_counts()) randForestClf = RandomForestClassifier(n_estimators=50, random_state=2333, oob_score=True) randForestClf.fit(X_train_smote, y_train_smote)
#traindata=pd.get_dummies(train) #x_train=traindata.drop('renewal',axis=1) #y_train=traindata['renewal'] oversampler=SMOTE(random_state=0, ratio=1) rocm=[1,2,3,4,5,6,7,8,9,10] num_folds = 10 subset_size = math.floor(len(traindata)/num_folds) for i in range(num_folds): print(i) training = traindata[:i*subset_size]. append(traindata[(i+1)*subset_size:]) train_data=training.drop(['id','renewal'], axis=1) sm_x,sm_y=oversampler.fit_sample(train_data,training['renewal']) #print(sm_y.sum()/len(sm_y)) lgtrain=lgb.Dataset(sm_x,label=sm_y) clf = lgb.train(params, lgtrain, 700) testing = traindata[i*subset_size:][:subset_size] testd=testing.drop(['renewal','id'], axis=1) testd=pd.get_dummies(testd) lgbpred=clf.predict(testd) for x in range(0,len(lgbpred)): if lgbpred[x]>=.5: # setting threshold to .5 lgbpred[x]=1 else:
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
import xgboost
import numpy as np
from sklearn import metrics

data = pd.read_table('D:/test/14/creditcard.csv', sep=',')
X = data.drop({'Time', 'Class'}, axis=1)
Y = data.Class
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=1234)

# The default rate is very low and the data are severely imbalanced, so the raw
# data should not be modeled directly; over-sample with SMOTE instead.
counts = data.Class.value_counts()
print('Default proportion: %f' % (counts[1] / (counts[0] + counts[1])))
over_sample = SMOTE(random_state=1234)
over_sample_X, over_sample_Y = over_sample.fit_sample(X_train, Y_train)

xgboost = xgboost.XGBClassifier()
xgboost.fit(over_sample_X, over_sample_Y)
xgboost_predict = xgboost.predict(np.array(X_test))
cm = pd.crosstab(xgboost_predict, Y_test)
print('Confusion matrix:')
print(cm)
print('Evaluation based on the confusion matrix:')
print(metrics.classification_report(Y_test, xgboost_predict))
Y_score = xgboost.predict_proba(np.array(X_test))[:, 1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, Y_score)
roc_auc = metrics.auc(fpr, tpr)
print('AUC=%f' % roc_auc)
try: games = pd.read_csv(lg_data_path) games = games.dropna(how='any') dc_columns = get_config(file="dc_columns/{}".format(league)) played_data = games.loc[ (games.Season.isin([1415, 1516, 1617, 1718, 1819])) & (games.played == 1)] target_1x = played_data.FTR.map({"D": 0, "A": 1, "H": 0}) # Select significant columns dc_data = played_data[dc_columns] # Double chance model fit sm = SMOTE(random_state=2) dc_data_res, target_1x_res = sm.fit_sample(dc_data, target_1x.ravel()) model = LogisticRegression(C=1e5) model.fit(dc_data_res, target_1x_res) log.info("0: '1x', 1: 'A' League: {}\t DC score: {}".format( league, model.score(dc_data_res, target_1x_res))) model_filename = get_analysis_root_path( "tools/league_models/{}_dc".format(league)) joblib.dump(model, model_filename) except Exception as e: log.warn("New wdw model not built for {}".format(league).upper()) log.warn("See why:::::: {}".format(e)) log.info("Finished wdw training model")
labels = pd.unique(y) # split dataset into training/test portions X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0, stratify=y) # PCA part pca = PCA(n_components=3).fit(X) X_pca = pca.transform(X) pca = PCA(n_components=3).fit(X_train) X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) # Over-sampling techniques sm = SMOTE(random_state=1) X_sm, y_sm = sm.fit_sample(X,y) X_train_sm, y_train_sm = sm.fit_sample(X_train,y_train) pca = PCA(n_components=3).fit(X_sm) X_sm_pca = pca.transform(X_sm) pca = PCA(n_components=3).fit(X_train_sm) X_train_sm_pca = pca.transform(X_train_sm) X_test_sm_pca = pca.transform(X_test) def draw_learning_curve(X, y, X_pca, filename): clf = GaussianNB() train_sizes,train_scores, test_scores = learning_curve( clf, X, y, cv=10, n_jobs=8) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1)
y, test_size=0.2, random_state=1) #Drop strain category column in x_test and x_all datasets. x_test = x_test.drop(['Strain Category'], axis=1) x_all = x_all.drop(['Strain Category'], axis=1) #Resampling training dataset train_dataset = pd.concat([x_train, y_train], axis=1, sort=False) train_strain_cat = train_dataset.loc[:, ['Strain Category']] train_dataset = train_dataset.drop(['Strain Category'], axis=1) #Resampling sampling_method = SMOTE() train_dataset, train_strain_cat = sampling_method.fit_sample( train_dataset, train_strain_cat) train_dataset = pd.DataFrame(train_dataset) train_dataset.columns = column_names #Drop Strain Category column in dataset variable. #The column is only useful for resampling dataset = dataset.drop(['Strain Category'], axis=1) if resample_on == True: #Not resampled train dataset x_train_no_resample = [] y_train_no_resample = [] x_train_no_resample = x_train x_train_no_resample = x_train_no_resample.drop(['Strain Category'], axis=1) y_train_no_resample = y_train
def ensembleSmote(xydev):
    xdevf, ydev = xydev
    sm = SMOTE(kind='svm', random_state=sh.getConst('smoteSeed'))
    xdevfr, ydevr = sm.fit_sample(xdevf, ydev)
    return (xdevfr, ydevr)
#PLotting the imbalanced dataset sn.FacetGrid(data=df_blob2,hue='labels',size=3).map(plt.scatter,'feature_1','feature_2') plt.legend() plt.xlabel('feature_1') plt.ylabel('feature_2') plt.show() ''' Technique 3:Creating synthetic features (SMOTE) Creates new synthetic features rather than just repeating the features as compared to traditional oversampling ''' from imblearn.over_sampling import SMOTE rus3 = SMOTE(ratio='minority',k_neighbors=3,random_state=42) X3_res, y3_res = rus3.fit_sample(X_res, y_res) y3_res=y3_res.reshape(20000,1) dataset_blob3= np.concatenate((X3_res,y3_res),axis=1) df_blob3=pd.DataFrame(dataset_blob3,columns=('feature_1','feature_2','labels')) print('The number of samples of class 0 and 1:',pd.value_counts(df_blob3['labels'].values, sort=False)) #PLotting the imbalanced dataset sn.FacetGrid(data=df_blob3,hue='labels',size=3).map(plt.scatter,'feature_1','feature_2') plt.legend() plt.xlabel('feature_1') plt.ylabel('feature_2') plt.show() '''
from ay_hw_4._global import ROOT_PATH, APS_SHRINK, APS_FULL_COLUMNS from ay_hw_4.util_data import load_data, to_binary_numeric GENERATED_SMOTE_TRAIN_DATA_FILE_PATH = './gen_smote_train_shrink_data_set.csv' if __name__ == "__main__": X_train, y_train = load_data(ROOT_PATH + APS_SHRINK, skip_first_row=21, y_column_index=0, assignedColumnNames=APS_FULL_COLUMNS, missingSymbol='na', needImpute=True, dropOrNot=True) smote = SMOTE(random_state=2333) smote_train_fit = smote.fit_sample(X_train, y_train) X_train_smote = pd.DataFrame(smote_train_fit[0]) y_train_smote = pd.DataFrame(smote_train_fit[1], columns=['class']) export_smote_train_data = pd.concat([y_train_smote, X_train_smote], axis=1) # export data to csv export_smote_train_data.to_csv(GENERATED_SMOTE_TRAIN_DATA_FILE_PATH, sep=',', index=False) smote_train_data = convert.load_any_file( filename=GENERATED_SMOTE_TRAIN_DATA_FILE_PATH) smote_train_data.class_is_first() # load logistic model tree algorithm log_tree = Classifier(classname="weka.classifiers.trees.LMT") eval_smote_train_obj = Evaluation(smote_train_data)
"""COUNT OF ORIGINAL DATA""" class_zero = 0 class_one = 0 for i in range(0,len(y_train)): if y_train[i]==0: class_zero = class_zero+1 else: class_one = class_one + 1 print(class_zero, class_one) """OVERSAMPLING""" smt = SMOTE() over_x_train, over_y_train = smt.fit_sample(x_train_mod, y_train) np.bincount(over_y_train) """UNDERSAMPLING""" nr = NearMiss() under_x_train, under_y_train = nr.fit_sample(x_train_mod, y_train) np.bincount(under_y_train) """CLASSIFIER""" def acc(pred, actual): tp = fp = tn = fn = 0 for i in range(0, len(pred)): if np.round(pred[i]) == actual[i]: if np.round(pred[i])==0:
labelencoder_y = LabelEncoder()
df_product['gender'] = labelencoder_y.fit_transform(df_product['gender'])

# Target variable - class proportion
df_product.groupby(['gender'])['day_of_week_1'].count()

# Creating X - feature and y - target. Train test split - 80:20
X = df_product.iloc[:, :-1].values
y = df_product.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Oversampling using SMOTE - Synthetic Minority Oversampling Technique
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)
print(X_train.shape)
print(y_train.shape)

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
score = ((tp / (tp + fn)) + (tn / (tn + fp))) / 2
print("Score of the algorithm (computed as given in question):", score)
print("General accuracy:", accuracy_score(y_test, y_pred))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))

# Neural network
# %% Feature selection dataProsSampleScaledSelected, classes = function.featureSelection( dataProsSampleScaled, y) dataProsSampleScaledSelected_n, classes_n = function.featureSelection( dataProsSample_n, y_n) # %% Transform data pca = PCA(n_components=2) X = pca.fit_transform(dataProsSampleScaledSelected) x_n = pca.fit_transform(dataProsSampleScaledSelected_n) plotter.plot_2d_space(X, y, 'Imbalanced dataset (2 PCA components)') # %% smote = SMOTE(ratio='minority') X_sm, y_sm = smote.fit_sample(X, y) plotter.plot_2d_space(X_sm, y_sm, 'SMOTE over-sampling') # %% matrixes = function.run(dataProsSampleScaledSelected_n, y_n, dataLabel.columns) # %% dataTail = data.tail(1000) # .where(data["poistunut"] == 1) dataTail = dataTail.dropna(subset=["poistunut"]) dataTailPros = pre.prepare(dataTail.copy()) y_tail = dataTailPros.loc[:, "poistunut"] drop = ["poistunut", "kasko_poistunut", "fetu_poistunut", "liikenne_poistunut"] dataLabelTail, labelsTail = pre.labeling(dataTailPros.drop(columns=drop))
cbar=False) plt.xlabel("true label") plt.ylabel("predicted label") bottom, top = ax.get_ylim() ax.set_ylim(bottom + 0.5, top - 0.5) plot_conf_mat(y_test, y_preds) #Plot ROC curve and calculate Auc metric plot_roc_curve(svm_model, test_matrix, y_test) # applying SMOTE to balance the data from imblearn.over_sampling import SMOTE sm = SMOTE(random_state=42) X_train1, y_train1 = sm.fit_sample(X_train_transformed, y_train) y_train1.value_counts() y_train.value_counts() svm_model = LinearSVC() svm_model.fit(X_train1, y_train1) train_pred_svc = svm_model.predict(X_train1) accuracy_train_svc = np.mean(train_pred_svc == y_train1) accuracy_train_svc y_preds = svm_model.predict(X_test_transformed) accuracy_test_svc = np.mean(y_preds == y_test) print(classification_report(y_test, y_preds)) pd.crosstab(y_test, y_preds) # Applying k-Fold Cross Validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#print("Number transactions training datasets: ", X_train.shape)
#print("Number transactions testing datasets: ", X_test.shape)

# Handling class imbalance
#one_hot["Loan_Status"].value_counts()  # Returns the number of elements in each category
from imblearn.over_sampling import SMOTE
#print("Before OverSampling - # of label 1: {}".format(sum(y_train==1)))
#print("Before OverSampling - # of label 0: {} \n".format(sum(y_train==0)))

sm = SMOTE(sampling_strategy=1.0, random_state=25)
X_train_new, y_train_new = sm.fit_sample(X_train, y_train)

#print("==============================================")
#print('After OverSampling - X_train shape: {}'.format(X_train_new.shape))
#print('After OverSampling - t_train shape: {} \n'.format(y_train_new.shape))
#print("After OverSampling - # of label 1: {}".format(sum(y_train_new==1)))
#print("After OverSampling - # of label 0: {}".format(sum(y_train_new==0)))

# Building the model
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score

# Logistic Regression
X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']

# In[101]:

from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X, os_data_y = os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# we can check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print("Number of no subscription in oversampled data", len(os_data_y[os_data_y['y'] == 0]))
print("Number of subscription", len(os_data_y[os_data_y['y'] == 1]))
print("Proportion of no subscription data in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 0]) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ",
      len(os_data_y[os_data_y['y'] == 1]) / len(os_data_X))

# ### Recursive feature elimination

# In[102]:
def classifyUsers(profile, class_names=["Negative", "Positive"],
                  stand_col_names=None, im_balance=False):
    """
    Classify users based on the input user features. The last column of the
    dataset is the feature to predict; the data are automatically normalized
    and rows with missing values are dropped. Imbalanced datasets can also be
    rebalanced. The input DataFrame must contain no nominal or string
    variables; encode them beforehand.
    Three classification algorithms are used by default: decision tree,
    random forest, and logistic regression.
    Results are generated automatically, including the confusion-matrix
    metrics and the important model parameters.
    :param profile: DataFrame of user features; the last column is the prediction target
    :param class_names: names of the classes
    :param stand_col_names: feature names to normalize; if None, all numeric features are normalized
    :param im_balance: whether to use Borderline-SMOTE to balance the two classes
    :return:
    """
    # profile = pd.read_csv("/home/maoan/maidianAnalysis/level3-growth/userProfile.csv")
    # profile = pd.read_csv("/home/maoan/maidianAnalysis/level3-growth/user_actions.csv")
    # profile = pd.read_csv(profile_path)

    ## Construct the features name.
    features = profile.columns.tolist()[:-1]
    pred_feature = profile.columns.tolist()[-1]

    ## basic data preprocessing
    profile.dropna(axis=0, how='any', inplace=True)
    # col_names = ['Freq','BattleRatio']
    # profile = transfromFeatures(profile)
    # standardization
    norm_profile, _ = standarization(profile, stand_col_names)
    print("Features are: {0}".format(features))
    X = profile[features].as_matrix()
    y = profile[pred_feature].as_matrix()
    norm_x = norm_profile[features].as_matrix()
    norm_y = norm_profile[pred_feature].as_matrix()

    # dealing with the imbalanced data problem
    if im_balance:
        print('Original dataset shape {}'.format(Counter(norm_y)))
        print(np.median(norm_x, axis=0))
        sm = SMOTE(random_state=42, kind="borderline2")
        X_res, y_res = sm.fit_sample(norm_x, norm_y)
        X, y = X_res, y_res
        norm_x, norm_y = X_res, y_res
        # class_names = ['Non-VIP', 'VIP']

    ## choose the classifier and set the parameters
    min_split = 20
    max_dep = 3
    decisionTreeClassify(X, y, features, class_names,
                         min_samples_split=min_split, max_depth=max_dep)
    # use normalized data
    logiRegressionClassify(norm_x, norm_y, features, class_names, penalty="l1")
    randomForestClassify(X, y, features, class_names, n_estimators=250)
X_ts3, y_ts3, _ = load_data(testing_data_path3) y_tr3 = np.array(map(int, y_tr3)) y_ts3 = np.array(map(int, y_ts3)) # generating a big dataset with training and testing samples X3 = np.concatenate((X_tr3, X_ts3)) Y3 = np.concatenate((y_tr3, y_ts3)) sm = SMOTE(random_state=42) ''' # Resampling only training data X1,Y1 = sm.fit_sample(X_tr1, y_tr1) X2,Y2 = sm.fit_sample(X_tr2, y_tr2) X3,Y3 = sm.fit_sample(X_tr3, y_tr3) ''' # Resampling the entire datasets X1, Y1 = sm.fit_sample(X1, Y1) X2, Y2 = sm.fit_sample(X2, Y2) X3, Y3 = sm.fit_sample(X3, Y3) pos_names = dict( zip(['A', 'C', 'D', 'F', 'I', 'N', 'P', 'R', 'S', 'V', 'W', 'Z'], [ 'adjective', 'conjunction', 'determiner', 'punctuation', 'interjection', 'noun', 'pronoun', 'adverb', 'adposition', 'verb', 'date', 'number' ])) clf1 = tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4, max_features=None, max_leaf_nodes=None,
def KFoldCrossValidation(train_and_test_indexes, X_data_frame, y_data_frame, k_value=3, kcv_value=9, smote=True, debug=False): train_indexes = train_and_test_indexes[0] #print('Train Indexes:',train_indexes) test_indexes = train_and_test_indexes[1] #print('Test Indexes:',test_indexes) knn = KNeighborsClassifier(n_neighbors=k_value) #if debug: #print("Train Index: ", train_index, "\n") #print("Test Index: ", test_index, "\n") # STEP 1: split data between test and train sets if debug: print('* Starting train and test sets splitting... ', end='') y_data = np.ravel(y_data_frame) # Added to solve column-vector issue X_train, X_test, y_train, y_test = X_data_frame[ train_indexes], X_data_frame[test_indexes], y_data[ train_indexes], y_data[test_indexes] #print('y_data[test_indexes]:',y_data[test_indexes]) if debug: print('Done!') # print the shapes of the new X objects if debug: print('* Display X and y objects\'s shape:') print('\t X_train.shape: ', X_train.shape) print('\t X_test.shape: ', X_test.shape) print('\t y_train.shape: ', y_train.shape) print('\t y_test.shape: ', y_test.shape) # SMOTE HERE if smote: # Oversampling training data using SMOTE if debug: print('* Starting to oversample training data using SMOTE...') print( '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) print( '\t -Number of instances inside TEST set from each class BEFORE to apply SMOTE=', (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2))) print( '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) print( '\t -Number of instances inside TEST set from each class BEFORE to apply SMOTE=', (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2))) from imblearn.over_sampling import SMOTE smt = SMOTE() X_train, y_train = smt.fit_sample(X_train, y_train) if debug: print('\t -Instances amount from each class AFTER to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) #print('y_train:',y_train) knn.fit(X_train, y_train) y_pred = knn.predict(X_test) #print('y_test:',y_data[test_indexes]) #print('y_pred=',y_pred) # comparing actual response values (y_test) with predicted response values (y_pred) this_accuracy = metrics.accuracy_score(y_test, y_pred) this_confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None) return this_accuracy, this_confusion_matrix
def runMODEL(X_data, y_data, k_value=3, knn_debug=False, use_smote=True, use_rescaling=True, cv_type='kcv', kcv_value=9, use_Pool=False, model=__MODEL): # Main variables # global __MODEL # model = __MODEL #knn = KNeighborsClassifier(n_neighbors=k_value) accuracy = 0 confusion_matrix = [] if knn_debug: print('* Checking arguments formating...') print('\t-X_data.shape=', X_data.shape) print('\t-y_data.shape=', y_data.shape) # Data preparation try: new_dimensions = (X_data.shape[0], X_data.shape[1] * X_data.shape[2]) except IndexError: print('** IndexValue exception') print('\tX_data.shape=', X_data.shape) print('\ty_data.shape=', y_data.shape) print('\t') sys.exit(-1) if knn_debug: print('* Reshaping for this data partition with these dimensions:', new_dimensions) new_partition = np.reshape(X_data, new_dimensions) if knn_debug: print('...done') if knn_debug: print('* The shape of a line data retrived from the new partition=', new_partition[0].shape) ## KNN preparation # Preparing data to use with PANDAS X_pandas = pd.DataFrame(data=new_partition) y_pandas = pd.DataFrame(data=y_data) if knn_debug: print('* Preparing data to use with Pandas...') print('X_pandas=\n', X_pandas) print('y_pandas=\n', y_pandas) # Rescalling data if use_rescaling: if knn_debug: print('* Rescalling data... ', end='') from sklearn import preprocessing scaler = preprocessing.StandardScaler() X_pandas = scaler.fit_transform( X_pandas) # Fit your data on the scaler object if knn_debug: print('done') print('Rescaled X_pandas=\n', X_pandas) ####################################################### if cv_type == 'kcv': # K-FOLD CROSS VALIDATION scores = [] matrices = [] all_results = [] cv = KFold(n_splits=kcv_value, random_state=42, shuffle=True) both_indexes = cv.split(X_pandas) #----------------------------------- if not use_Pool: # Single Thread KCrossValidation # Single Thread KCrossValidation for indexes in both_indexes: result = KFoldCrossValidation(indexes, X_pandas, y_pandas, k_value, kcv_value, use_smote, knn_debug) all_results.append(result) for acc_with_cmat in all_results: acc = acc_with_cmat[0] cmat = acc_with_cmat[1] scores.append(acc) matrices.append(cmat) np_scores = np.array(scores) best_pos = np_scores.argmax() accuracy = np.mean(np_scores) confusion_matrix = matrices[best_pos] #----------------------------------- else: # Multi Thread KCrossValidation # NOT WORKING YET!! cores_num = multiprocessing.cpu_count() with Pool(processes=cores_num) as p: from functools import partial all_results = p.map( partial(KFoldCrossValidation, X_data_frame=X_pandas, y_data_frame=y_pandas, k_value=k_value, kcv_value=kcv_value, smote=use_smote, debug=knn_debug), both_indexes) for acc_with_cmat in all_results: acc = acc_with_cmat[0] cmat = acc_with_cmat[1] scores.append(acc) matrices.append(cmat) np_scores = np.array(scores) best_pos = np_scores.argmax() accuracy = np.mean(np_scores) confusion_matrix = matrices[best_pos] ######################### else: # validation with simple split data between test and train sets if knn_debug: print('* Starting train and test sets splitting... 
', end='') X_train, X_test, y_train, y_test = train_test_split(X_pandas, np.ravel(y_pandas), test_size=0.3, random_state=12) if knn_debug: print('done') # print the shapes of the new X objects if knn_debug: print('* Display X and y objects\'s shape:') print('\t X_train.shape: ', X_train.shape) print('\t X_test.shape: ', X_test.shape) print('\t y_train.shape: ', y_train.shape) print('\t y_test.shape: ', y_test.shape) if use_smote: # Oversampling training data using SMOTE if knn_debug: print('* Starting to oversample training data using SMOTE...') print( '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) print( '\t -Number of instances inside TEST set from each class BEFORE to apply SMOTE=', (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2))) print( '\t -Number of instances inside TRAIN set from each class BEFORE to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) print( '\t -Number of instances inside TEST set from each class BEFORE to apply SMOTE=', (sum(y_test == 0), sum(y_test == 1), sum(y_test == 2))) from imblearn.over_sampling import SMOTE smt = SMOTE() X_train, y_train = smt.fit_sample(X_train, y_train) if knn_debug: print( '\t -Instances amount from each class AFTER to apply SMOTE=', (sum(y_train == 0), sum(y_train == 1), sum(y_train == 2))) # print the shapes of the new X and y objects if knn_debug: print( '* Display X and y objects\'s shape after apply SMOTE to this sets:' ) print('\t X_train.shape: ', X_train.shape) print('\t X_test.shape (should be the same): ', X_test.shape) print('\t y_train.shape: ', y_train.shape) print('\t y_test.shape (should be the same): ', y_test.shape) # STEP 2: train the model on the training set #knn = KNeighborsClassifier(n_neighbors=k_value) model.fit(X_train, y_train) # STEP 3: make predictions on the testing set y_pred = model.predict(X_test) #if knn_debug: # print('y_pred=\n',y_pred) # print('y_pred.shape:',y_pred.shape) # compare actual response values (y_test) with predicted response values (y_pred) accuracy = metrics.accuracy_score(y_test, y_pred) confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None) return accuracy, confusion_matrix
def test_smote_wrong_kind():
    kind = 'rnd'
    smote = SMOTE(kind=kind, random_state=RND_SEED)
    with raises(ValueError, match="Unknown kind for SMOTE"):
        smote.fit_sample(X, Y)
from sklearn.linear_model import LogisticRegression # code starts here model = LogisticRegression(random_state=6) model.fit(X_train, y_train) y_pred = model.predict(X_test) score = accuracy_score(y_pred, y_test) # Code ends here # -------------- from sklearn.preprocessing import StandardScaler from imblearn.over_sampling import SMOTE # code starts here smote = SMOTE(random_state=9) X_train, y_train = smote.fit_sample(X_train, y_train) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) # Code ends here # -------------- # Code Starts here model = LogisticRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) score = accuracy_score(y_pred, y_test) # Code ends here
del a[3] if (radon): del a[4] if (zipc): del a[5] a = np.array(a) b.append(a) b = np.array(b) return b X_train = deleteFeatures(False, False, False, False, False, False, X_train) X_test = deleteFeatures(False, False, False, False, False, False, X_test) smt = SMOTE() X_train, y_train = smt.fit_sample(X_train, y_train) X_test, y_test = smt.fit_sample(X_test, y_test) # Function to get metrics def evaluate_model(metrics, model, y_test, X_test): y_pred = model.predict(X_test, verbose=1) y_pred_coded = np.where(y_pred > 0.5, 1, 0) y_pred_coded = y_pred_coded.flatten() metric = [] metric.append(['f1score', f1_score(y_test, y_pred_coded)]) metric.append(['precision', precision_score(y_test, y_pred_coded)]) metric.append(['recall', recall_score(y_test, y_pred_coded)]) metric.append(['accuracy', accuracy_score(y_test, y_pred_coded)]) metrics.append(metric) return metrics, y_pred
logger.info('ROC AUC score: ' + str(roc_auc_score(y_val, prob1))) logger.info('Precision Recall AUC score: ' + str(funciones.precision_recall_auc_score(y_val, prob1))) logger.info('F1 score: ' + str(f1_score(y_val, pred1))) logger.info('Balanced accuracy score: ' + str(balanced_accuracy_score(y_val, pred1))) logger.info('Precission score: ' + str(precision_score(y_val, pred1))) logger.info('Recall score: ' + str(recall_score(y_val, pred1))) logger.info('***' * 20) ############ Metodo 8: Smote oversampling from imblearn.over_sampling import SMOTE logger.info('SMOTE: ') smote = SMOTE(sampling_strategy=samp, random_state=42, n_jobs=n_proc) X_rus, y_rus = smote.fit_sample(x_tra, y_tra) classifier8 = clone(classifier) classifier8.fit(X_rus, y_rus) pred1 = classifier8.predict(x_val) prob1 = classifier8.predict_proba(x_val)[:, 1] print('\n\nSMOTE: ') print('ROC AUC score: ' + str(roc_auc_score(y_val, prob1))) print('Precision Recall AUC score: ' + str(funciones.precision_recall_auc_score(y_val, prob1))) print('F1 score: ' + str(f1_score(y_val, pred1))) print('Balanced accuracy score: ' + str(balanced_accuracy_score(y_val, pred1))) print('Precission score: ' + str(precision_score(y_val, pred1))) print('Recall score: ' + str(recall_score(y_val, pred1))) logger.info('ROC AUC score: ' + str(roc_auc_score(y_val, prob1)))
from imblearn.over_sampling import SMOTE # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Borderline SMOTE 1 sm = SMOTE(kind='borderline1') X_resampled, y_resampled = sm.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
def run_program(hparams, FLAGS): # load dataset num_folds = FLAGS.num_folds data_dir = FLAGS.data_dir data_version = 2013 output_dir = FLAGS.output_dir classes = ['W', 'N1', 'N2', 'N3', 'REM'] n_classes = len(classes) path, channel_ename = os.path.split(data_dir) traindata_dir = os.path.join( os.path.abspath(os.path.join(data_dir, os.pardir)), 'traindata/') print(str(datetime.now())) def evaluate_model(hparams, X_test, y_test, classes): # acc_track = [] n_classes = len(classes) y_true = [] y_pred = [] alignments_alphas_all = [] # (batch_num,B,max_time_step,max_time_step) for batch_i, (source_batch, target_batch) in enumerate( batch_data(X_test, y_test, hparams.batch_size)): pred_outputs_ = sess.run(pred_outputs, feed_dict={ inputs: source_batch, keep_prob_: 1.0 }) alignments_alphas = sess.run(dec_states.alignment_history.stack(), feed_dict={ inputs: source_batch, dec_inputs: target_batch[:, :-1], keep_prob_: 1.0 }) # acc_track.append(np.mean(dec_input == target_batch)) pred_outputs_ = pred_outputs_[:, :hparams. max_time_step] # remove the last prediction <EOD> target_batch_ = target_batch[:, 1: -1] # remove the last <EOD> and the first <SOD> # acc_track.append(pred_outputs_ == target_batch_) alignments_alphas = alignments_alphas.transpose((1, 0, 2)) alignments_alphas = alignments_alphas[:, :hparams.max_time_step] alignments_alphas_all.append(alignments_alphas) _y_true = target_batch_.flatten() _y_pred = pred_outputs_.flatten() y_true.extend(_y_true) y_pred.extend(_y_pred) cm = confusion_matrix(y_true, y_pred, labels=range(n_classes)) ck_score = cohen_kappa_score(y_true, y_pred) acc_avg, acc, f1_macro, f1, sensitivity, specificity, PPV = evaluate_metrics( cm, classes) # print ("batch_i: {}").format(batch_i) print( 'Average Accuracy -> {:>6.4f}, Macro F1 -> {:>6.4f} and Cohen\'s Kappa -> {:>6.4f} on test set' .format(acc_avg, f1_macro, ck_score)) for index_ in range(n_classes): print( "\t{} rhythm -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1 : {:1.4f} Accuracy: {:1.4f}" .format(classes[index_], sensitivity[index_], specificity[index_], PPV[index_], f1[index_], acc[index_])) print( "\tAverage -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1-score: {:1.4f}, Accuracy: {:1.4f}" .format(np.mean(sensitivity), np.mean(specificity), np.mean(PPV), np.mean(f1), np.mean(acc))) return acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all def count_prameters(): print( '# of Params: ', np.sum([ np.prod(v.get_shape().as_list()) for v in tf.trainable_variables() ])) for fold_idx in range(num_folds): start_time_fold_i = time.time() data_loader = SeqDataLoader(data_dir, num_folds, fold_idx, classes=classes) X_train, y_train, X_test, y_test = data_loader.load_data( seq_len=hparams.max_time_step) # preprocessing char2numY = dict(zip(classes, range(len(classes)))) pre_f1_macro = 0 # <SOD> is a token to show start of decoding and <EOD> is a token to indicate end of decoding char2numY['<SOD>'] = len(char2numY) char2numY['<EOD>'] = len(char2numY) num2charY = dict(zip(char2numY.values(), char2numY.keys())) # over-sampling: SMOTE: X_train = np.reshape(X_train, [X_train.shape[0] * X_train.shape[1], -1]) y_train = y_train.flatten() nums = [] for cl in classes: nums.append(len(np.where(y_train == char2numY[cl])[0])) if (os.path.exists(traindata_dir) == False): os.mkdir(traindata_dir) fname = os.path.join( traindata_dir, 'trainData_' + channel_ename + '_SMOTE_all_10s_f' + str(fold_idx) + '.npz') if (os.path.isfile(fname)): X_train, y_train, _ = 
data_loader.load_npz_file(fname) else: n_osamples = nums[2] - 7000 ratio = { 0: n_osamples if nums[0] < n_osamples else nums[0], 1: n_osamples if nums[1] < n_osamples else nums[1], 2: nums[2], 3: n_osamples if nums[3] < n_osamples else nums[3], 4: n_osamples if nums[4] < n_osamples else nums[4] } sm = SMOTE(random_state=12, ratio=ratio) X_train, y_train = sm.fit_sample(X_train, y_train) data_loader.save_to_npz_file(X_train, y_train, data_loader.sampling_rate, fname) X_train = X_train[:(X_train.shape[0] // hparams.max_time_step) * hparams.max_time_step, :] y_train = y_train[:(X_train.shape[0] // hparams.max_time_step) * hparams.max_time_step] X_train = np.reshape(X_train, [-1, X_test.shape[1], X_test.shape[2]]) y_train = np.reshape(y_train, [ -1, y_test.shape[1], ]) # shuffle training data_2013 permute = np.random.permutation(len(y_train)) X_train = np.asarray(X_train) X_train = X_train[permute] y_train = y_train[permute] # add '<SOD>' to the beginning of each label sequence, and '<EOD>' to the end of each label sequence (both for training and test sets) y_train = [[char2numY['<SOD>']] + [y_ for y_ in date] + [char2numY['<EOD>']] for date in y_train] y_train = np.array(y_train) y_test = [[char2numY['<SOD>']] + [y_ for y_ in date] + [char2numY['<EOD>']] for date in y_test] y_test = np.array(y_test) print('The training set after oversampling: ', classes) for cl in classes: print(cl, len(np.where(y_train == char2numY[cl])[0])) # training and testing the model if (os.path.exists(FLAGS.checkpoint_dir) == False): os.mkdir(FLAGS.checkpoint_dir) if (os.path.exists(output_dir) == False): os.makedirs(output_dir) loss_track = [] with tf.Graph().as_default(), tf.Session() as sess: # Placeholders inputs = tf.placeholder( tf.float32, [None, hparams.max_time_step, hparams.input_depth], name='inputs') targets = tf.placeholder(tf.int32, (None, None), 'targets') dec_inputs = tf.placeholder(tf.int32, (None, None), 'decoder_inputs') keep_prob_ = tf.placeholder(tf.float32, name='keep') # model logits, pred_outputs, loss, optimizer, dec_states = build_whole_model( hparams, char2numY, inputs, targets, dec_inputs, keep_prob_) count_prameters() sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) saver = tf.train.Saver() print(str(datetime.now())) ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx) ckpt_exist = False for file in os.listdir(FLAGS.checkpoint_dir): if file.startswith(ckpt_name): ckpt_exist = True ckpt_name = os.path.join(FLAGS.checkpoint_dir, ckpt_name) if ckpt_exist: saver.restore(sess, ckpt_name) evaluate_model(hparams, X_test, y_test, classes) else: for epoch_i in range(hparams.epochs): start_time = time.time() # train_acc = [] y_true = [] y_pred = [] for batch_i, (source_batch, target_batch) in enumerate( batch_data(X_train, y_train, hparams.batch_size)): _, batch_loss, batch_logits = sess.run( [optimizer, loss, logits], feed_dict={ inputs: source_batch, dec_inputs: target_batch[:, :-1], targets: target_batch[:, 1:], keep_prob_: 0.5 } #, ) loss_track.append(batch_loss) # train_acc.append(batch_logits.argmax(axis=-1) == target_batch[:,1:]) y_pred_ = batch_logits[:, :hparams. 
max_time_step].argmax(axis=-1) y_true_ = target_batch[:, 1:-1] y_true.extend(y_true_) y_pred.extend(y_pred_) # accuracy = np.mean(train_acc) y_true = np.asarray(y_true) y_pred = np.asarray(y_pred) y_true = y_true.flatten() y_pred = y_pred.flatten() n_examples = len(y_true) cm = confusion_matrix(y_true, y_pred, labels=range(len(char2numY) - 2)) accuracy = np.mean(y_true == y_pred) mf1 = f1_score(y_true, y_pred, average="macro") ck_score = cohen_kappa_score(y_true, y_pred) print( 'Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} F1-score: {:>6.4f} Cohen\'s Kappa: {:>6.4f} Epoch duration: {:>6.3f}s' .format(epoch_i, np.mean(batch_loss), accuracy, mf1, ck_score, time.time() - start_time)) if (epoch_i + 1) % hparams.test_step == 0: acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all = evaluate_model( hparams, X_test, y_test, classes) if np.nan_to_num( f1_macro ) > pre_f1_macro: # save the better model based on the f1 score print( 'Loss {:.4f} after {} epochs (batch_size={})'. format(loss_track[-1], epoch_i + 1, hparams.batch_size)) pre_f1_macro = f1_macro ckpt_name = "model_fold{:02d}.ckpt".format( fold_idx) save_path = os.path.join(FLAGS.checkpoint_dir, ckpt_name) saver.save(sess, save_path) print( "The best model (till now) saved in path: %s" % save_path) # Save save_dict = { "y_true": y_true, "y_pred": y_pred, "ck_score": ck_score, "alignments_alphas_all": alignments_alphas_all[: 200], # we save just the first 200 batch results because it is so huge } filename = "output_" + channel_ename + "_fold{:02d}.npz".format( fold_idx) save_path = os.path.join(output_dir, filename) np.savez(save_path, **save_dict) print( "The best results (till now) saved in path: %s" % save_path) # plt.plot(loss_track) # plt.show() print(str(datetime.now())) print('Fold{} took: {:>6.3f}s'.format( fold_idx, time.time() - start_time_fold_i))
    print(len(x_features_test))
    return (x_features_train, x_features_test, x_labels_train, x_labels_test)


data = pd.read_csv('creditcard.csv')
os = SMOTE(random_state=0)  # We are using SMOTE as the function for oversampling
print(os)

# now we can divide our data into training and test data
# Call our data preparation method on our dataset
data_train_X, data_test_X, data_train_y, data_test_y = data_prepration(data)
print(type(data_test_X))
columns = data_train_X.columns

os_data_X, os_data_y = os.fit_sample(data_train_X, data_train_y)  # The arrays containing the resampled data.
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=["Class"])

# we can check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print("Number of normal transaction in oversampled data",
      len(os_data_y[os_data_y["Class"] == 0]))
print("No. of fraud transaction", len(os_data_y[os_data_y["Class"] == 1]))
print("Proportion of Normal data in oversampled data is ",
      len(os_data_y[os_data_y["Class"] == 0]) / len(os_data_X))
print("Proportion of fraud data in oversampled data is ",
      len(os_data_y[os_data_y["Class"] == 1]) / len(os_data_X))

os_data_X["Normalized Amount"] = StandardScaler().fit_transform(
    os_data_X['Amount'].values.reshape(-1, 1))
def smote_tech(X, Y):
    smote = SMOTE(ratio='minority')
    X_sm, Y_sm = smote.fit_sample(X, Y)
    return X_sm, Y_sm
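A minimal, self-contained usage sketch for `smote_tech`. The synthetic dataset is a made-up illustration, and the call assumes the same older imblearn API (`ratio=`, `fit_sample`) used in the snippet above:

from collections import Counter

from sklearn.datasets import make_classification

# Build an imbalanced toy problem (roughly 10% positives).
X, Y = make_classification(n_samples=500, n_features=10,
                           weights=[0.9, 0.1], random_state=0)
print('before:', Counter(Y))

# ratio='minority' resamples only the minority class up to the majority count.
X_sm, Y_sm = smote_tech(X, Y)
print('after:', Counter(Y_sm))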