def ReSampling(self, data, labels, over_s=True):
    """Rebalance a binary (0/1) labelled data set with ADASYN.

    If one of the two classes is entirely absent, a single all-zero dummy
    row carrying the missing label is prepended so downstream consumers
    always see both labels.  When the class ratio is already moderate
    (between 0.2 and 5) the data are returned untouched.  Otherwise ADASYN
    oversamples towards a majority : minority ratio of 1 : 0.4; if ADASYN
    fails (e.g. too few minority neighbours) it falls back to random over-
    or under-sampling depending on ``over_s``.

    Args:
        data: 2-D feature array (rows are samples).
        labels: 1-D array of 0/1 class labels.
        over_s (bool): on ADASYN failure, oversample (True) or
            undersample (False) randomly.

    Returns:
        Tuple ``(data, labels)`` after resampling.
    """
    label_status = Counter(labels)
    print(self.tasktype, "data " + self.tasktype, label_status)
    featurelen = len(data[0])
    # If a class is missing entirely, prepend one dummy all-zero sample
    # carrying the missing label.  BUG FIX: np.int was removed in
    # NumPy 1.24; the builtin int is the documented replacement.
    if 1 not in label_status.keys():
        x, y = np.zeros(shape=featurelen, dtype=int), 1
    elif 0 not in label_status.keys():
        x, y = np.zeros(shape=featurelen, dtype=int), 0
    else:
        x, y = None, None
    if x is not None:
        data = np.insert(data, 0, x, 0)
        labels = np.insert(labels, 0, y, 0)
    if len(label_status) < 2:
        print(self.tasktype, "no need to resample")
        return data, labels
    # A class ratio between 0.2 and 5 counts as acceptably balanced.
    ratio = label_status[1] / label_status[0]
    if 0.2 < ratio < 5.:
        print("data are not biased too much")
        return data, labels
    # Oversample the minority class up to 40% of the majority class size.
    maxSamples = label_status[0]
    if label_status[1] > label_status[0]:
        maxSamples = label_status[1]
        resampling = over_sampling.ADASYN(ratio={
            1: maxSamples,
            0: int(0.4 * maxSamples)
        })
    else:
        resampling = over_sampling.ADASYN(ratio={
            0: maxSamples,
            1: int(0.4 * maxSamples)
        })
    try:
        data, labels = resampling.fit_sample(data, labels)
    # BUG FIX: narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt / SystemExit.
    except Exception:
        print(self.tasktype, "resampling using random method")
        if over_s:
            resampling = over_sampling.RandomOverSampler()
        else:
            resampling = under_sampling.RandomUnderSampler()
        data, labels = resampling.fit_sample(data, labels)
    label_status = Counter(labels)
    print(self.tasktype, "sampling status=", label_status)
    return data, labels
def execute_adasyn(df, label, minority_class):
    """Augment ``df`` with synthetic minority-class rows generated by ADASYN.

    Args:
        df: DataFrame holding the class column ``label`` plus feature columns.
        label: name of the class column.
        minority_class: "positive_class" or anything else; only controls the
            "pos_"/"neg_" prefix of the synthetic rows' index names.

    Returns:
        ``df`` with the synthetic examples appended (class column first).
    """

    def _clean_row(array, row_idx):
        # Zero out small values and round the rest to 2 decimals.
        # NOTE(review): the "< 10" threshold is inherited from the original
        # code — confirm it matches the feature scale.
        return [0 if v < 10 else round(v, 2) for v in list(array[row_idx])]

    # BUG FIX: .ix and .as_matrix() were removed from pandas (1.0);
    # .loc / .to_numpy() are the supported replacements.
    X_df = df.loc[:, df.columns != label]
    features = X_df.columns
    y_df = df[[label]]
    X_mat = X_df.to_numpy()
    y_mat = y_df.to_numpy().ravel()
    # NOTE(review): k= and fit_sample() belong to older imbalanced-learn
    # releases (newer ones use n_neighbors= / fit_resample) — confirm the
    # pinned imblearn version.
    adasyn_obj = over_sampling.ADASYN(k=30)
    X_mat_new, y_mat_new = adasyn_obj.fit_sample(X_mat, y_mat)
    # ADASYN appends synthetic rows after the originals.
    new_examples_count = X_mat_new.shape[0] - X_mat.shape[0]
    prefix = "pos_" if minority_class == "positive_class" else "neg_"
    new_rownames = [prefix + str(i) for i in range(new_examples_count)]
    X_mat_new_examples = X_mat_new[X_mat.shape[0]:]
    # BUG FIX: numpy.array(map(...)) produces a useless 0-d object array on
    # Python 3 because map() is lazy; build the list explicitly.
    X_mat_new_examples = numpy.array(
        [_clean_row(X_mat_new_examples, i) for i in range(new_examples_count)])
    X_df_new = pandas.DataFrame(X_mat_new_examples,
                                index=new_rownames,
                                columns=features)
    y_df_new = pandas.DataFrame(y_mat_new[X_mat.shape[0]:],
                                index=new_rownames,
                                columns=['class'])
    # Class column first, then the features, appended below the original df.
    new_examples_df = pandas.concat([y_df_new, X_df_new], axis=1)
    df = pandas.concat([df, new_examples_df], axis=0)
    return df
def _oversample(X, y, method='SMOTE', strat='not majority'):
    """Oversample (X, y) with the chosen imbalanced-learn strategy.

    Args:
        X: feature matrix.
        y: 1-D label array (must support ``.tolist()``).
        method: one of 'ADASYN', 'SMOTE', 'SMOTENC', 'BORDERSMOTE',
            'SVMSMOTE', 'KMEANSSMOTE', 'RNDM'.  Automatically downgraded
            to 'RNDM' when the rarest class has <= 5 samples, since the
            SMOTE family needs enough minority neighbours.
        strat: ``sampling_strategy`` forwarded to the sampler.

    Returns:
        Tuple ``(X_resampled, y_resampled)``.

    Raises:
        ValueError: for an unsupported ``method`` name (the original code
            died with a NameError on ``ios`` instead).
    """
    # Smallest per-class count, computed over a single materialized list.
    # BUG FIX: the original re-ran y.tolist() and .count() inside a loop,
    # which is quadratic in the number of samples.
    labels = y.tolist()
    min_samples = min((labels.count(l) for l in set(labels)), default=0)
    if min_samples <= 5:
        method = 'RNDM'
    # Dispatch table instead of an if/elif chain.
    samplers = {
        'ADASYN': imbover.ADASYN,
        'SMOTE': imbover.SMOTE,
        # NOTE(review): SMOTENC also requires categorical_features; this
        # call will raise TypeError as written — confirm intended.
        'SMOTENC': imbover.SMOTENC,
        'BORDERSMOTE': imbover.BorderlineSMOTE,
        'SVMSMOTE': imbover.SVMSMOTE,
        'KMEANSSMOTE': imbover.KMeansSMOTE,
        'RNDM': imbover.RandomOverSampler,
    }
    if method not in samplers:
        raise ValueError('unknown oversampling method: {!r}'.format(method))
    ios = samplers[method](sampling_strategy=strat, random_state=42)
    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
def imbalance_set(X, y, operation):
    """Rebalance ``(X, y)`` with the sampler named by ``operation``.

    Supported names: 'smoteen', 'smotetom', 'adasyn', 'randomunder',
    'condensed'.  Raises KeyError for anything else.

    Returns:
        Tuple ``(X_resampled, y_resampled)``.
    """
    sampler_by_name = {
        'smoteen': imb.SMOTEENN(),
        'smotetom': imb.SMOTETomek(),
        'adasyn': imbov.ADASYN(),
        'randomunder': imbun.RandomUnderSampler(),
        'condensed': imbun.CondensedNearestNeighbour(n_jobs=-1),
    }
    chosen = sampler_by_name[str(operation)]
    return chosen.fit_sample(X, y)
def __init__(self, inputs, targets, batch_size=100, max_num_batches=-1,
             shuffle_order=True, rng=None, oversample=None):
    """Create a new recognition data provider object.

    Args:
        inputs (ndarray): Array of data input features of shape
            (num_data, input_dim).
        targets (ndarray): Array of data output targets of shape
            (num_data, output_dim) or (num_data,) if output_dim == 1.
        batch_size (int): Number of data points to include in each batch.
        max_num_batches (int): Maximum number of batches to iterate over
            in an epoch. If `max_num_batches * batch_size > num_data` then
            only as many batches as the data can be split into will be
            used. If set to -1 all of the data will be used.
        shuffle_order (bool): Whether to randomly permute the order of
            the data before each epoch.
        rng (RandomState): A seeded random number generator.
        oversample (str or None): Case-insensitive name of the
            oversampling method ("smote", "smote-svm",
            "smote-borderline-1", "smote-borderline-2", "adasyn"),
            or None to skip oversampling entirely.

    Raises:
        NotImplementedError: for "smote-cat" (needs a way to specify the
            categorical feature columns).
        Exception: for any other unrecognized method name.
    """
    if oversample is not None:
        oversample = oversample.lower()
    self.initialize_seed(rng)
    # BUG FIX: the default oversample=None previously fell through to the
    # final else branch and raised "Unrecognized oversampling method:
    # None"; None now means "no oversampling".
    oversampler = None
    if oversample is None:
        pass
    elif oversample == "smote":
        oversampler = imbl.SMOTE(random_state=self.rng)
    elif oversample == "smote-cat":
        # Need method for specifying categorical attributes, e.g.,
        # imbl.SMOTENC(random_state=self.rng,
        #              categorical_features=range(4200, 4348))
        raise NotImplementedError
    elif oversample == "smote-svm":
        oversampler = imbl.SVMSMOTE(random_state=self.rng)
    elif oversample == "smote-borderline-1":
        oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                           kind='borderline-1')
    elif oversample == "smote-borderline-2":
        oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                           kind='borderline-2')
    elif oversample == "adasyn":
        oversampler = imbl.ADASYN(random_state=self.rng)
    else:
        raise Exception(
            "Unrecognized oversampling method: {0}".format(oversample))
    if oversampler is not None:
        inputs, targets = oversampler.fit_resample(inputs, targets)
    self.num_classes = 3
    inputs = inputs.astype(np.float32)
    # pass the loaded data to the parent class __init__
    super(RecognitionDataProvider, self).__init__(inputs, targets,
                                                  batch_size,
                                                  max_num_batches,
                                                  shuffle_order, rng)
def split_data(Xdata, Ydata, oversample, K_neighbors=4):
    """70/30 train-test split, optionally ADASYN-oversampling the train set.

    Args:
        Xdata: feature matrix.
        Ydata: label vector.
        oversample: truthy -> oversample only the training portion with
            ADASYN (resampling happens *after* splitting, so no synthetic
            points leak into the test set).
        K_neighbors (int): ADASYN n_neighbors.

    Returns:
        X_train, X_test, y_train, y_test
    """
    # The split itself was identical in both branches of the original, so
    # do it once.  BUG FIX: the original compared `oversample == True` /
    # `== False`, which left all four variables undefined (NameError) for
    # any other truthy/falsy value.
    X_train, X_test, y_train, y_test = train_test_split(
        Xdata, Ydata, train_size=0.70, random_state=RANDOM_STATE)
    if oversample:
        print('Data was oversampled using the ADASYN method')
        # Renamed from the misleading `smote` — this sampler is ADASYN.
        ada = over_sampling.ADASYN(random_state=RANDOM_STATE,
                                   n_neighbors=K_neighbors)
        # Oversample the train split only.
        X_train, y_train = ada.fit_sample(X_train, y_train)
    return X_train, X_test, y_train, y_test
def init(bsize):
    """Build train/test DataLoaders for the GermanCredit data set.

    Loads the .npz archive, balances the classes with ADASYN, shuffles,
    one-hot encodes the labels (classes are 1/2), and wraps roughly the
    first 77% of the rows as the training loader and the rest as the test
    loader, both batched at ``bsize``.
    """
    data, label = load("GermanCredit.npz")
    # Oversample to balance the two classes.
    sampler = over_sampling.ADASYN()
    data, label = sampler.fit_sample(data, label)
    # Shuffle rows (features and labels in lockstep).
    order = list(range(len(data)))
    random.shuffle(order)
    data, label = data[order], label[order]
    # One-hot encode: label values are 1/2, mapped to columns 0/1.
    onehot = np.zeros(shape=(len(label), 2))
    for row, cls in enumerate(label):
        onehot[row][int(cls - 1)] = 1
    data = data.astype("float32")
    onehot = onehot.astype("float32")
    # len/1.3 ~ 77% of the rows go to training.
    train_sum = int(len(data) / 1.3)
    train_x, train_y = data[:train_sum], onehot[:train_sum]
    test_x, test_y = data[train_sum:], onehot[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(train_x), nd.array(train_y))
    test_set = mxdata.ArrayDataset(nd.array(test_x), nd.array(test_y))
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)
    return train_loader, test_loader
# Script fragment: grid-search logistic regression under three oversampling
# strategies.  NOTE(review): this chunk is truncated — the trailing inner
# `try:` has no except/finally here, and the outer try's handler is below
# the visible region.
gender = df0.gender
# Scale the features of df1; gender is the prediction target.
nd1 = preprocessing.scale(df1.values)
logger.info(f"Data loaded")
jn = pushbulletNotifier.JobNotification(devices="phone")
processes = 25
try:
    # Stratified 80/20 split on the gender labels.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        nd1, gender.values, test_size=0.2, stratify=gender.values)
    logger.info(f"Split data in to training set and validation set.")
    classifier = ['logisticregression',
                  linear_model.LogisticRegression(max_iter=250)]
    # NOTE(review): 'random¬oversampler' contains a stray '¬' character —
    # looks like a mis-encoded hyphen; confirm before fixing (it is a
    # runtime string used as a pipeline name).
    sampler_lst = [['smote', over_sampling.SMOTE()],
                   ['adasyn', over_sampling.ADASYN()],
                   ['random¬oversampler', over_sampling.RandomOverSampler()]]
    # One imblearn pipeline per sampler: resample, then classify.
    pipeline_lst = [
        [f'{sampler[0]}-{classifier[0]}',
         make_pipeline(sampler[1], classifier[1])]
        for sampler in sampler_lst
    ]  # noqa
    # C grid: 15 log-spaced values between 2^-8 and 2^5.
    param_grid = {
        'logisticregression__C': 2.0**np.linspace(-8, 5, 15)
    }  # noqa
    for name, pipe in pipeline_lst:
        # NOTE(review): "Starding" is a typo for "Starting" in a runtime
        # message — left as-is in this documentation-only pass.
        jn.send(message=f"Starding cross validation with resampling method {name}")
        logger.info(f"Starting cross validation")
        est = model_selection.GridSearchCV(pipe, param_grid,
                                           scoring='roc_auc', cv=5,
                                           verbose=49, refit=True,
                                           n_jobs=processes,
                                           pre_dispatch=processes,
                                           return_train_score=True)
        est.fit(X_train, y_train)
        # Probability of the positive class for the held-out set.
        _, yhat = est.predict_proba(X_test).T
        try:
            logger.info(f"Cross validation done, best score was {est.best_score_}")
# Script fragment: build six resampled variants of (X_train, y_train) per
# iteration, plus the untouched baseline and an MNDO-augmented set
# (X_mndo/y_mndo come from above this chunk).
for i in tqdm(range(100), desc="Preprocessing", leave=False):
    # Apply over-sampling
    # NOTE(review): SMOTE's kind= argument and fit_sample() belong to
    # imbalanced-learn < 0.6 (newer versions use BorderlineSMOTE and
    # fit_resample) — confirm the pinned version.
    sm_reg = over_sampling.SMOTE(kind='regular', random_state=RANDOM_STATE,
                                 k_neighbors=5)
    sm_b1 = over_sampling.SMOTE(kind='borderline1',
                                random_state=RANDOM_STATE, k_neighbors=5)
    sm_b2 = over_sampling.SMOTE(kind='borderline2',
                                random_state=RANDOM_STATE, k_neighbors=5)
    sm_enn = combine.SMOTEENN(random_state=RANDOM_STATE,
                              smote=over_sampling.SMOTE(k_neighbors=5))
    sm_tomek = combine.SMOTETomek(random_state=RANDOM_STATE,
                                  smote=over_sampling.SMOTE(k_neighbors=5))
    ada = over_sampling.ADASYN(random_state=RANDOM_STATE, n_neighbors=5)
    X_reg, y_reg = sm_reg.fit_sample(X_train, y_train)
    X_b1, y_b1 = sm_b1.fit_sample(X_train, y_train)
    X_b2, y_b2 = sm_b2.fit_sample(X_train, y_train)
    X_enn, y_enn = sm_enn.fit_sample(X_train, y_train)
    X_tomek, y_tomek = sm_tomek.fit_sample(X_train, y_train)
    X_ada, y_ada = ada.fit_sample(X_train, y_train)
    # One [X, y] pair per variant; index 0 is the unresampled baseline.
    os_list = [[X_train, y_train], [X_reg, y_reg], [X_b1, y_b1],
               [X_b2, y_b2], [X_enn, y_enn], [X_tomek, y_tomek],
               [X_ada, y_ada], [X_mndo, y_mndo]]
    # scaling
    # NOTE(review): with RANDOM_STATE fixed, all 100 iterations recompute
    # identical resamplings and overwrite os_list — presumably the loop
    # body continues below this chunk with per-iteration work; confirm.
    os_list, X_test_scaled = preprocessing.normalization(os_list, X_test)
    #os_list, X_test_scaled = preprocessing.standardization(os_list, X_test)
from student import egitimGirdi, egitimCikti, valGirdi, valCikti

print(egitimGirdi.shape)

# --- Synthetic data generation: three oversampled copies of the training
# --- inputs/targets (random oversampling, SMOTE, ADASYN).
ros = over_sampling.RandomOverSampler()
rosEgitimGirdi, rosEgitimCikti = ros.fit_sample(egitimGirdi, egitimCikti)
print(rosEgitimGirdi.shape)

smote = over_sampling.SMOTE()
smoteEgitimGirdi, smoteEgitimCikti = smote.fit_sample(egitimGirdi, egitimCikti)
print(smoteEgitimGirdi.shape)

ada = over_sampling.ADASYN(ratio='minority')
adasynEgitimGirdi, adasynEgitimCikti = ada.fit_sample(egitimGirdi, egitimCikti)
print(adasynEgitimGirdi.shape)

# --- Candidate models to evaluate on the resampled data.
models = [
    ("LR", LogisticRegression()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("KNN", KNeighborsClassifier()),
    ("DCT", DecisionTreeClassifier()),
    ("GNB", GaussianNB()),
    ("SVC", SVC()),
    ("GPC", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("MLP", MLPClassifier()),
]
def oversampling_adasyn(features, labels):
    """Resample ``(features, labels)`` with ADASYN at a fixed random seed.

    Returns the ``(X_resampled, y_resampled)`` pair produced by
    ``fit_resample``.
    """
    sampler = over_sampling.ADASYN(random_state=0)
    resampled = sampler.fit_resample(X=features, y=labels)
    return resampled
# Preprocessing #----------------- # Multivariate over-sampling mndo_df = mndo(pos, num_minority, file_name) X_mndo, y_mndo = append_mndo(X_train, y_train, mndo_df) #print('y_mndo: {}'.format(Counter(y_mndo))) for i in tqdm(range(100), desc="Preprocessing", leave=False): # Apply over-sampling sm_reg = over_sampling.SMOTE(kind='regular', random_state=RANDOM_STATE) sm_b1 = over_sampling.SMOTE(kind='borderline1', random_state=RANDOM_STATE) sm_b2 = over_sampling.SMOTE(kind='borderline2', random_state=RANDOM_STATE) sm_enn = combine.SMOTEENN(random_state=RANDOM_STATE) sm_tomek = combine.SMOTETomek(random_state=RANDOM_STATE) ada = over_sampling.ADASYN(random_state=RANDOM_STATE) X_reg, y_reg = sm_reg.fit_sample(X_train, y_train) X_b1, y_b1 = sm_b1.fit_sample(X_train, y_train) X_b2, y_b2 = sm_b2.fit_sample(X_train, y_train) X_enn, y_enn = sm_enn.fit_sample(X_train, y_train) X_tomek, y_tomek = sm_tomek.fit_sample(X_train, y_train) X_ada, y_ada = ada.fit_sample(X_train, y_train) os_list = [[X_reg, y_reg], [X_b1, y_b1], [X_b2, y_b2], [X_enn, y_enn], [X_tomek, y_tomek], [X_ada, y_ada], [X_mndo, y_mndo]] # scaling os_list, X_test_scaled = preprocessing.normalization(os_list, X_test) #os_list, X_test_scaled = preprocessing.standardization(os_list, X_test) #-------------
train_nan.loc[i, fill_col] = train.loc[id_, fill_col] #test for i in test_nan.index: fill_col, id_ = impute(i, train, test_nan) test_nan.loc[i, fill_col] = train.loc[id_, fill_col] train = pd.concat([train, train_nan], axis=0) del train_nan #test test = pd.concat([test, test_nan], axis=0) del test_nan y = train['renewal'] x = train.drop('renewal', axis=1) ros = over_sampling.ADASYN() rus = under_sampling.NearMiss() rcs = combine.SMOTEENN() rcs2 = combine.SMOTETomek() log = BaggingClassifier(LogisticRegressionCV(Cs=6)) rf = BaggingClassifier(RandomForestClassifier()) gbc = BaggingClassifier( GradientBoostingClassifier(n_estimators=250, learning_rate=0.01)) sv = SVC(C=0.8, probability=True) for sample, sample_name in zip([rcs2, ros, rus, rcs, rcs2], ['rcs2', 'ros', 'rus', 'rcs']): print(sample_name) x_rs, y_rs = sample.fit_sample(x, y) for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']): model.fit(x_rs, y_rs)
# NOTE(review): fragment — the line below is the tail (closing arguments)
# of a call whose opening parenthesis is above this chunk.  Python 2
# `print` statements throughout this fragment.
            metod, name, train_result, test_result)
print train_x.shape
# Three oversampled variants of the training data: random oversampling,
# SMOTE, and ADASYN.
ROS = over_sampling.RandomOverSampler()
ROS_x, ROS_y = ROS.fit_sample(train_x, train_y)
print ROS_x.shape
smote = over_sampling.SMOTE()
smote_x, smote_y = smote.fit_sample(train_x, train_y)
print smote_x.shape
adasyn = over_sampling.ADASYN()
adasyn_x, adasyn_y = adasyn.fit_sample(train_x, train_y)
print adasyn_x.shape
# Candidate classifiers to compare on the resampled sets.
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("DCT", DecisionTreeClassifier()))
models.append(("GNB", GaussianNB()))
models.append(("SVC", SVC()))
models.append(("GPC", GaussianProcessClassifier(1.0 * RBF(1.0))))
models.append(("MLP", MLPClassifier()))
models.append(("ADB", AdaBoostClassifier()))
def resample_classes(X,
                     Y,
                     how='und1',
                     random_state=None,
                     test_size=0.3,
                     n_jobs=2,
                     split=True,
                     verbose=True):
    """Resample (X, Y) with the imblearn strategy selected by ``how``.

    ``how`` values: 'und1'/'und2'/'und3' under-sampling, 'over1'/'over2'/
    'over3' over-sampling, 'comb1' combined (branch continues below this
    chunk).  Python 2 code (print statements).

    NOTE(review): this definition is truncated here — the 'comb1' branch
    body, the final else, and the train/test split live below this chunk.
    """
    if how == 'und1':
        if verbose:
            msg = 'Under-sampling the majority class(es) by randomly picking '
            msg += 'samples without replacement'
            print msg
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und2':
        if verbose:
            msg = 'Under-sampling by generating centroids based on clustering '
            msg += 'methods'
            print msg
        samp = imbus.ClusterCentroids(ratio='auto',
                                      random_state=random_state,
                                      estimator=None,
                                      n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und3':
        if verbose:
            print 'Under-sampling based on NearMiss methods'
        # NOTE(review): size_ngh/ver3_samp_ngh and their *_neighbors
        # replacements are both passed — consistent with a transitional
        # imblearn release; confirm the pinned version.
        samp = imbus.NearMiss(ratio='auto',
                              return_indices=False,
                              random_state=random_state,
                              version=1,
                              size_ngh=None,
                              n_neighbors=3,
                              ver3_samp_ngh=None,
                              n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over1':
        if verbose:
            msg = 'Over-sampling the minority class(es) by picking samples at '
            msg += 'random with replacement'
            # NOTE(review): bare `print` — the msg built above is never
            # printed (prints a blank line instead); looks like a bug,
            # left untouched in this documentation-only pass.
            print
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over2':
        if verbose:
            # NOTE(review): "Over-sapmling" typo is in a runtime string;
            # left as-is here.
            msg = 'Over-sapmling using SMOTE - Synthetic Minority Over-sampling '
            msg += 'Technique'
            print msg
        # Apply SMOTE three times in a row at ratio .99.
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.SMOTE(random_state=random_state,
                               ratio=.99,
                               k=None,
                               k_neighbors=5,
                               m=None,
                               m_neighbors=10,
                               out_step=0.5,
                               kind='regular',
                               svm_estimator=None,
                               n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling '
            msg += 'Approach for Imbalanced Learning'
            print msg
        # Apply ADASYN three times in a row at ratio .93.
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.ADASYN(ratio=.93,
                                random_state=random_state,
                                k=None,
                                n_neighbors=5,
                                n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print 'Combine over- and under-sampling using SMOTE and Tomek links.'
# NOTE(review): fragment — continuation of a resampling helper's elif
# chain (the 'comb1' SMOTETomek branch body onwards); the indentation
# below reflects the inferred nesting inside that Python 2 function.
        # Apply SMOTETomek three times in a row at ratio .99.
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbcom.SMOTETomek(ratio=.99,
                                     random_state=random_state,
                                     smote=None,
                                     tomek=None,
                                     k=None,
                                     m=None,
                                     out_step=None,
                                     kind_smote=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    else:
        # Unknown `how`: report and return None implicitly.
        print 'Sampling approach not recognized'
        return
    if verbose:
        # NOTE(review): the count reports below index val_y[0..3] — they
        # assume exactly 4 classes and value_counts(sort=False) ordering;
        # confirm against the data.
        print '\t\t\t1\t2\t3\t4'
        val_y = pd.Series(Y).value_counts(sort=False).values
        msg = 'Counts in y_init:\t{}\t{}\t{}\t{} '
        print msg.format(val_y[0], val_y[1], val_y[2], val_y[3])
        val_yres = pd.Series(y_res).value_counts(sort=False).values
        msg = 'Counts in y_resamp:\t{}\t{}\t{}\t{} '
        print msg.format(val_yres[0], val_yres[1], val_yres[2], val_yres[3])
    if split:
        # Optionally split the resampled data before returning.
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=test_size, random_state=random_state)
        if verbose:
            val_ytr = pd.Series(y_train).value_counts(sort=False).values
            msg = 'Counts in y_train:\t{}\t{}\t{}\t{} '
            print msg.format(val_ytr[0], val_ytr[1], val_ytr[2], val_ytr[3])
            val_yte = pd.Series(y_test).value_counts(sort=False).values
            msg = 'Counts in y_test:\t{}\t{}\t{}\t{} '
            print msg.format(val_yte[0], val_yte[1], val_yte[2], val_yte[3])
            print 'X_train:', X_train.shape, ', X_test:', X_test.shape
        return X_train, X_test, y_train, y_test
    else:
        return X_res, y_res