def under_sample_train(x_train, y_train, random=False, seed=666):
    """Balance the training set by under-sampling the majority class.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    random : bool, optional
        If True, drop majority samples uniformly at random; otherwise
        use the NearMiss (version 2) heuristic.
    seed : int, optional
        Random seed for the random under-sampler (NearMiss itself is
        deterministic and takes no seed).

    Returns
    -------
    tuple
        The resampled ``(x_train, y_train)``.
    """
    if random:
        sampler = under_sampling.RandomUnderSampler(random_state=seed)
    else:
        # NearMiss is deterministic; recent imbalanced-learn releases
        # removed its `random_state` argument, so passing the seed here
        # raised a TypeError. The seed is simply not needed.
        sampler = under_sampling.NearMiss(version=2, n_jobs=-1)
    # Single resampling call shared by both branches.
    return sampler.fit_resample(x_train, y_train)
def f_NearMiss(X_train, y_train, seed):
    """Under-sample the majority class with NearMiss (version 3).

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    seed : int
        Kept for backward compatibility with existing callers; NearMiss
        is deterministic, so the seed is not used.

    Returns
    -------
    tuple
        The resampled ``(X_train, y_train)``.
    """
    # `return_indices`, `random_state` and `fit_sample` were deprecated
    # and then removed from imbalanced-learn; the resampling indices were
    # never returned by this function anyway, so the current two-value
    # `fit_resample` API is a drop-in replacement.
    nm = us.NearMiss(version=3, n_neighbors=10)
    X_res, y_res = nm.fit_resample(X_train, y_train)
    return (X_res, y_res)
def init(bsize):
    """Load the Kaggle data set, balance it, and build MXNet data loaders.

    Parameters
    ----------
    bsize : int
        Batch size used for both the training and the test loader.

    Returns
    -------
    tuple
        ``(train_loader, test_loader)`` MXNet ``DataLoader`` pair.
    """
    data, label = load("Kaggle.npz")
    # Convert to spherical coordinates (disabled experiment):
    # norm = np.sqrt(np.sum(data**2, axis=1, keepdims=True))
    # ag = data / norm
    # data = np.concatenate([data, norm, ag], axis=1)
    # Use sin/cos features (disabled experiment):
    # data = np.concatenate([np.sin(data), np.cos(data)], axis=1)
    # Under-sample to obtain a balanced data set.
    cr = under_sampling.NearMiss()
    # `fit_sample` was removed from imbalanced-learn; `fit_resample` is
    # the supported name for the same operation.
    data, label = cr.fit_resample(data, label)
    # Over-sampling alternative (disabled experiment):
    # ocr = over_sampling.ADASYN()
    # data, label = ocr.fit_resample(data, label)
    # Shuffle the samples.
    idx = list(range(len(data)))
    random.shuffle(idx)
    data, label = data[idx], label[idx]
    # One-hot encode the labels (labels are 1-based: 1 or 2).
    olabel = np.zeros(shape=(len(label), 2))
    for i, l in enumerate(label):
        olabel[i][int(l - 1)] = 1
    # Type conversion for the framework.
    data = data.astype("float32")
    olabel = olabel.astype("float32")
    # Train/test split (~77% of the samples go to training).
    train_sum = int(len(data) / 1.3)
    tdata, tlabel = data[:train_sum], olabel[:train_sum]
    test_data, test_label = data[train_sum:], olabel[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(tdata), nd.array(tlabel))
    test_set = mxdata.ArrayDataset(nd.array(test_data), nd.array(test_label))
    # Data loaders.
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)
    return train_loader, test_loader
dce_features = ['ese-dce'] # Define the extension of each features ext_features = ['_ese__dce.npy'] # Define the path of the ground for the prostate path_gt = ['GT_inv/prostate', 'GT_inv/pz', 'GT_inv/cg', 'GT_inv/cap'] # Define the label of the ground-truth which will be provided label_gt = ['prostate', 'pz', 'cg', 'cap'] # Define the path where to store the data path_store = '/data/prostate/balanced/mp-mri-prostate/exp-3' N_JOBS = -1 # Create the under_samplers and over_samplers list to use samplers = [ under_sampling.InstanceHardnessThreshold(n_jobs=N_JOBS, estimator='random-forest'), under_sampling.NearMiss(version=1, n_jobs=N_JOBS), under_sampling.NearMiss(version=2, n_jobs=N_JOBS), under_sampling.NearMiss(version=3, n_jobs=N_JOBS), under_sampling.RandomUnderSampler(), over_sampling.SMOTE(kind='regular', n_jobs=N_JOBS), over_sampling.SMOTE(kind='borderline1', n_jobs=N_JOBS), over_sampling.SMOTE(kind='borderline2', n_jobs=N_JOBS), over_sampling.RandomOverSampler() ] # Define the sub-folder to use sub_folder = [ 'iht', 'nm1', 'nm2', 'nm3', 'rus', 'smote', 'smote-b1', 'smote-b2', 'ros' ] # Generate the different path to be later treated path_patients_list_gt = []
#test for i in test_nan.index: fill_col, id_ = impute(i, train, test_nan) test_nan.loc[i, fill_col] = train.loc[id_, fill_col] train = pd.concat([train, train_nan], axis=0) del train_nan #test test = pd.concat([test, test_nan], axis=0) del test_nan y = train['renewal'] x = train.drop('renewal', axis=1) ros = over_sampling.ADASYN() rus = under_sampling.NearMiss() rcs = combine.SMOTEENN() rcs2 = combine.SMOTETomek() log = BaggingClassifier(LogisticRegressionCV(Cs=6)) rf = BaggingClassifier(RandomForestClassifier()) gbc = BaggingClassifier( GradientBoostingClassifier(n_estimators=250, learning_rate=0.01)) sv = SVC(C=0.8, probability=True) for sample, sample_name in zip([rcs2, ros, rus, rcs, rcs2], ['rcs2', 'ros', 'rus', 'rcs']): print(sample_name) x_rs, y_rs = sample.fit_sample(x, y) for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']): model.fit(x_rs, y_rs) filename = 'C:/Users/cheekati/Desktop/ml/AV Mck/' + str(
def resample_classes(X, Y, how='und1', random_state=None, test_size=0.3, n_jobs=2, split=True, verbose=True): """ """ if how == 'und1': if verbose: msg = 'Under-sampling the majority class(es) by randomly picking ' msg += 'samples without replacement' print msg samp = imbus.RandomUnderSampler(random_state=random_state, replacement=False) X_res, y_res = samp.fit_sample(X, Y) elif how == 'und2': if verbose: msg = 'Under-sampling by generating centroids based on clustering ' msg += 'methods' print msg samp = imbus.ClusterCentroids(ratio='auto', random_state=random_state, estimator=None, n_jobs=n_jobs) X_res, y_res = samp.fit_sample(X, Y) elif how == 'und3': if verbose: print 'Under-sampling based on NearMiss methods' samp = imbus.NearMiss(ratio='auto', return_indices=False, random_state=random_state, version=1, size_ngh=None, n_neighbors=3, ver3_samp_ngh=None, n_neighbors_ver3=3, n_jobs=n_jobs) X_res, y_res = samp.fit_sample(X, Y) elif how == 'over1': if verbose: msg = 'Over-sampling the minority class(es) by picking samples at ' msg += 'random with replacement' print samp = imbov.RandomOverSampler(random_state=random_state) X_res, y_res = samp.fit_sample(X, Y) elif how == 'over2': if verbose: msg = 'Over-sapmling using SMOTE - Synthetic Minority Over-sampling ' msg += 'Technique' print msg X_res, y_res = X, Y for i in range(3): samp = imbov.SMOTE(random_state=random_state, ratio=.99, k=None, k_neighbors=5, m=None, m_neighbors=10, out_step=0.5, kind='regular', svm_estimator=None, n_jobs=n_jobs) X_res, y_res = samp.fit_sample(X_res, y_res) elif how == 'over3': if verbose: msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling ' msg += 'Approach for Imbalanced Learning' print msg X_res, y_res = X, Y for i in range(3): samp = imbov.ADASYN(ratio=.93, random_state=random_state, k=None, n_neighbors=5, n_jobs=n_jobs) X_res, y_res = samp.fit_sample(X_res, y_res) elif how == 'comb1': if verbose: print 'Combine over- and under-sampling using SMOTE and Tomek links.' 
X_res, y_res = X, Y for i in range(3): samp = imbcom.SMOTETomek(ratio=.99, random_state=random_state, smote=None, tomek=None, k=None, m=None, out_step=None, kind_smote=None, n_jobs=n_jobs) X_res, y_res = samp.fit_sample(X_res, y_res) else: print 'Sampling approach not recognized' return if verbose: print '\t\t\t1\t2\t3\t4' val_y = pd.Series(Y).value_counts(sort=False).values msg = 'Counts in y_init:\t{}\t{}\t{}\t{} ' print msg.format(val_y[0], val_y[1], val_y[2], val_y[3]) val_yres = pd.Series(y_res).value_counts(sort=False).values msg = 'Counts in y_resamp:\t{}\t{}\t{}\t{} ' print msg.format(val_yres[0], val_yres[1], val_yres[2], val_yres[3]) if split: X_train, X_test, y_train, y_test = train_test_split( X_res, y_res, test_size=test_size, random_state=random_state) if verbose: val_ytr = pd.Series(y_train).value_counts(sort=False).values msg = 'Counts in y_train:\t{}\t{}\t{}\t{} ' print msg.format(val_ytr[0], val_ytr[1], val_ytr[2], val_ytr[3]) val_yte = pd.Series(y_test).value_counts(sort=False).values msg = 'Counts in y_test:\t{}\t{}\t{}\t{} ' print msg.format(val_yte[0], val_yte[1], val_yte[2], val_yte[3]) print 'X_train:', X_train.shape, ', X_test:', X_test.shape return X_train, X_test, y_train, y_test else: return X_res, y_res