def main():
    """Demonstrate ADASYN oversampling on two partially overlapping clusters.

    Creates a synthetic 2-class dataset, plots it, generates synthetic
    minority-class samples with ADASYN, plots the augmented data, and merges
    the synthetic samples back into the dataset.
    """
    # Create 2 artificial clusters that partially overlap.
    X, y = createCluster()

    # Plot the clusters.  The color cycle is tiled so there are enough
    # entries to index directly with the integer class labels.
    colors = np.array([c for c in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.show()

    # Get the minority (ms) and majority (ml) class counts.
    ms, ml = ADASYN.getClassCount(X, y)
    d = ADASYN.getd(X, y, ms, ml)  # degree of imbalance; computed but unused below
    G = ADASYN.getG(X, y, ms, ml, 1)

    # r values: how many synthetic samples will be made per minority data point.
    rlist = ADASYN.getRis(X, y, 0, 5)

    # Generate the synthetic data.
    newX, newy = ADASYN.generateSamples(rlist, X, y, G, 0, 5)

    # Plot the dataset again, with the synthetic samples in red.
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.scatter(np.array(newX)[:, 0], np.array(newX)[:, 1], color='red', s=10)
    pl.show()

    X, y = ADASYN.joinwithmajorityClass(X, y, newX, newy, 1)
    # Was a Python 2 `print` statement (a SyntaxError on Python 3);
    # parenthesized form prints identically on both interpreters.
    print('test')
def main():
    """Demonstrate ADASYN oversampling on two partially overlapping clusters.

    NOTE(review): this is a duplicate definition of ``main`` — it shadows the
    earlier one at import time; presumably two script versions were pasted
    into one file.  Verify which one is intended and delete the other.
    """
    # Create 2 artificial clusters that partially overlap.
    X, y = createCluster()

    # Plot the clusters; tile the color cycle so class labels can index it.
    colors = np.array([c for c in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.show()

    # Minority (ms) and majority (ml) class counts.
    ms, ml = ADASYN.getClassCount(X, y)
    d = ADASYN.getd(X, y, ms, ml)  # degree of imbalance; computed but unused below
    G = ADASYN.getG(X, y, ms, ml, 1)

    # r values: how many synthetic samples to make per minority data point.
    rlist = ADASYN.getRis(X, y, 0, 5)

    # Generate the synthetic data.
    newX, newy = ADASYN.generateSamples(rlist, X, y, G, 0, 5)

    # Plot the dataset again, highlighting the synthetic samples in red.
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.scatter(np.array(newX)[:, 0], np.array(newX)[:, 1], color='red', s=10)
    pl.show()

    X, y = ADASYN.joinwithmajorityClass(X, y, newX, newy, 1)
    # Was a Python 2 `print` statement (a SyntaxError on Python 3);
    # parenthesized form prints identically on both interpreters.
    print('test')
def templet(sampler_name, sample_ratio):
    """Template method: evaluate one oversampler at one sampling ratio.

    Runs 10-fold stratified cross-validation on the 'satimage' dataset:
    min-max scales each fold (fit on the training split only), oversamples
    the training split with the requested sampler, fits a CART decision
    tree, and records precision / recall / F1 / AUC / G-mean per fold via
    the module-level ``fill_dic``.

    :param sampler_name: sampler algorithm name — one of 'CART', 'SMOTE',
        'Border1', 'Border2', 'ADASYN', 'Safe-level'
    :param sample_ratio: sampling ratio forwarded to the sampler
    :raises ValueError: if ``sampler_name`` is not recognized.  (The
        original code silently left the sampler as ``None`` and crashed
        later with an opaque ``AttributeError``.)
    """
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target

    # Start time, for the progress report printed at the end.
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # Preprocess: scale with statistics from the training fold only,
        # to avoid leaking test-fold information.
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])

        # Select the sampler.  'CART' means no resampling (DummySampler).
        if sampler_name == 'CART':
            sb = DummySampler()
        elif sampler_name == 'SMOTE':
            sb = SMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Border1':
            sb = BorderSMOTE(N=sample_ratio, m_neighbors=9, k_neighbors=5,
                             random_state=42, kind='borderline1')
        elif sampler_name == 'Border2':
            sb = BorderSMOTE(N=sample_ratio, m_neighbors=9, k_neighbors=5,
                             random_state=42, kind='borderline2')
        elif sampler_name == 'ADASYN':
            # 'bata' is the parameter name in this project's ADASYN API.
            sb = ADASYN(bata=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Safe-level':
            sb = SafeLevelSMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        else:
            # Fail fast instead of crashing later on `None.fit_sample`.
            raise ValueError('unknown sampler name: %r' % sampler_name)

        # Resample the training fold.
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        # Guard the harmonic mean against 0/0 when no positives are predicted.
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)

        # Accumulate per-fold metrics into the module-level results dict.
        fill_dic('precision', sampler_name, sample_ratio, precision)
        fill_dic('recall', sampler_name, sample_ratio, recall)
        fill_dic('f1', sampler_name, sample_ratio, f1)
        fill_dic('auc', sampler_name, sample_ratio, auc)
        fill_dic('gmean', sampler_name, sample_ratio, gmean)

    print('%s %.1f building id transforming took %fs!'
          % (sampler_name, sample_ratio, time.time() - start_time))
n = 10 # repeat the CV procedure 10 times to get more precise results #=============================================================================== for i in range(n): print "======================================= CROSS VALIDATION LOOP: ", (i+1) # for each iteration, randomly hold out 20% of the data as CV set X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( X, y, test_size=.20, random_state=i*25) print "training size: ", X_train.shape print "label size: ", y_train.shape print "the ADASYN goodie..." y_train = list(np.array(y_train).reshape(-1,)) ms,ml = ADASYN.getClassCount(X_train,y_train) d = ADASYN.getd(X_train,y_train,ms,ml) G = ADASYN.getG(X_train,y_train,ms,ml,1) # Get the list of r values, which indicate how many samples will be made per data point in the minority dataset rlist = ADASYN.getRis(X_train,y_train,0,2) # Generate the synthetic data newX,newy = ADASYN.generateSamples(rlist,X_train,y_train,G,0,2) X_train,y_train = ADASYN.joinwithmajorityClass(X_train,y_train,newX,newy,1) print "new training size: ", X_train.shape print "new label size: ", y_train.shape
def test():
    """Compare plain CART against five oversampling methods on 'satimage'.

    For each method, runs 10-fold stratified cross-validation, oversamples
    the training fold (except for plain CART), fits a CART decision tree,
    accumulates precision / recall / F1 / AUC / G-mean per fold, and finally
    prints a table with the mean of each metric per method.
    """
    sampler_names = ['CART', 'SMOTE', 'Border1', 'Border2', 'ADASYN', 'Safe-level']
    # dic[metric][sampler] -> list of per-fold scores.
    dic = {m: {s: [] for s in sampler_names}
           for m in ('recall', 'precision', 'f1', 'auc', 'gmean')}
    results = prettytable.PrettyTable(
        ["Classifier", "Precision", 'Recall', 'AUC', 'F-measure', 'G-mean'])

    # Load data.
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))

    # Fixed seed so every run gives the same results.
    np.random.seed(42)

    # (dict key, progress label, sampler factory or None for plain CART).
    # A factory (not an instance) so each method gets a fresh sampler.
    configs = [
        ('CART', 'CART', None),
        ('SMOTE', 'SMOTE',
         lambda: SMOTE(N=100, k_neighbors=5, random_state=42)),
        ('Border1', 'BorderSmote1',
         lambda: BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5,
                             random_state=42, kind='borderline1')),
        ('Border2', 'BorderSmote2',
         lambda: BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5,
                             random_state=42, kind='borderline2')),
        # 'bata' is the parameter name in this project's ADASYN API.
        ('ADASYN', 'ADASYN',
         lambda: ADASYN(bata=0.1, k_neighbors=5, random_state=42)),
        ('Safe-level', 'Safe-level',
         lambda: SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42)),
    ]

    for key, label, make_sampler in configs:
        # BUGFIX: the original Safe-Level section never reset start_time, so
        # its reported elapsed time included the whole ADASYN section.
        # Resetting per method reports each section's own duration.
        start_time = time.time()
        _run_cv(X, y, key, make_sampler, dic)
        print('%s building id transforming took %fs!'
              % (label, time.time() - start_time))

    # Display the mean of each metric per method.
    for name in sampler_names:
        results.add_row([name,
                         np.mean(np.array(dic['precision'][name])),
                         np.mean(np.array(dic['recall'][name])),
                         np.mean(np.array(dic['auc'][name])),
                         np.mean(np.array(dic['f1'][name])),
                         np.mean(np.array(dic['gmean'][name]))])
    print(results)


def _run_cv(X, y, key, make_sampler, dic):
    """Run one 10-fold CV evaluation and append per-fold metrics to ``dic``.

    :param X: feature matrix
    :param y: binary labels
    :param key: dict key under which to record scores
    :param make_sampler: zero-arg factory returning a fitted-per-fold
        sampler, or None to train on the raw (scaled) training fold
    :param dic: metric-name -> sampler-name -> list-of-scores accumulator
    """
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # Scale with training-fold statistics only (no test-fold leakage).
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])

        if make_sampler is None:
            # Plain CART baseline: no resampling.
            X_res, y_res = X_train_minmax, y[train]
        else:
            X_res, y_res = make_sampler().fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        # Guard the harmonic mean against 0/0 when no positives are predicted.
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)

        dic['precision'][key].append(precision)
        dic['recall'][key].append(recall)
        dic['f1'][key].append(f1)
        dic['auc'][key].append(auc)
        dic['gmean'][key].append(gmean)