def main(filename, xtrains_percent=0.2, maxfeature=None, fit_ylabel=False,
         nn_estimator=100, sepaLabel=True, treeLabel=False, seed=42,
         pcaLabel=False, n_comp=2, sepa2=False, time_label=False, stream=False,
         sfl=False, anomaly_rate=None, max_samples=None, sample_nor=None):
    """Cross-validated IsolationForest anomaly detection on an ODDS-style .mat dataset.

    Parameters
    ----------
    filename : str
        Path to a .mat file containing arrays 'X' (samples) and 'y' (1 = anomaly).
    xtrains_percent : float
        Fraction of the data per TEST fold; roughly 1/xtrains_percent folds are run.
    maxfeature : int or None
        max_features for the forest; None means "use every column".
    fit_ylabel : bool
        If True, pass labels to clf.fit (kept False in practice).
    nn_estimator : int
        Number of trees in the forest.
    sepaLabel : bool
        True: split anomalies and normals separately so each fold keeps the
        dataset's class ratio; False: slice the mixed data in order.
    treeLabel : bool
        Quiet mode; also selects the FNR return value (see Returns).
    pcaLabel, n_comp : bool, int
        Optionally PCA-reduce the data to n_comp components before fitting.
    time_label, stream, sfl, anomaly_rate, max_samples, sample_nor
        Experiment switches: timing-only return / per-sample prediction /
        shuffling / training-contamination override / forest subsample size /
        fixed normal-sample count.
    sepa2 : unused in this variant.

    Returns
    -------
    all_time (float) when time_label; fnr (float) when treeLabel;
    otherwise the tuple (mean ROC-AUC, mean accuracy) over the folds.

    NOTE(review): this file defines `main` more than once; the last definition
    shadows this one at import time.
    """
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # http and smtp are stored as HDF5-style .mat files and need h5py.
    # BUG FIX: the original used non-raw Windows paths ('C:\Users\...'), which
    # is a SyntaxError in Python 3 because \U starts a unicode escape -- the
    # literals are now raw strings (bytes of the path unchanged).
    if filename == r'C:\Users\Riku Anegawa\Desktop/Dropbox/http.mat' \
            or filename == r'C:\Users\Riku Anegawa\Desktop/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        y = np.reshape(y3, [len(y3), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    if maxfeature is None:
        max_feat = len(X[0])  # use every feature column
    else:
        max_feat = int(maxfeature)

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat
    if max_samples is not None:
        clf.max_samples = max_samples
    else:
        clf.max_samples = 1.

    # Known contamination rate per benchmark dataset.
    if str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/shuttle.mat':
        clf.contamination = 0.07
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/http.mat':
        clf.contamination = 0.004
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/pima.mat':
        clf.contamination = 0.35
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/mammography.mat':
        clf.contamination = 0.02
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/cover.mat':
        clf.contamination = 0.009
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/breastw.mat':
        clf.contamination = 0.35
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/arrhythmia.mat':
        clf.contamination = 0.15
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/ionosphere.mat':
        clf.contamination = 0.36
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/satellite.mat':
        clf.contamination = 0.32
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/annthyroid.mat':
        clf.contamination = 0.07
    elif str(filename) == r'C:\Users\Riku Anegawa\Desktop/Dropbox/smtp.mat':
        clf.contamination = 0.03 / 100
    else:
        raise Exception("error! cannot file it.")

    # Number of CV rounds (e.g. 5 for an 80:20 split); drop the last partial
    # fold when 1/rate is not an integer.
    folds = 1 / rate
    cross_count = int(np.ceil(folds))
    if cross_count > folds:
        cross_count = cross_count - 1

    # Accumulators over the folds.
    sum_auc_roc = 0
    sum_accuracy = 0
    FPR_sum = 0
    TPR_sum = 0  # NOTE(review): never updated below; kept for parity with original
    TNR_sum = 0  # NOTE(review): never updated below; kept for parity with original
    FNR_sum = 0
    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    # ---- split the data into CV folds -------------------------------------
    if sepaLabel:
        # Separate anomalies (y == 1) from normals so every fold keeps the
        # dataset's class ratio.
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])

        # Optionally force a global contamination rate. Hard-disabled.
        # NOTE(review): with anomaly_rate_all=None this branch would raise a
        # TypeError if ever enabled; dead code preserved as found.
        zentai = False
        anomaly_rate_all = None
        if zentai:
            if anomaly_rate_all != clf.contamination:
                clf.contamination = anomaly_rate_all
            if anomaly_rate_all < clf.contamination:
                # drop anomalies down to the requested rate
                k = int(np.ceil(len(X_normal) * (anomaly_rate_all / (1 - anomaly_rate_all))))
                anomaly_hoge = random.sample(X_anomaly, k)
                normal_hoge = X_normal
            else:
                # drop normals instead
                n_normal = int(len(X_anomaly) / anomaly_rate_all) - len(X_anomaly)
                normal_rate = n_normal / len(X_normal)
                k = int(np.ceil(len(X_normal) * normal_rate))
                normal_hoge = random.sample(X_normal, k)
                anomaly_hoge = X_anomaly
            X_anomaly = anomaly_hoge
            X_normal = normal_hoge

        # Optional subsampling: keep sample_nor normals plus the matching
        # number of anomalies at the configured contamination rate.
        if sample_nor is not None:
            X_normal = random.sample(X_normal, sample_nor)
            cont = clf.contamination
            sample_ano = (cont / (1 - cont)) * sample_nor
            X_anomaly = random.sample(X_anomaly, int(sample_ano))

        cutter_anomaly = len(X_anomaly) * rate
        cutter_normal = len(X_normal) * rate
        X_sepa_ano = []
        X_sepa_nor = []
        for i in range(cross_count):
            head2 = int(cutter_normal * i)
            tail2 = int(cutter_normal * (i + 1)) - 1
            X_sepa_nor.append(X_normal[head2:tail2 + 1])
            head = int(cutter_anomaly * i)
            tail = int(cutter_anomaly * (i + 1)) - 1
            X_sepa_ano.append(X_anomaly[head:tail + 1])
    else:
        # Mixed: slice folds straight out of X / y in order.
        X_sepa = []
        y_sepa = []
        cutter = len(X) * rate
        for i in range(cross_count):
            head = int(cutter * i)
            tail = int(cutter * (i + 1)) - 1
            X_sepa.append(X[head:tail + 1])
            y_sepa.append(y[head:tail + 1])

    # ---- cross-validation loop ---------------------------------------------
    for count in range(cross_count):
        if sepaLabel:
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []
            for i in range(cross_count):
                if i != count:
                    # Training folds; optionally rebalance their anomaly rate.
                    train_flag = True
                    if anomaly_rate is not None:
                        if clf.contamination != anomaly_rate:
                            train_flag = False
                        if clf.contamination != anomaly_rate:
                            # renamed from `all` -- shadowed the builtin
                            n_total = len(X_sepa_nor[i]) + len(X_sepa_ano[i])
                            cont = len(X_sepa_ano[i]) / n_total
                            if cont > anomaly_rate:
                                # too many anomalies: sample them down
                                k = int(np.ceil(len(X_sepa_nor[i]) * (anomaly_rate / (1 - anomaly_rate))))
                                if len(X_sepa_ano[i]) < k:
                                    k = len(X_sepa_ano[i])
                                anomaly_hoge = random.sample(X_sepa_ano[i], k)
                                normal_hoge = X_sepa_nor[i]
                            else:
                                # too many normals: sample them down
                                k = int(len(X_sepa_ano[i]) / anomaly_rate) - len(X_sepa_ano[i])
                                normal_hoge = random.sample(X_sepa_nor[i], k)
                                anomaly_hoge = X_sepa_ano[i]
                            X_train.extend(anomaly_hoge)
                            for j in range(len(anomaly_hoge)):
                                X_train_correct.append(-1)
                            X_train.extend(normal_hoge)
                            for j in range(len(normal_hoge)):
                                X_train_correct.append(1)
                    if train_flag:
                        X_train.extend(X_sepa_ano[i])
                        for j in range(len(X_sepa_ano[i])):
                            X_train_correct.append(-1)
                        X_train.extend(X_sepa_nor[i])
                        for j in range(len(X_sepa_nor[i])):
                            X_train_correct.append(1)
                else:
                    # Test fold; hook for rebalancing its anomaly rate.
                    # anomaly_rate2 is hard-coded to None, so only the plain
                    # copy below ever runs (resampling branch is dead code).
                    anomaly_rate2 = None
                    test_flag = True
                    if anomaly_rate2 is not None:
                        clf.contamination = anomaly_rate2
                        if clf.contamination != anomaly_rate2:
                            test_flag = False
                            if clf.contamination > anomaly_rate2:
                                k = int(np.ceil(len(X_sepa_nor[i]) * (anomaly_rate / (1 - anomaly_rate))))
                                anomaly_hoge = random.sample(X_sepa_ano[i], k)
                                normal_hoge = X_sepa_nor[i]
                            else:
                                n_normal = int(len(X_sepa_ano[i]) / anomaly_rate) - len(X_sepa_ano[i])
                                normal_rate = n_normal / len(X_sepa_nor[i])
                                k = int(np.ceil(len(X_sepa_nor[i]) * normal_rate))
                                normal_hoge = random.sample(X_sepa_nor[i], k)
                                anomaly_hoge = X_sepa_ano[i]
                            X_test.extend(anomaly_hoge)
                            for j in range(len(anomaly_hoge)):
                                X_test_correct.append(-1)
                            X_test.extend(normal_hoge)
                            for j in range(len(normal_hoge)):
                                X_test_correct.append(1)
                    if test_flag:
                        X_test.extend(X_sepa_ano[i])
                        for j in range(len(X_sepa_ano[i])):
                            X_test_correct.append(-1)
                        X_test.extend(X_sepa_nor[i])
                        for j in range(len(X_sepa_nor[i])):
                            X_test_correct.append(1)

            # Optionally shuffle train/test while keeping labels aligned.
            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    X_train_set.append([X_train[i], X_train_correct[i]])
                for i in range(len(X_test)):
                    X_test_set.append([X_test[i], X_test_correct[i]])
                random.shuffle(X_train_set)
                random.shuffle(X_test_set)
                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])
        else:
            # Mixed folds.
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []
            for i in range(cross_count):
                if i != count:
                    X_train.extend(X_sepa[i])
                    X_train_correct.extend(y_sepa[i])
                else:
                    X_test.extend(X_sepa[i])
                    X_test_correct.extend(y_sepa[i])
            # Map dataset labels (1 = anomaly) to IsolationForest convention
            # (-1 = anomaly, 1 = normal).
            for q in range(len(X_train_correct)):
                if X_train_correct[q] == 1:
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1
            for w in range(len(X_test_correct)):
                if X_test_correct[w] == 1:
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1

        # ---- finished cutting data; optional PCA reduction ------------------
        if pcaLabel:
            pca_fit_start = time.time()
            pca = PCA(copy=True, iterated_power='auto', n_components=n_comp,
                      random_state=None, svd_solver='auto', tol=0.0, whiten=False)
            pca.fit(X_train)
            pca_fit_finish = time.time()
            pca_transform_train_start = time.time()
            X_train = pca.transform(X_train)
            pca_transform_train_finish = time.time()
            clf.max_features = n_comp
            pca_fit_time += (pca_fit_finish - pca_fit_start)
            pca_transform_train_time += (pca_transform_train_finish - pca_transform_train_start)

        # Optional axis selection by variance / "kurtosis". Hard-disabled.
        # NOTE(review): the moment formulas are swapped -- `kurtosis` holds the
        # 3rd standardized moment (which is skewness) and `skewness` holds the
        # 4th minus 3 (excess kurtosis); `skewness` is also unset when std == 0.
        # Preserved as found because both flags are hard-coded False.
        varLabel = False
        kurtosisLabel = False
        if varLabel or kurtosisLabel:
            var = []
            kurtosis_set = []
            skewness_set = []
            for i in range(len(X_train[0])):
                data = []
                for j in range(len(X_train)):
                    data.append(X_train[j][i])
                data = np.array(data)
                ave = np.average(data)
                std = np.std(data)
                if std == 0:
                    kurtosis = -10000  # sentinel for constant columns
                else:
                    kurtosis = np.average((data - ave) ** 3) / (std ** 3)
                    skewness = np.average((data - ave) ** 4) / (std ** 4) - 3
                var.append(std)
                kurtosis_set.append(kurtosis)
                skewness_set.append(skewness)
            var_rank = np.argsort(var)[::-1]
            kurtosis_rank = np.argsort(kurtosis_set)[::-1]
            skewness_rank = np.argsort(skewness_set)[::-1]  # unused for now
            picked = []
            for i in range(clf.max_features):
                if varLabel:
                    picked.append(np.array(X_train)[:, var_rank[i]])
                elif kurtosisLabel:
                    picked.append(np.array(X_train)[:, kurtosis_rank[i]])
            X_train = np.array(picked).T

        # ---- fit -----------------------------------------------------------
        fit_start = time.time()
        if fit_ylabel:  # kept False in practice
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        # ---- predict -------------------------------------------------------
        # NOTE(review): clf.predict(...) is unpacked into (labels, scores);
        # this relies on a locally modified IsolationForest, not stock sklearn.
        if stream:
            # Score one test sample at a time.
            sum_score_auc = []
            sum_score_acc = []
            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    X_test_pca = pca.transform([X_test[i]])
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)
                else:
                    X_test_pca = [X_test[i]]
                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)
                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc
        else:
            # Batch scoring.
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)
            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC_roc = calc_AUC(X_test_correct, a_score, treeLabel)
        FPR, TPR, TNR, FNR = calc_FN(X_test_correct, y_pred_test)
        FNR_sum += FNR
        FPR_sum += FPR
        sum_auc_roc += AUC_roc
        sum_accuracy += acc
        # (large commented-out matplotlib scatter-plot debug block removed)

    # ---- fold averages -----------------------------------------------------
    auc2_roc = sum_auc_roc / cross_count
    acc2 = sum_accuracy / cross_count
    fnr = FNR_sum / cross_count
    fpr = FPR_sum / cross_count  # NOTE(review): computed but never returned

    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time  # NOTE(review): unused here

    if time_label:
        return all_time
    elif treeLabel:
        return fnr
    else:
        return auc2_roc, acc2
def main(filename, xtrains_percent=0.8, maxfeature=3, fit_ylabel=False,
         nn_estimator=100, sepaLabel=True, treeLabel=False, seed=42,
         pcaLabel=False, n_comp=2, sepa2=False, time_label=False,
         stream=False, sfl=False):
    """Cross-validated IsolationForest run (rotating-window fold variant).

    Unlike the first `main` in this file, folds are built by rotating a train
    window over the data (train fraction = xtrains_percent) and the fold count
    is derived from 1 / (1 - rate), rounded down for exact divisions.

    Returns
    -------
    * time_label: 7-tuple (all_time, pca train prep, fit, pca test prep,
      predict, summed train time, summed test time)
    * treeLabel: mean AUC (raises when NaN)
    * otherwise: (mean AUC, mean accuracy)

    NOTE(review): this redefines `main`; whichever definition appears last in
    the file wins at import time.
    """
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # http and smtp are HDF5-style .mat files -> load via h5py.
    if filename == '/home/anegawa/Dropbox/http.mat' or filename == '/home/anegawa/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        y = np.reshape(y3, [len(y3), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:
        # 3 doubles as a sentinel for "use all features".
        # NOTE(review): a caller genuinely wanting max_features=3 cannot ask
        # for it; None would be a safer sentinel, but keep behavior as-is.
        max_feat = X.shape[1]

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    # Known contamination rate per benchmark dataset.
    if str(filename) == '/home/anegawa/Dropbox/shuttle.mat':
        clf.contamination = 0.07
    elif str(filename) == '/home/anegawa/Dropbox/http.mat':
        clf.contamination = 0.004
    elif str(filename) == '/home/anegawa/Dropbox/pima.mat':
        clf.contamination = 0.35
    elif str(filename) == '/home/anegawa/Dropbox/mammography.mat':
        clf.contamination = 0.02
    elif str(filename) == '/home/anegawa/Dropbox/cover.mat':
        clf.contamination = 0.009
    elif str(filename) == '/home/anegawa/Dropbox/breastw.mat':
        clf.contamination = 0.35
    elif str(filename) == '/home/anegawa/Dropbox/arrhythmia.mat':
        clf.contamination = 0.15
    elif str(filename) == '/home/anegawa/Dropbox/ionosphere.mat':
        clf.contamination = 0.36
    elif str(filename) == '/home/anegawa/Dropbox/satellite.mat':
        clf.contamination = 0.32
    elif str(filename) == '/home/anegawa/Dropbox/annthyroid.mat':
        clf.contamination = 0.07
    elif str(filename) == '/home/anegawa/Dropbox/smtp.mat':
        clf.contamination = 0.03 / 100
    else:
        raise Exception("error! cannot file it.")

    # Number of CV rounds (e.g. 5 for an 80:20 split); drop the last partial
    # round when 1 / (1 - rate) is not an integer.
    folds = 1 / (1 - rate)
    cross_count = int(np.ceil(folds))
    if cross_count > folds:
        cross_count = cross_count - 1

    # Accumulators over the folds.
    sum_auc = 0
    sum_accuracy = 0
    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    if sepaLabel:
        # Separate anomalies (y == 1) from normals so every fold keeps the
        # dataset's class ratio.
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])
        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))

    for count in range(cross_count):
        if sepaLabel:
            # Rotate the start of the train window for this fold.
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []
            for i, k in zip(range(len(X_anomaly)),
                            range(part_anomaly, part_anomaly + len(X_anomaly))):
                while k >= len(X_anomaly):  # wrap around the end
                    k = k - len(X_anomaly)
                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)
                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)
            for i, k in zip(range(len(X_normal)),
                            range(part_normal, part_normal + len(X_normal))):
                while k >= len(X_normal):
                    k = k - len(X_normal)
                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)

            # Optionally shuffle train/test while keeping labels aligned.
            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    X_train_set.append([X_train[i], X_train_correct[i]])
                for i in range(len(X_test)):
                    X_test_set.append([X_test[i], X_test_correct[i]])
                random.shuffle(X_train_set)
                random.shuffle(X_test_set)
                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])
        else:
            # Mixed: rotate a window directly over the raw data.
            cutter = len(X) * rate
            part = int(np.ceil(cutter * count))
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []
            for i, k in zip(range(len(X)), range(part, part + len(X))):
                while k >= len(X):
                    k = k - len(X)
                if i < len(X) * rate:
                    X_train.append(X[k])
                    X_train_correct.append(y[k])
                else:
                    X_test.append(X[k])
                    X_test_correct.append(y[k])
            # Map dataset labels (1 = anomaly) to IsolationForest convention
            # (-1 = anomaly, 1 = normal).
            for q in range(len(X_train_correct)):
                if X_train_correct[q] == 1:
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1
            for w in range(len(X_test_correct)):
                if X_test_correct[w] == 1:
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1

        # ---- finished cutting data; optional PCA reduction ------------------
        if pcaLabel:
            if sepa2:
                # Fit PCA on normal training samples only, then keep the first
                # n_comp rows of the sorted components.
                # BUG FIX: `X_train_normal` was referenced but never defined
                # anywhere (guaranteed NameError when sepa2 and pcaLabel were
                # both True); build it from the normal-labelled part of the
                # training split.
                X_train_normal = [X_train[i] for i in range(len(X_train))
                                  if X_train_correct[i] == 1]
                pca2 = PCA(copy=True, iterated_power='auto', random_state=None,
                           svd_solver='auto', tol=0.0, whiten=False)
                pca2.fit(X_train_normal)
                # NOTE(review): np.sort sorts values WITHIN each component row,
                # scrambling the loadings -- looks suspicious; verify intent.
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                X_train = pca2.transform(X_train)
                X_test = pca2.transform(X_test)
            else:
                pca_fit_start = time.time()
                pca = PCA(copy=True, iterated_power='auto', n_components=n_comp,
                          random_state=None, svd_solver='auto', tol=0.0,
                          whiten=False)
                pca.fit(X_train)
                pca_fit_finish = time.time()
                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()
                clf.max_features = n_comp
                pca_fit_time += (pca_fit_finish - pca_fit_start)
                pca_transform_train_time += (pca_transform_train_finish - pca_transform_train_start)

        # ---- fit -----------------------------------------------------------
        fit_start = time.time()
        if fit_ylabel:  # kept False in practice
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        # ---- predict -------------------------------------------------------
        # NOTE(review): clf.predict(...) is unpacked into (labels, scores);
        # this relies on a locally modified IsolationForest, not stock sklearn.
        if stream:
            # Score one test sample at a time.
            sum_score_auc = []
            sum_score_acc = []
            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    X_test_pca = pca.transform([X_test[i]])
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)
                else:
                    X_test_pca = [X_test[i]]
                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)
                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc
        else:
            # Batch scoring.
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)
            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        sum_accuracy += acc
        # (large commented-out matplotlib scatter-plot debug block removed)

    # ---- fold averages -----------------------------------------------------
    auc2 = sum_auc / cross_count
    acc2 = sum_accuracy / cross_count

    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time

    if time_label:
        return (all_time, pca_fit_time + pca_transform_train_time, fit_time,
                pca_transform_test_time, test_time, sum_train_time, sum_test_time)
    elif treeLabel:
        if math.isnan(auc2):
            raise Exception("error! auc is NaN!.")
        return auc2
    else:
        return auc2, acc2
def main(filename, xtrains_percent=0.8, maxfeature=1, fit_ylabel=False,
         nn_estimator=100, sepaLabel=True, treeLabel=False, seed=42,
         pcaLabel=False, n_comp=2, sepa2=False, time_label=False,
         stream=False, sfl=False):
    """Cross-validated Isolation Forest anomaly detection on a .mat dataset.

    Parameters
    ----------
    filename : str
        Path to the dataset. ``http.mat``/``smtp.mat`` are MATLAB v7.3
        (HDF5) files and are read via h5py; everything else via
        ``scipy.io.loadmat``.
    xtrains_percent : float
        Fraction of the data used for training in each fold.
    maxfeature : int
        ``max_features`` for the forest; the sentinel value 3 means
        "use every feature of X".
    fit_ylabel : bool
        If True, pass the training labels to ``clf.fit``.
    nn_estimator : int
        Number of trees in the forest.
    sepaLabel : bool
        True: cut folds from anomalies and normals separately so every fold
        keeps the class ratio. False: rotate folds over the mixed data.
    treeLabel : bool
        Quiet mode used by tree-count sweeps; suppresses printing and
        returns only the AUC.
    seed : int
        Seed for the numpy RandomState driving the forest.
    pcaLabel : bool
        If True, project the data with PCA before fitting.
    n_comp : int
        Number of PCA components kept.
    sepa2 : bool
        Alternate PCA variant (fit on normal training samples only).
    time_label : bool
        If True, return timing figures instead of scores.
    stream : bool
        If True, transform and score test samples one at a time
        (streaming simulation).
    sfl : bool
        If True, shuffle each fold's train/test sets (paired with labels).

    Returns
    -------
    ``(auc, acc)`` by default; a 7-tuple of timings when ``time_label``;
    the AUC alone when ``treeLabel``.

    Raises
    ------
    Exception
        If the dataset has no known contamination rate, or (``treeLabel``)
        the resulting AUC is NaN.
    """
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # --- load data -------------------------------------------------------
    # http and smtp are MATLAB v7.3 files, unreadable by scipy.io.loadmat.
    if filename == '/home/anegawa/Dropbox/http.mat' or filename == '/home/anegawa/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        labels = [int(v) for v in mat['y'][0]]
        y = np.reshape(labels, [len(labels), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:
        # sentinel: 3 stands for "all features" in the calling experiments
        max_feat = X.shape[1]

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    # Known anomaly rates per dataset.  BUG FIX: the original printed
    # 'cannot file it.' on an unknown path and fell into dead synthetic-data
    # code (which crashed on `np.ones([X_train.shape])`); fail loudly
    # instead, matching the sibling implementation.
    contamination_by_file = {
        '/home/anegawa/Dropbox/shuttle.mat': 0.07,
        '/home/anegawa/Dropbox/http.mat': 0.004,
        '/home/anegawa/Dropbox/pima.mat': 0.35,
        '/home/anegawa/Dropbox/mammography.mat': 0.02,
        '/home/anegawa/Dropbox/cover.mat': 0.009,
        '/home/anegawa/Dropbox/breastw.mat': 0.35,
        '/home/anegawa/Dropbox/arrhythmia.mat': 0.15,
        '/home/anegawa/Dropbox/ionosphere.mat': 0.36,
        '/home/anegawa/Dropbox/satellite.mat': 0.32,
        '/home/anegawa/Dropbox/annthyroid.mat': 0.07,
        '/home/anegawa/Dropbox/smtp.mat': 0.03 / 100,
    }
    if str(filename) not in contamination_by_file:
        raise Exception("error! cannot file it.")
    clf.contamination = contamination_by_file[str(filename)]

    # Number of folds = floor(1 / test-fraction); e.g. rate=0.8 -> 5 folds.
    hoge = 1 / (1 - rate)
    cross_count = int(np.ceil(hoge))
    if cross_count > hoge:
        cross_count = cross_count - 1

    sum_auc = 0
    sum_accuracy = 0
    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    if sepaLabel:
        # Split the pool by class once; folds rotate within each class.
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])
        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))
    else:
        # mixed mode: a contiguous window of len(X)*rate samples trains,
        # the rest tests; the window start rotates per fold.
        cutter = len(X) * rate

    # BUG FIX: the original had the fold-building loop duplicated inside
    # each branch and the evaluation code outside any loop, so only the
    # last fold was ever scored while the averages still divided by
    # cross_count.  One loop now builds AND evaluates every fold.
    for count in range(cross_count):
        X_train = []
        X_train_correct = []
        X_test = []
        X_test_correct = []
        if sepaLabel:
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            # Anomalies are labelled -1, normals +1 (sklearn convention).
            for i, k in zip(range(len(X_anomaly)),
                            range(part_anomaly, part_anomaly + len(X_anomaly))):
                k = k % len(X_anomaly)  # wrap around the pool
                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)
                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)
            for i, k in zip(range(len(X_normal)),
                            range(part_normal, part_normal + len(X_normal))):
                k = k % len(X_normal)
                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)
            if sfl:
                # Shuffle samples and labels together.
                # NOTE(review): random.shuffle is not seeded from `seed`,
                # so sfl runs are not reproducible — confirm intended.
                train_set = list(zip(X_train, X_train_correct))
                test_set = list(zip(X_test, X_test_correct))
                random.shuffle(train_set)
                random.shuffle(test_set)
                X_train = [s for s, _ in train_set]
                X_train_correct = [t for _, t in train_set]
                X_test = [s for s, _ in test_set]
                X_test_correct = [t for _, t in test_set]
        else:
            part = int(np.ceil(cutter * count))
            for i, k in zip(range(len(X)), range(part, part + len(X))):
                k = k % len(X)
                label = -1 if y[k] == 1 else 1
                if i < len(X) * rate:
                    X_train.append(X[k])
                    X_train_correct.append(label)
                else:
                    X_test.append(X[k])
                    X_test_correct.append(label)

        # ------- optional PCA projection ---------------------------------
        if pcaLabel:
            pca_fit_start = time.time()
            pca = PCA(copy=True, iterated_power='auto', n_components=n_comp,
                      random_state=None, svd_solver='auto', tol=0.0,
                      whiten=False)
            pca2 = PCA(copy=True, iterated_power='auto', random_state=None,
                       svd_solver='auto', tol=0.0, whiten=False)
            if sepa2:
                print("こっち入ってるけどええんか!?")
                # BUG FIX: X_train_normal was undefined (NameError).  Fit
                # the full-rank PCA on the normal training samples only,
                # which is what the name indicates.
                X_train_normal = [s for s, t in zip(X_train, X_train_correct)
                                  if t == 1]
                pca2.fit(X_train_normal)
                # NOTE(review): np.sort orders the values *inside* each
                # component vector, not the components themselves — looks
                # suspect but is preserved from the original.
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                X_train = pca2.transform(X_train)
                X_test = pca2.transform(X_test)
            else:
                pca.fit(X_train)
                pca_fit_finish = time.time()
                pca_fit_time += (pca_fit_finish - pca_fit_start)
                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()
                pca_transform_train_time += (pca_transform_train_finish
                                             - pca_transform_train_start)
            # The forest now sees n_comp features.
            clf.max_features = n_comp

        # ------- fit -----------------------------------------------------
        fit_start = time.time()
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        # ------- score the test fold -------------------------------------
        # NOTE: this project's IsolationForest.predict returns a pair
        # (labels, anomaly scores), unlike stock sklearn.
        if stream:
            # Transform and score one sample at a time, timing each stage.
            sum_score_auc = []
            sum_score_acc = []
            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    sample = pca.transform([X_test[i]])
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish
                                                - pca_transform_test_start)
                else:
                    sample = [X_test[i]]
                test_start = time.time()
                y_pred_test, a_score = clf.predict(sample)
                test_finish = time.time()
                test_time += (test_finish - test_start)
                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc
        else:
            if pcaLabel:
                # NOTE(review): with sepa2 the data was already projected
                # by pca2 and `pca` is never fitted — this path would fail
                # for sepa2 + pcaLabel, as in the original; confirm usage.
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish
                                            - pca_transform_test_start)
            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        # BUG FIX: accuracy was assigned (=) instead of accumulated (+=),
        # so acc2 below was the last fold's accuracy divided by cross_count.
        sum_accuracy += acc

    # --- averages over the folds -----------------------------------------
    auc2 = sum_auc / cross_count
    acc2 = sum_accuracy / cross_count

    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time

    if time_label:
        return (all_time, pca_fit_time + pca_transform_train_time, fit_time,
                pca_transform_test_time, test_time, sum_train_time,
                sum_test_time)
    elif treeLabel:
        # BUG FIX: a NaN AUC previously set a dummy flag (`majikayo`) and
        # returned NaN silently; fail loudly like the sibling version.
        if math.isnan(auc2):
            raise Exception("error! auc is NaN!.")
        return auc2
    else:
        return auc2, acc2