# NOTE: module-level imports for this section. Helpers such as load_data,
# split_data, CoreModel, load_model_py, dotdict, NumPyArangeEncoder,
# compute_metrics, plot_r2 and plot_auc are defined elsewhere in kGCN.
import json
import os
import time

import joblib
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.python.framework import graph_util

from kgcn.make_plots import make_cost_acc_plot


def train_cv(sess, graph, config):
    # Shuffling is left to KFold, so the loader must not shuffle.
    all_data, info = load_data(config, filename=config["dataset"],
                               prohibit_shuffle=True)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    # Build the cross-validation splitter.
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=config["shuffle_data"],
                             random_state=123)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=config["shuffle_data"],
                   random_state=123)

    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        # StratifiedKFold needs class indices, not one-hot vectors.
        split_base = np.argmax(split_base, axis=1)

    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"

    split_data_generator = (kf.split(split_base, split_base)
                            if config["stratified_kfold"]
                            else kf.split(split_base))
    for train_valid_list, test_list in split_data_generator:
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data,
            valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)  # indices used for training/validation
        print(test_list)         # indices held out for testing
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")

        # Evaluation on the validation split
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")

        # Evaluation on the held-out test split
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}\n")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print("[ERROR] output node was not found")
        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)

        # Save per-fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [
                el["training_mse"] for el in model.training_metrics_list]
            fold_data.validation_mse = [
                el["validation_mse"] for el in model.validation_metrics_list]
        elif config["task"] == "regression_gmfe":
            fold_data.training_mse = [
                el["training_gmfe"] for el in model.training_metrics_list]
            fold_data.validation_mse = [
                el["validation_gmfe"] for el in model.validation_metrics_list]
        else:
            fold_data.training_acc = [
                el["training_accuracy"] for el in model.training_metrics_list]
            fold_data.validation_acc = [
                el["validation_accuracy"] for el in model.validation_metrics_list]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1

    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.) = {np.std(score_metrics)}\n")

    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)

    if "save_edge_result_cv" in config and config["save_edge_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            test_idx = fold_data.test_data_idx
            score_list = []
            for pair in true_label[0]:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
                score_list.append([s1, s2])
            fold = {
                "output": output_data_list[j][0],
                "score": np.array(score_list),
                "test_data_idx": test_idx,
            }
            result_cv.append(fold)
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)

    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for fold_data in fold_data_list:
            v = compute_metrics(config, info, fold_data.prediction_data,
                                fold_data.test_labels)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    # Per-fold plots
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        os.makedirs(result_path, exist_ok=True)
        if config["make_plot"]:
            if config["task"] in ("regression", "regression_gmfe"):
                # Both regression tasks share the same cost/MSE and R^2 plots.
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path, prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config, fold_data.test_labels, pred_score, prefix=prefix)
            elif config["task"] == "link_prediction":
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path, prefix=prefix)
            else:
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path, prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config, fold_data.test_labels, pred_score, prefix=prefix)
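

# Usage sketch: a minimal TF1-style driver for train_cv. The config keys are
# the ones read above; every concrete value (file names, fold count, rates)
# is an assumption for illustration, not part of the original file.
if __name__ == "__main__":
    config = {
        "dataset": "dataset.jbl",        # hypothetical dataset file
        "model.py": "model.py",          # hypothetical model definition
        "batch_size": 32,                # read by the model setup
        "learning_rate": 0.01,
        "stratified_kfold": False,
        "k-fold_num": 5,
        "shuffle_data": True,
        "validation_data_rate": 0.2,
        "task": "classification",        # anything but regression* uses accuracy
        "export_model": None,            # set a path to freeze each fold's graph
        "save_info_cv": "result/info_cv.json",
        "save_result_cv": "result/result_cv.json",
        "plot_path": "result/",
        "make_plot": False,
    }
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:
            train_cv(sess, graph, config)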