示例#1
0
文件: gcn.py 项目: UnixJunkie/kGCN
def _ensure_parent_dir(path):
    """Create the directory that will hold *path* when the path names one.

    ``os.path.dirname`` yields ``""`` for a bare file name and
    ``os.makedirs("")`` raises ``FileNotFoundError``, so that case is skipped.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _dump_cv_output(obj, save_path):
    """Serialize *obj* to *save_path*: JSON for ``.json``, joblib otherwise."""
    _ensure_parent_dir(save_path)
    _, ext = os.path.splitext(save_path)
    if ext == ".json":
        with open(save_path, "w") as fp:
            json.dump(obj, fp, indent=4, cls=NumPyArangeEncoder)
    else:
        joblib.dump(obj, save_path, compress=True)


def train_cv(sess, graph, config):
    """Train and evaluate the model with K-fold cross-validation.

    For every fold the data is split into train/validation/test parts, the
    model is fitted, evaluated on the validation and test parts, optionally
    exported, and the per-fold results are collected. Afterwards the collected
    results are optionally saved and plotted.

    Args:
        sess: Session handed to ``CoreModel`` and used for the graph export.
        graph: Graph whose ``as_graph_def()`` is exported when
            ``config["export_model"]`` is set.
        config: Settings mapping. Keys read here: ``dataset``, ``model.py``,
            ``stratified_kfold``, ``k-fold_num``, ``shuffle_data``, ``task``,
            ``validation_data_rate``, ``export_model``, ``plot_path`` and
            ``make_plot``; optional keys ``save_info_cv``,
            ``save_edge_result_cv`` and ``save_result_cv``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    stratified = config["stratified_kfold"]
    shuffle = config["shuffle_data"]
    # scikit-learn raises ValueError when random_state is given together with
    # shuffle=False, so the fixed seed is only passed when shuffling.
    random_state = 123 if shuffle else None
    if stratified:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=shuffle,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=shuffle,
                   random_state=random_state)

    has_labels = all_data["labels"] is not None
    split_base = all_data["labels"] if has_labels else all_data["label_list"][0]
    if stratified:
        # Stratification uses the index of the largest entry in each label row.
        split_base = np.argmax(split_base, axis=1)

    task = config["task"]
    is_regression = task in ("regression", "regression_gmfe")
    metric_name = {"regression": "mse",
                   "regression_gmfe": "gmfe"}.get(task, "accuracy")

    if stratified:
        split_data_generator = kf.split(split_base, split_base)
    else:
        split_data_generator = kf.split(split_base)

    fold_data_list = []
    output_data_list = []
    score_metrics = []
    for kf_count, (train_valid_list, test_list) in enumerate(
            split_data_generator, start=1):
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")

        # Evaluation on the validation part (its predictions are not kept).
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, _ = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")

        # Evaluation on the held-out test part.
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}\n")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception as err:
                # Export stays best-effort, but Ctrl-C / SystemExit are no
                # longer swallowed and the real cause is reported.
                print('[ERROR] output has been not found')
                print(f"[ERROR] {err!r}")
        if "save_edge_result_cv" in config:
            output_data_list.append(model.output(test_data))

        # Collect everything that is saved or plotted for this fold.
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if has_labels:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        train_curve = [
            el["training_" + metric_name]
            for el in model.training_metrics_list
        ]
        valid_curve = [
            el["validation_" + metric_name]
            for el in model.validation_metrics_list
        ]
        if is_regression:
            # Both regression tasks store their curves under the *_mse names.
            fold_data.training_mse = train_curve
            fold_data.validation_mse = valid_curve
        else:
            fold_data.training_acc = train_curve
            fold_data.validation_acc = valid_curve
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)

    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.)   = {np.std(score_metrics)}\n")

    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        print(f"[SAVE] {save_path}")
        _dump_cv_output(fold_data_list, save_path)

    if "save_edge_result_cv" in config and config[
            "save_edge_result_cv"] is not None:
        result_cv = []
        for fold_data, output_data in zip(fold_data_list, output_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            # Each entry of true_label[0] unpacks into six values; the 1st/3rd
            # and 4th/6th index into pred_score[0].
            score_list = [
                [pred_score[0, i1, j1], pred_score[0, i2, j2]]
                for i1, _, j1, i2, _, j2 in true_label[0]
            ]
            result_cv.append({
                "output": output_data[0],
                "score": np.array(score_list),
                "test_data_idx": fold_data.test_data_idx,
            })
        save_path = config["save_edge_result_cv"]
        print(f"[SAVE] {save_path}")
        _dump_cv_output(result_cv, save_path)

    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = [
            compute_metrics(config, info, fold_data.prediction_data,
                            fold_data.test_labels)
            for fold_data in fold_data_list
        ]
        save_path = config["save_result_cv"]
        print(f"[SAVE] {save_path}")
        _ensure_parent_dir(save_path)
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    # The plot directory is created even when plotting is disabled.
    result_path = config["plot_path"]
    os.makedirs(result_path, exist_ok=True)
    if config["make_plot"]:
        for i, fold_data in enumerate(fold_data_list):
            prefix = "fold" + str(i) + "_"
            if is_regression:
                train_curve = fold_data.training_mse
                valid_curve = fold_data.validation_mse
            else:
                train_curve = fold_data.training_acc
                valid_curve = fold_data.validation_acc
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               train_curve,
                               valid_curve,
                               result_path,
                               prefix=prefix)
            if is_regression:
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            elif task != "link_prediction":
                # link_prediction only gets the cost/accuracy plot.
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config,
                         fold_data.test_labels,
                         pred_score,
                         prefix=prefix)
示例#2
0
文件: gcn.py 项目: 0h-n0/kGCN
def _ensure_parent_dir(path):
    """Create the directory that will hold *path* when the path names one.

    ``os.path.dirname`` yields ``""`` for a bare file name and
    ``os.makedirs("")`` raises ``FileNotFoundError``, so that case is skipped.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _dump_cv_output(obj, save_path):
    """Serialize *obj* to *save_path*: JSON for ``.json``, joblib otherwise."""
    _ensure_parent_dir(save_path)
    _, ext = os.path.splitext(save_path)
    if ext == ".json":
        # The context manager guarantees the file is flushed and closed.
        with open(save_path, "w") as fp:
            json.dump(obj, fp, indent=4, cls=NumPyArangeEncoder)
    else:
        joblib.dump(obj, save_path, compress=True)


def _cv_fold_metrics(task, label_dim, prediction_data, test_labels):
    """Compute the test metrics of one fold, one dict per label column.

    Args:
        task: Value of ``config["task"]``.
        label_dim: Number of label columns to evaluate.
        prediction_data: Predictions of the fold (converted with ``np.array``).
        test_labels: True labels of the fold (converted with ``np.array``).

    Returns:
        A list with ``label_dim`` dicts. Keys are ``r2``/``mse`` for
        ``"regression"``, ``gmfe`` for ``"regression_gmfe"``, and
        ``auc``/``acc``/``pre``/``rec``/``f``/``sup`` otherwise.
    """
    from sklearn.metrics import (accuracy_score, auc, mean_squared_error,
                                 precision_recall_fscore_support, r2_score,
                                 roc_curve)

    pred_score = np.array(prediction_data)
    if pred_score.ndim == 3:  # multi-label-multi-task
        # #data x #task x #class
        # => this program supports only 2 labels
        pred_score = pred_score[:, :, 1]
    true_label = np.array(test_labels)
    # 1-D inputs are promoted to a single column so [:, i] always works.
    if pred_score.ndim == 1:
        pred_score = pred_score[:, np.newaxis]
    if true_label.ndim == 1:
        true_label = true_label[:, np.newaxis]

    is_classification = task not in ("regression", "regression_gmfe")
    if is_classification:
        # Thresholded predictions do not depend on the label index, so they
        # are built once instead of once per label.
        pred = np.zeros(pred_score.shape)
        pred[pred_score > 0.5] = 1

    metrics = []
    for i in range(label_dim):
        el = {}
        if task == "regression":
            el["r2"] = r2_score(true_label[:, i], pred_score[:, i])
            el["mse"] = mean_squared_error(true_label[:, i], pred_score[:, i])
        elif task == "regression_gmfe":
            # NOTE(review): the log ratio is averaged with its sign; confirm
            # whether an absolute value is intended for this metric.
            el["gmfe"] = np.exp(
                np.mean(np.log(true_label[:, i] / pred_score[:, i])))
        else:
            fpr, tpr, _ = roc_curve(true_label[:, i],
                                    pred_score[:, i],
                                    pos_label=1)
            scores = precision_recall_fscore_support(true_label[:, i],
                                                     pred[:, i],
                                                     average='binary')
            el["auc"] = auc(fpr, tpr)
            el["acc"] = accuracy_score(true_label[:, i], pred[:, i])
            el["pre"] = scores[0]
            el["rec"] = scores[1]
            el["f"] = scores[2]
            el["sup"] = scores[3]
        metrics.append(el)
    return metrics


def train_cv(sess, graph, config):
    """Train and evaluate the model with K-fold cross-validation.

    For every fold the data is split into train/validation/test parts, the
    model is fitted, evaluated on the validation and test parts, optionally
    exported, and the per-fold results are collected. Afterwards the collected
    results are optionally saved and plotted.

    Args:
        sess: Session handed to ``CoreModel`` and used for the graph export.
        graph: Graph whose ``as_graph_def()`` is exported when
            ``config["export_model"]`` is set.
        config: Settings mapping. Keys read here: ``dataset``, ``model.py``,
            ``stratified_kfold``, ``k-fold_num``, ``shuffle_data``, ``task``,
            ``validation_data_rate``, ``export_model``, ``plot_path`` and
            ``make_plot``; optional keys ``save_info_cv``,
            ``save_edge_result_cv`` and ``save_result_cv``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    # The unused ``from scipy import interp`` was dropped: SciPy deprecated
    # that alias and importing it can fail on newer releases.
    from sklearn.model_selection import KFold, StratifiedKFold
    from kgcn.make_plots import make_cost_acc_plot

    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    stratified = config["stratified_kfold"]
    shuffle = config["shuffle_data"]
    # scikit-learn raises ValueError when random_state is given together with
    # shuffle=False, so the fixed seed is only passed when shuffling.
    random_state = 123 if shuffle else None
    if stratified:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=shuffle,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=shuffle,
                   random_state=random_state)

    has_labels = all_data["labels"] is not None
    split_base = all_data["labels"] if has_labels else all_data["label_list"][0]
    if stratified:
        # Stratification uses the index of the largest entry in each label row.
        split_base = np.argmax(split_base, axis=1)

    task = config["task"]
    is_regression = task in ("regression", "regression_gmfe")
    metric_name = {"regression": "mse",
                   "regression_gmfe": "gmfe"}.get(task, "accuracy")

    if stratified:
        split_data_generator = kf.split(split_base, split_base)
    else:
        split_data_generator = kf.split(split_base)

    fold_data_list = []
    output_data_list = []
    score_metrics = []
    for kf_count, (train_valid_list, test_list) in enumerate(
            split_data_generator, start=1):
        print("starting fold:{0}".format(kf_count))
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print("traing time:{0}".format(train_time) + "[sec]")

        # Evaluation on the validation part (its predictions are not kept).
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, _ = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", valid_cost)
        print("%s   =%f" % (metric_name, valid_metrics[metric_name]))
        print("infer time:{0}".format(infer_time) + "[sec]")

        # Evaluation on the held-out test part.
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print("final cost =", test_cost)
        print("%s   =%f" % (metric_name, test_metrics[metric_name]))
        score_metrics.append(test_metrics[metric_name])
        print("infer time:{0}".format(infer_time) + "[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print("[SAVE]", filename)
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception as err:
                # Export stays best-effort, but Ctrl-C / SystemExit are no
                # longer swallowed and the real cause is reported.
                print('[ERROR] output has been not found')
                print("[ERROR]", repr(err))
        if "save_edge_result_cv" in config:
            output_data_list.append(model.output(test_data))

        # Collect everything that is saved or plotted for this fold.
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if has_labels:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        train_curve = [
            el["training_" + metric_name]
            for el in model.training_metrics_list
        ]
        valid_curve = [
            el["validation_" + metric_name]
            for el in model.validation_metrics_list
        ]
        if is_regression:
            # Both regression tasks store their curves under the *_mse names.
            fold_data.training_mse = train_curve
            fold_data.validation_mse = valid_curve
        else:
            fold_data.training_acc = train_curve
            fold_data.validation_acc = valid_curve
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)

    print("cv %s(mean)   =%f" % (metric_name, np.mean(score_metrics)))
    print("cv %s(std.)   =%f" % (metric_name, np.std(score_metrics)))

    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        print("[SAVE] ", save_path)
        _dump_cv_output(fold_data_list, save_path)

    if "save_edge_result_cv" in config and config[
            "save_edge_result_cv"] is not None:
        result_cv = []
        for fold_data, output_data in zip(fold_data_list, output_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            # Each entry of true_label[0] unpacks into six values; the 1st/3rd
            # and 4th/6th index into pred_score[0].
            score_list = [
                [pred_score[0, i1, j1], pred_score[0, i2, j2]]
                for i1, _, j1, i2, _, j2 in true_label[0]
            ]
            result_cv.append({
                "output": output_data[0],
                "score": np.array(score_list),
                "test_data_idx": fold_data.test_data_idx,
            })
        save_path = config["save_edge_result_cv"]
        print("[SAVE] ", save_path)
        _dump_cv_output(result_cv, save_path)

    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = [
            _cv_fold_metrics(task, info.label_dim, fold_data.prediction_data,
                             fold_data.test_labels)
            for fold_data in fold_data_list
        ]
        save_path = config["save_result_cv"]
        print("[SAVE] ", save_path)
        _ensure_parent_dir(save_path)
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    # The plot directory is created even when plotting is disabled.
    result_path = config["plot_path"]
    os.makedirs(result_path, exist_ok=True)
    if config["make_plot"]:
        for i, fold_data in enumerate(fold_data_list):
            prefix = "fold" + str(i) + "_"
            if is_regression:
                # plot cost
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path + prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            else:
                # plot cost
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path + prefix)
                # plot AUC
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config,
                         fold_data.test_labels,
                         pred_score,
                         prefix=prefix)