def train(sess, graph, config):
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config, filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(config, filename=config["validation_dataset"])
        # merge dataset statistics so that one model fits both datasets
        info["graph_node_num"] = max(info["graph_node_num"], valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]
    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]))
    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        writer = tf.summary.FileWriter('logs', sess.graph)
    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("training time: {0}[sec]".format(train_time))
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print("accuracy =", validation_metrics["accuracy"])
        print("validation time: {0}[sec]".format(infer_time))
        # Saving
        if config["save_info_valid"] is not None:
            result = {}
            result["validation_cost"] = validation_cost
            result["validation_accuracy"] = validation_metrics
            result["train_time"] = train_time
            result["infer_time"] = infer_time
            save_path = config["save_info_valid"]
            print("[SAVE]", save_path)
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["export_model"]:
        try:
            print("[SAVE]", config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"], as_text=False)
        except Exception:
            print("[ERROR] the 'output' node was not found")
    if config["save_result_valid"] is not None:
        filename = config["save_result_valid"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_cost(config, valid_data, model)
        plot_auc(config, valid_data.labels, np.array(prediction_data))
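# NumPyArangeEncoder (passed as cls= to json.dump above) is imported from
# elsewhere in this package. As a rough sketch of what such an encoder has to
# do -- an assumption about its behavior, not the package's actual class -- it
# converts numpy scalars and arrays into plain Python types so json can
# serialize the result dicts:
import json
import numpy as np

class NumPyArangeEncoderSketch(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)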
def reconstruct(sess, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    print("norm=", info.norm)
    model = CoreModel(sess, config, info, construct_feed_callback=construct_feed)
    model.build(importlib.import_module(config["model.py"]), is_train=False)
    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)
    # Restore the trained model
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])
    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    recons_data = pred_data
    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(recons_data, filename)
def infer(sess, graph, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]), is_train=False)
    # Restore the trained model
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])
    # Test evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time: {0}[sec]".format(infer_time))
    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        save_path = config["save_info_test"]
        print("[SAVE]", save_path)
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["prediction_data"] is None:
        print("[ERROR] prediction_data is required")
        quit()
    obj = {}
    obj["prediction_data"] = prediction_data
    obj["labels"] = all_data.labels
    os.makedirs(os.path.dirname(config["prediction_data"]), exist_ok=True)
    joblib.dump(obj, config["prediction_data"])
def reconstruct(sess, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")
    model = CoreModel(sess, config, info, construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)
    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)
    # Restore the trained model
    restore_ckpt(sess, config["load_model"])
    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    recons_data = pred_data
    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(recons_data, filename)
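# get_pos_weight and get_norm are defined elsewhere in the package. For graph
# autoencoders, the usual definitions (as in Kipf & Welling's GAE) reweight the
# sparse positive entries of the adjacency matrix and normalize the
# reconstruction loss. A minimal sketch under that assumption -- not the
# package's exact helpers:
def get_pos_weight_sketch(adj):
    """adj: dense 0/1 adjacency matrix of shape (N, N)."""
    n_total = adj.shape[0] * adj.shape[1]
    n_pos = adj.sum()
    # weight applied to positive (edge) entries in the reconstruction loss
    return (n_total - n_pos) / n_pos

def get_norm_sketch(adj):
    n_total = adj.shape[0] * adj.shape[1]
    n_pos = adj.sum()
    # scales the loss so it averages correctly over the imbalanced entries
    return n_total / (2.0 * (n_total - n_pos))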
def visualize(sess, config, args):
    from kgcn.visualization import cal_feature_IG
    # Inputs are fed one molecule at a time
    batch_size = 1
    # Load the whole dataset together with its graph-related metadata
    all_data, info = load_data(config, filename=config["dataset"], prohibit_shuffle=True)
    model = importlib.import_module(config["model.py"])
    placeholders = model.build_placeholders(info, config, batch_size=batch_size)
    try:
        # Visualization for models with an embedding layer: IG is computed
        # with respect to the output of the embedding layer.
        _model, prediction, _, _, _ = model.build_model(
            placeholders, info, config, batch_size=batch_size, feed_embedded_layer=True)
    except TypeError:
        # The model does not accept feed_embedded_layer
        _model, prediction, _, _, _ = model.build_model(
            placeholders, info, config, batch_size=batch_size)
    # Restore the trained model
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])
    # Compute integrated gradients
    cal_feature_IG(sess, all_data, placeholders, info, prediction,
                   args.ig_modal_target, args.ig_label_target,
                   logger=tf.logging, model=_model)
def visualize(sess, config, args):
    from kgcn.visualization import cal_feature_IG, cal_feature_IG_for_kg
    # Inputs are fed one molecule at a time
    batch_size = 1
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename, prohibit_shuffle=True)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False,
                  feed_embedded_layer=True, batch_size=batch_size)
    placeholders = model.placeholders
    restore_ckpt(sess, config["load_model"])
    # Compute integrated gradients
    if config["visualize_type"] == "graph":
        cal_feature_IG(sess, all_data, placeholders, info, config, model.prediction,
                       args.ig_modal_target, args.ig_label_target,
                       logger=tf.logging, model=model.nn, args=args)
    else:
        cal_feature_IG_for_kg(sess, all_data, placeholders, info, config, model.prediction,
                              logger=tf.logging, model=model.nn, args=args)
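# cal_feature_IG / cal_feature_IG_for_kg live in kgcn.visualization. As a
# rough sketch of the underlying idea (integrated gradients; this is not the
# package's actual implementation), the attribution scales the input away from
# a baseline and averages the gradients along that path:
#   IG(x) ~= (x - x0) * mean_k grad f(x0 + (k/m) * (x - x0))
# A minimal TF1-style illustration, where `x_ph` is an input placeholder,
# `target` is the scalar output being attributed, and the baseline is zero:
def integrated_gradients_sketch(sess, x_ph, target, x, steps=100):
    grad_op = tf.gradients(target, x_ph)[0]
    baseline = np.zeros_like(x)
    # gradients at evenly spaced points on the straight path from baseline to x
    grads = [sess.run(grad_op,
                      feed_dict={x_ph: baseline + (k / steps) * (x - baseline)})
             for k in range(1, steps + 1)]
    return (x - baseline) * np.mean(grads, axis=0)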
def generate(sess, config):
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    print("norm=", info.norm)
    model = CoreModel(sess, config, info, construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)
    # Restore the trained model
    restore_ckpt(sess, config["load_model"])
    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    generated_data = pred_data
    if "generation_test" in config:
        filename = config["generation_test"]
        dirname = os.path.dirname(filename)
        if dirname != "":
            os.makedirs(dirname, exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(generated_data, filename)
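# restore_ckpt is a package helper; judging from the older functions above,
# which call tf.train.Saver().restore directly, a minimal equivalent sketch
# (an assumption, not the actual helper) would be:
def restore_ckpt_sketch(sess, ckpt_path):
    saver = tf.train.Saver()
    print("[LOAD]", ckpt_path)
    saver.restore(sess, ckpt_path)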
def train_cv(sess, graph, config):
    all_data, info = load_data(config, filename=config["dataset"],
                               prohibit_shuffle=True)  # shuffling is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    # Training
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=config["shuffle_data"], random_state=123)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=config["shuffle_data"], random_state=123)
    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        split_base = np.argmax(split_base, axis=1)
    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"
    split_data_generator = (kf.split(split_base, split_base)
                            if config["stratified_kfold"] else kf.split(split_base))
    for train_valid_list, test_list in split_data_generator:
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data, indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])
        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")
        # Validation
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")
        # Test
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")
        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print("[ERROR] the 'output' node was not found")
        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)
        # Save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [el["training_mse"] for el in model.training_metrics_list]
            fold_data.validation_mse = [el["validation_mse"] for el in model.validation_metrics_list]
        elif config["task"] == "regression_gmfe":
            fold_data.training_mse = [el["training_gmfe"] for el in model.training_metrics_list]
            fold_data.validation_mse = [el["validation_gmfe"] for el in model.validation_metrics_list]
        else:
            fold_data.training_acc = [el["training_accuracy"] for el in model.training_metrics_list]
            fold_data.validation_acc = [el["validation_accuracy"] for el in model.validation_metrics_list]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1
    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.) = {np.std(score_metrics)}")
    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)
    if "save_edge_result_cv" in config and config["save_edge_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            test_idx = fold_data.test_data_idx
            score_list = []
            for pair in true_label[0]:
                # each pair encodes two edges: (i1, r1, j1) and (i2, r2, j2)
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
                score_list.append([s1, s2])
            fold = {}
            fold["output"] = output_data_list[j][0]
            fold["score"] = np.array(score_list)
            fold["test_data_idx"] = test_idx
            result_cv.append(fold)
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)
    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            v = compute_metrics(config, info, fold_data.prediction_data,
                                fold_data.test_labels)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        os.makedirs(result_path, exist_ok=True)
        if config["make_plot"]:
            if config["task"] == "regression" or config["task"] == "regression_gmfe":
                make_cost_acc_plot(fold_data.training_cost, fold_data.validation_cost,
                                   fold_data.training_mse, fold_data.validation_mse,
                                   result_path, prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config, fold_data.test_labels, pred_score, prefix=prefix)
            elif config["task"] == "link_prediction":
                make_cost_acc_plot(fold_data.training_cost, fold_data.validation_cost,
                                   fold_data.training_acc, fold_data.validation_acc,
                                   result_path, prefix=prefix)
            else:
                make_cost_acc_plot(fold_data.training_cost, fold_data.validation_cost,
                                   fold_data.training_acc, fold_data.validation_acc,
                                   result_path, prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config, fold_data.test_labels, pred_score, prefix=prefix)
def train(sess, graph, config):
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config, filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(config, filename=config["validation_dataset"])
        # merge dataset statistics so that one model fits both datasets
        info["graph_node_num"] = max(info["graph_node_num"], valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    metric_name = ("mse" if config["task"] == "regression" else
                   "gmfe" if config["task"] == "regression_gmfe" else
                   "accuracy")
    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time: {train_time}[sec]")
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"validation time: {infer_time}[sec]\n")
        # Saving
        if config["save_info_valid"] is not None:
            result = {}
            result["validation_cost"] = valid_cost
            result["validation_accuracy"] = valid_metrics
            result["train_time"] = train_time
            result["infer_time"] = infer_time
            if config["task"] != "link_prediction":
                result["valid_metrics"] = compute_metrics(
                    config, info, prediction_data, valid_data.labels)
            save_path = config["save_info_valid"]
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            print(f"[SAVE] {save_path}")
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["export_model"]:
        try:
            print(f"[SAVE] {config['export_model']}")
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"], as_text=False)
        except Exception:
            print("[ERROR] the 'output' node was not found")
    if config["save_result_valid"] is not None:
        filename = config["save_result_valid"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        if config["task"] == "regression" or config["task"] == "regression_gmfe":
            # plot_cost(config, valid_data, model)
            plot_r2(config, valid_data.labels, np.array(prediction_data))
        elif config["task"] == "link_prediction":
            plot_cost(config, valid_data, model)
        else:
            plot_cost(config, valid_data, model)
            plot_auc(config, valid_data.labels, np.array(prediction_data))
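# The `config` passed to train() is a plain dict (typically loaded from a JSON
# file elsewhere in the package). A hedged example covering only the keys this
# function actually reads; all values are illustrative, not defaults:
example_train_config = {
    "dataset": "data/train.jbl",
    "validation_dataset": None,        # or a path; None => split "dataset"
    "validation_data_rate": 0.2,
    "model.py": "model",               # module name passed to load_model_py
    "task": "classification",          # or "regression", "regression_gmfe", ...
    "profile": False,
    "save_info_valid": "result/info.json",
    "save_result_valid": "result/valid.csv",
    "export_model": None,              # path for a frozen GraphDef, or None
    "make_plot": False,
}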
def train(sess, config):
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    model = importlib.import_module(config["model.py"])
    all_data, info = load_data(config, filename=config["dataset"])
    placeholders = model.build_placeholders(info, batch_size=batch_size,
                                            adj_channel_num=info.adj_channel_num)
    _, prediction, cost, cost_sum, metrics = model.build_model(
        placeholders, info, batch_size=batch_size,
        adj_channel_num=info.adj_channel_num,
        embedding_dim=config["embedding_dim"])
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    # Initialize session
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    # Train model
    all_list_num = len(info.graph_index_list)
    print("#graph_index_list =", all_list_num)
    print("#data =", all_data.num)
    data_idx = list(range(len(info.graph_index_list)))
    n = int(all_list_num * 0.8)
    train_idx = data_idx[:n]
    train_num = len(train_idx)
    valid_idx = data_idx[n:]
    valid_num = len(valid_idx)
    early_stopping = EarlyStopping(config)
    start_t = time.time()
    for epoch in range(config["epoch"]):
        np.random.shuffle(train_idx)
        # Training
        itr_num = int(np.ceil(train_num / batch_size))
        training_cost = 0
        training_correct_count = 0
        for itr in range(itr_num):
            offset_b = itr * batch_size
            batch_idx = train_idx[offset_b:offset_b + batch_size]
            feed_dict = construct_feed(batch_idx, placeholders, all_data,
                                       info.graph_index_list,
                                       batch_size=batch_size, dropout_rate=0.5)
            # Run a parameter update step
            out_prediction = sess.run([prediction], feed_dict=feed_dict)
            _, out_cost_sum, out_metrics = sess.run(
                [train_step, cost_sum, metrics], feed_dict=feed_dict)
            training_cost += out_cost_sum
            training_correct_count += out_metrics["correct_count"]
        training_cost /= train_num
        training_accuracy = training_correct_count / train_num
        # Validation
        itr_num = int(np.ceil(valid_num / batch_size))
        validation_cost = 0
        validation_correct_count = 0
        for itr in range(itr_num):
            offset_b = itr * batch_size
            batch_idx = valid_idx[offset_b:offset_b + batch_size]
            feed_dict = construct_feed(batch_idx, placeholders, all_data,
                                       info.graph_index_list, batch_size=batch_size)
            out_cost_sum, out_metrics = sess.run([cost_sum, metrics],
                                                 feed_dict=feed_dict)
            validation_cost += out_cost_sum
            validation_correct_count += out_metrics["correct_count"]
        validation_cost /= valid_num
        validation_accuracy = validation_correct_count / valid_num
        # Checkpoint
        save_path = None
        if epoch % config["save_interval"] == 0:
            save_path = config["save_model_path"] + "/model.%05d.ckpt" % epoch
            saver.save(sess, save_path)
        # Early stopping and progress reporting
        if early_stopping.evaluate_validation(validation_cost, {
                "epoch": epoch,
                "validation_accuracy": validation_accuracy,
                "validation_cost": validation_cost,
                "training_accuracy": training_accuracy,
                "training_cost": training_cost,
                "save_path": save_path}):
            break
    train_time = time.time() - start_t
    print("training time: {0}[sec]".format(train_time))
    # Save the last model
    if "save_model" in config and config["save_model"] is not None:
        save_path = config["save_model"]
        print("[SAVE]", save_path)
        saver.save(sess, save_path)
    # Evaluation over the whole dataset
    start_t = time.time()
    data_idx = list(range(all_data.num))
    itr_num = int(np.ceil(all_data.num / batch_size))
    validation_cost = 0
    validation_correct_count = 0
    prediction_data = []
    for itr in range(itr_num):
        offset_b = itr * batch_size
        batch_idx = data_idx[offset_b:offset_b + batch_size]
        feed_dict = construct_feed(batch_idx, placeholders, all_data,
                                   batch_size=batch_size)
        out_cost_sum, out_metrics, out_prediction = sess.run(
            [cost_sum, metrics, prediction], feed_dict=feed_dict)
        validation_cost += out_cost_sum
        validation_correct_count += out_metrics["correct_count"]
        prediction_data.append(out_prediction)
    validation_cost /= all_data.num
    validation_accuracy = validation_correct_count / all_data.num
    print("final cost =", validation_cost)
    print("accuracy =", validation_accuracy)
    infer_time = time.time() - start_t
    print("infer time: {0}[sec]".format(infer_time))
    if "save_result_train" in config:
        filename = config["save_result_train"]
        save_prediction(filename, prediction_data)
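# EarlyStopping is provided by the package. From the call site above, its
# evaluate_validation(cost, info_dict) returns True when training should stop.
# A minimal sketch of that interface -- an assumption about the behavior, not
# the actual class:
class EarlyStoppingSketch:
    def __init__(self, config):
        self.patience = config.get("patience", 0)  # 0 => never stop early
        self.best_cost = None
        self.wait = 0

    def evaluate_validation(self, validation_cost, info):
        print("epoch %(epoch)d: training cost= %(training_cost)f, "
              "validation cost= %(validation_cost)f" % info)
        if self.patience <= 0:
            return False
        if self.best_cost is None or validation_cost < self.best_cost:
            self.best_cost = validation_cost
            self.wait = 0
            return False
        self.wait += 1
        return self.wait >= self.patience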
def infer(sess, graph, config):
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)
    # Restore the trained model
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])
    # Test evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time: {0}[sec]".format(infer_time))
    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))
    if "save_edge_result_test" in config and config["save_edge_result_test"] is not None:
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        test_idx = all_data.test_data_idx
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            # each pair encodes two edges: (i1, r1, j1) and (i2, r2, j2)
            i1, _, j1, i2, _, j2 = pair
            s1 = pred_score[0, i1, j1]
            s2 = pred_score[0, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
def train_cv(sess, graph, config):
    from sklearn.model_selection import KFold, StratifiedKFold
    from kgcn.make_plots import make_auc_plot, make_cost_acc_plot
    import sklearn
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    all_data, info = load_data(config, filename=config["dataset"],
                               prohibit_shuffle=True)  # shuffling is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    # Training
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=config["shuffle_data"], random_state=123)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=config["shuffle_data"], random_state=123)
    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        split_base = np.argmax(split_base, axis=1)
    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"
    split_data_generator = (kf.split(split_base, split_base)
                            if config["stratified_kfold"] else kf.split(split_base))
    for train_valid_list, test_list in split_data_generator:
        print("starting fold: {0}".format(kf_count))
        train_valid_data, test_data = split_data(
            all_data, indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])
        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print("training time: {0}[sec]".format(train_time))
        # Validation
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", valid_cost)
        print("%s = %f" % (metric_name, valid_metrics[metric_name]))
        print("infer time: {0}[sec]".format(infer_time))
        # Test
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(test_data)
        infer_time = time.time() - start_t
        print("final cost =", test_cost)
        print("%s = %f" % (metric_name, test_metrics[metric_name]))
        score_metrics.append(test_metrics[metric_name])
        print("infer time: {0}[sec]".format(infer_time))
        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print("[SAVE]", filename)
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print("[ERROR] the 'output' node was not found")
        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)
        # Save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [el["training_mse"] for el in model.training_metrics_list]
            fold_data.validation_mse = [el["validation_mse"] for el in model.validation_metrics_list]
        elif config["task"] == "regression_gmfe":
            fold_data.training_mse = [el["training_gmfe"] for el in model.training_metrics_list]
            fold_data.validation_mse = [el["validation_gmfe"] for el in model.validation_metrics_list]
        else:
            fold_data.training_acc = [el["training_accuracy"] for el in model.training_metrics_list]
            fold_data.validation_acc = [el["validation_accuracy"] for el in model.validation_metrics_list]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1
    print("cv %s(mean) = %f" % (metric_name, np.mean(score_metrics)))
    print("cv %s(std.) = %f" % (metric_name, np.std(score_metrics)))
    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)
    if "save_edge_result_cv" in config and config["save_edge_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            test_idx = fold_data.test_data_idx
            score_list = []
            for pair in true_label[0]:
                # each pair encodes two edges: (i1, r1, j1) and (i2, r2, j2)
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
                score_list.append([s1, s2])
            fold = {}
            fold["output"] = output_data_list[j][0]
            fold["score"] = np.array(score_list)
            fold["test_data_idx"] = test_idx
            result_cv.append(fold)
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)
    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            if len(pred_score.shape) == 3:
                # multi-label multi-task: #data x #task x #class
                # => this program supports only 2 classes
                pred_score = pred_score[:, :, 1]
            true_label = np.array(fold_data.test_labels)  # #data x #task x #class
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]
            v = []
            for i in range(info.label_dim):
                el = {}
                if config["task"] == "regression":
                    el["r2"] = sklearn.metrics.r2_score(true_label[:, i], pred_score[:, i])
                    el["mse"] = sklearn.metrics.mean_squared_error(true_label[:, i], pred_score[:, i])
                elif config["task"] == "regression_gmfe":
                    el["gmfe"] = np.exp(np.mean(np.log(true_label[:, i] / pred_score[:, i])))
                else:
                    pred = np.zeros(pred_score.shape)
                    pred[pred_score > 0.5] = 1
                    fpr, tpr, _ = roc_curve(true_label[:, i], pred_score[:, i], pos_label=1)
                    roc_auc = auc(fpr, tpr)
                    acc = accuracy_score(true_label[:, i], pred[:, i])
                    scores = precision_recall_fscore_support(true_label[:, i], pred[:, i],
                                                             average='binary')
                    el["auc"] = roc_auc
                    el["acc"] = acc
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                v.append(el)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        print("[SAVE]", save_path)
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        os.makedirs(result_path, exist_ok=True)
        if config["make_plot"]:
            if config["task"] == "regression" or config["task"] == "regression_gmfe":
                # plot cost
                make_cost_acc_plot(fold_data.training_cost, fold_data.validation_cost,
                                   fold_data.training_mse, fold_data.validation_mse,
                                   result_path + prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config, fold_data.test_labels, pred_score, prefix=prefix)
            else:
                # plot cost
                make_cost_acc_plot(fold_data.training_cost, fold_data.validation_cost,
                                   fold_data.training_acc, fold_data.validation_acc,
                                   result_path + prefix)
                # plot AUC
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config, fold_data.test_labels, pred_score, prefix=prefix)
def infer(sess, graph, config):
    import sklearn.metrics
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)
    # Restore the trained model
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])
    # Test evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time: {0}[sec]".format(infer_time))
    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        # Per-label metrics
        pred_score = np.array(prediction_data)
        if len(pred_score.shape) == 3:
            # multi-label multi-task: #data x #task x #class
            # => this program supports only 2 classes
            pred_score = pred_score[:, :, 1]
        true_label = np.array(all_data.labels)  # #data x #task x #class
        if len(pred_score.shape) == 1:
            pred_score = pred_score[:, np.newaxis]
        if len(true_label.shape) == 1:
            true_label = true_label[:, np.newaxis]
        v = []
        for i in range(info.label_dim):
            el = {}
            if config["task"] == "regression":
                el["r2"] = sklearn.metrics.r2_score(true_label[:, i], pred_score[:, i])
                el["mse"] = sklearn.metrics.mean_squared_error(true_label[:, i], pred_score[:, i])
            elif config["task"] == "regression_gmfe":
                el["gmfe"] = np.exp(np.mean(np.log(true_label[:, i] / pred_score[:, i])))
            else:
                pred = np.zeros(pred_score.shape)
                pred[pred_score > 0.5] = 1
                fpr, tpr, _ = roc_curve(true_label[:, i], pred_score[:, i], pos_label=1)
                roc_auc = auc(fpr, tpr)
                acc = accuracy_score(true_label[:, i], pred[:, i])
                scores = precision_recall_fscore_support(true_label[:, i], pred[:, i],
                                                         average='binary')
                el["auc"] = roc_auc
                el["acc"] = acc
                el["pre"] = scores[0]
                el["rec"] = scores[1]
                el["f"] = scores[2]
                el["sup"] = scores[3]
            v.append(el)
        result["test_metrics"] = v  # all labels, not just the last one
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))
    if "save_edge_result_test" in config and config["save_edge_result_test"] is not None:
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        test_idx = all_data.test_data_idx
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            # each pair encodes two edges: (i1, r1, j1) and (i2, r2, j2)
            i1, _, j1, i2, _, j2 = pair
            s1 = pred_score[0, i1, j1]
            s2 = pred_score[0, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE]", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
def visualize(sess, config, args):
    from kgcn.visualization import cal_feature_IG, cal_feature_IG_for_kg
    # Inputs are fed one molecule at a time
    batch_size = 1
    # Load the whole dataset together with its graph-related metadata
    all_data, info = load_data(config, filename=config["dataset"], prohibit_shuffle=True)
    model = importlib.import_module(config["model.py"])
    try:
        # Visualization for models with an embedding layer: IG is computed
        # with respect to the output of the embedding layer.
        placeholders = model.build_placeholders(info, config, batch_size=batch_size,
                                                feed_embedded_layer=True)
    except TypeError:
        placeholders = model.build_placeholders(info, config, batch_size=batch_size)
    try:
        _model, prediction, _, _, _ = model.build_model(
            placeholders, info, config, batch_size=batch_size, feed_embedded_layer=True)
    except TypeError:
        _model, prediction, _, _, _ = model.build_model(
            placeholders, info, config, batch_size=batch_size)
    # Restore the trained model
    saver = tf.train.Saver()
    tf.logging.info("[LOAD] %s", config["load_model"])
    saver.restore(sess, config["load_model"])
    # Compute integrated gradients
    if config["visualize_type"] == "graph":
        cal_feature_IG(sess, all_data, placeholders, info, prediction,
                       args.ig_modal_target, args.ig_label_target,
                       logger=tf.logging, model=_model)
    else:
        cal_feature_IG_for_kg(sess, all_data, placeholders, info, config, prediction,
                              logger=tf.logging, model=_model)
def infer(sess, graph, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    if "test_label_list" in config:
        config["label_list"] = config["test_label_list"]
    all_data, info = load_data(config, filename=dataset_filename, prohibit_shuffle=True)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)
    metric_name = ("mse" if config["task"] == "regression" else
                   "gmfe" if config["task"] == "regression_gmfe" else
                   "accuracy")
    # Restore the trained model
    restore_ckpt(sess, config["load_model"])
    # Test evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print(f"final cost = {test_cost}\n"
          f"{metric_name} = {test_metrics[metric_name]}\n"
          f"infer time: {infer_time}[sec]\n")
    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        if config["task"] != "link_prediction":
            result["test_metrics"] = compute_metrics(config, info, prediction_data,
                                                     all_data.labels)
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        if config["task"] == "regression" or config["task"] == "regression_gmfe":
            pred_score = np.array(prediction_data)
            plot_r2(config, all_data.labels, pred_score)
        elif config["task"] == "link_prediction":
            pass
        else:
            plot_auc(config, all_data.labels, np.array(prediction_data))
    if "save_edge_result_test" in config and config["save_edge_result_test"] is not None:
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            # each pair encodes two edges: (i1, r1, j1) and (i2, r2, j2)
            if len(prediction_data[0].shape) == 2:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
            elif len(prediction_data[0].shape) == 3:
                i1, r1, j1, i2, r2, j2 = pair
                s1 = pred_score[0, r1, i1, j1]
                s2 = pred_score[0, r2, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
    if config["prediction_data"] is not None:
        obj = {}
        obj["prediction_data"] = prediction_data
        obj["labels"] = all_data.labels
        os.makedirs(os.path.dirname(config["prediction_data"]), exist_ok=True)
        joblib.dump(obj, config["prediction_data"], compress=True)
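# compute_metrics is defined elsewhere in the package. Judging from the older
# inline versions above (see the previous infer/train_cv), it likely computes
# per-label metrics along these lines; a sketch, not the actual function:
def compute_metrics_sketch(config, info, prediction_data, labels):
    from sklearn.metrics import (roc_curve, auc, accuracy_score,
                                 precision_recall_fscore_support,
                                 r2_score, mean_squared_error)
    pred_score = np.array(prediction_data)
    if len(pred_score.shape) == 3:
        # multi-label multi-task (#data x #task x #class): keep class-1 scores
        pred_score = pred_score[:, :, 1]
    true_label = np.array(labels)
    if len(pred_score.shape) == 1:
        pred_score = pred_score[:, np.newaxis]
    if len(true_label.shape) == 1:
        true_label = true_label[:, np.newaxis]
    v = []
    for i in range(info.label_dim):
        el = {}
        if config["task"] == "regression":
            el["r2"] = r2_score(true_label[:, i], pred_score[:, i])
            el["mse"] = mean_squared_error(true_label[:, i], pred_score[:, i])
        elif config["task"] == "regression_gmfe":
            el["gmfe"] = np.exp(np.mean(np.log(true_label[:, i] / pred_score[:, i])))
        else:
            pred = (pred_score[:, i] > 0.5).astype(int)
            fpr, tpr, _ = roc_curve(true_label[:, i], pred_score[:, i], pos_label=1)
            el["auc"] = auc(fpr, tpr)
            el["acc"] = accuracy_score(true_label[:, i], pred)
            pre, rec, f, sup = precision_recall_fscore_support(
                true_label[:, i], pred, average='binary')
            el["pre"], el["rec"], el["f"], el["sup"] = pre, rec, f, sup
        v.append(el)
    return v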
def train(sess, config):
    if config["validation_dataset"] is None:
        all_data, train_data, valid_data, info = load_and_split_data(
            config, filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(config, filename=config["validation_dataset"])
        # merge dataset statistics so that one model fits both datasets
        info["graph_node_num"] = max(info["graph_node_num"], valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]
    # Train model
    graph_index_list = []
    for i in range(info["graph_num"]):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(train_data)
    info.norm = get_norm(train_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")
    model = CoreModel(sess, config, info, construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"])
    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)
    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time: {train_time}[sec]")
    # Validation
    start_t = time.time()
    validation_cost, validation_accuracy, validation_prediction_data = model.pred_and_eval(valid_data)
    training_cost, training_accuracy, training_prediction_data = model.pred_and_eval(train_data)
    infer_time = time.time() - start_t
    print(f"final cost(training  ) = {training_cost}\n"
          f"accuracy  (training  ) = {training_accuracy['accuracy']}\n"
          f"final cost(validation) = {validation_cost}\n"
          f"accuracy  (validation) = {validation_accuracy['accuracy']}\n"
          f"infer time: {infer_time}[sec]\n")
    # Saving
    if config["save_info_valid"] is not None:
        result = {}
        result["validation_cost"] = validation_cost
        result["validation_accuracy"] = validation_accuracy["accuracy"]
        result["train_time"] = train_time
        result["infer_time"] = infer_time
        save_path = config["save_info_valid"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["save_info_train"] is not None:
        result = {}
        result["test_cost"] = training_cost
        result["test_accuracy"] = training_accuracy["accuracy"]
        result["train_time"] = train_time
        save_path = config["save_info_train"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if "reconstruction_valid" in config:
        filename = config["reconstruction_valid"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(validation_prediction_data, filename)
    if "reconstruction_train" in config:
        filename = config["reconstruction_train"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(training_prediction_data, filename)
def train(sess, graph, config):
    import sklearn.metrics
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config, filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(config, filename=config["validation_dataset"])
        # merge dataset statistics so that one model fits both datasets
        info["graph_node_num"] = max(info["graph_node_num"], valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        writer = tf.summary.FileWriter('logs', sess.graph)
    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("training time: {0}[sec]".format(train_time))
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print("accuracy =", validation_metrics["accuracy"])
        print("validation time: {0}[sec]".format(infer_time))
        # Saving
        if config["save_info_valid"] is not None:
            result = {}
            result["validation_cost"] = validation_cost
            result["validation_accuracy"] = validation_metrics
            result["train_time"] = train_time
            result["infer_time"] = infer_time
            # Per-label metrics
            pred_score = np.array(prediction_data)
            if len(pred_score.shape) == 3:
                # multi-label multi-task: #data x #task x #class
                # => this program supports only 2 classes
                pred_score = pred_score[:, :, 1]
            true_label = np.array(valid_data.labels)  # #data x #task x #class
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]
            v = []
            for i in range(info.label_dim):
                el = {}
                if config["task"] == "regression":
                    el["r2"] = sklearn.metrics.r2_score(true_label[:, i], pred_score[:, i])
                    el["mse"] = sklearn.metrics.mean_squared_error(true_label[:, i], pred_score[:, i])
                elif config["task"] == "regression_gmfe":
                    el["gmfe"] = np.exp(np.mean(np.log(true_label[:, i] / pred_score[:, i])))
                else:
                    pred = np.zeros(pred_score.shape)
                    pred[pred_score > 0.5] = 1
                    fpr, tpr, _ = roc_curve(true_label[:, i], pred_score[:, i], pos_label=1)
                    roc_auc = auc(fpr, tpr)
                    acc = accuracy_score(true_label[:, i], pred[:, i])
                    scores = precision_recall_fscore_support(true_label[:, i], pred[:, i],
                                                             average='binary')
                    el["auc"] = roc_auc
                    el["acc"] = acc
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                v.append(el)
            result["valid_metrics"] = v  # all labels, not just the last one
            save_path = config["save_info_valid"]
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            print("[SAVE]", save_path)
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)
    if config["export_model"]:
        try:
            print("[SAVE]", config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"], as_text=False)
        except Exception:
            print("[ERROR] the 'output' node was not found")
    if config["save_result_valid"] is not None:
        filename = config["save_result_valid"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_cost(config, valid_data, model)
        plot_auc(config, valid_data.labels, np.array(prediction_data))