print("Tasks len:", len(raw_tasks)) if os.path.exists(DATA_DIR + "/" + datasource + "/id_index/processed_dataset_{}.txt".format( retweet_user_size)): with open( DATA_DIR + "/" + datasource + "/id_index/processed_dataset_{}.txt".format( retweet_user_size), "r") as f: dataset = f.read() dataset = eval(dataset) vocab_size = get_vocab_size(datasource) else: data_builder = DatasetBuilder(datasource, time_cutoff=None, only_binary=True) dataset = data_builder.create_dataset(dataset_type="id_index", standardize_features=True) vocab_size = data_builder.get_vocab_size() np.set_printoptions(threshold=1e6) with open( DATA_DIR + "/" + datasource + "/id_index/processed_dataset_{}.txt".format( retweet_user_size), "w") as f: f.write(str(dataset)) print("dataset size: {}".format(len(dataset))) # print("task ids shape:\n{}".format(tasks_ids.shape)) # split dataset for training and testing idxs = np.arange(0, len(raw_tasks)) train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed) val_idxs = train_idxs[-int(len(idxs) * topic_split_rate[1]):] train_idxs = train_idxs[:-int(len(idxs) * topic_split_rate[1])] # train_idxs, val_idxs, test_idxs = split_dataset(idxs, topic_split_rate, topic_task_nums, seed)
        except ValueError:
            pass
        if (i + 1) % 5 == 0:
            time_spent = time.time() - start
            progress = 100. * (i + 1) / total
            print(f"{progress:.2f} % DONE, in {time_spent:.2f} seconds. Total would be {time_spent * 100 / (progress * 60):.2f} mins")

    df = pd.DataFrame(data=df_data, columns=df_columns)
    df.label = df.label.astype('category')
    df.to_csv(f"seiz_dataset_{name}.csv", index=False)


if __name__ == "__main__":
    dataset_selected = 'twitter16'

    # Building a SEIZ dataset
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')

    dataset_selected = 'twitter15'
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')
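    # Example (illustrative, not in the original script): reload the last dumped
    # CSV to sanity-check the export; `reloaded` is a hypothetical name and the
    # label column is restored as categorical, mirroring the dump above.
    reloaded = pd.read_csv(f"seiz_dataset_{dataset_selected}_test.csv")
    reloaded.label = reloaded.label.astype('category')
    print("reloaded", len(reloaded), "rows with labels", list(reloaded.label.cat.categories))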
    retweet_user_size,
    seed
)
if not os.path.exists(dst_folder):
    os.makedirs(dst_folder)

# ----------------------------------------------
# Load dataset
# ----------------------------------------------
# load raw dataset
raw_dataset = load_raw_dataset(DATA_ROOT_PATH)

# build dataset with preprocessing
# parameter setting
data_builder = DatasetBuilder(raw_dataset, retweet_user_size)
dataset, topic_index = data_builder.create_dataset()
print('Topics in dataset: {}'.format(topic_index.keys()))
print('Dataset size: {}'.format(len(dataset)))

# raw_tasks: [{0: [t_ids], 1: [t_ids]}, ...]
raw_tasks = [topic_index[topic] for topic in topic_index.keys()]
task_sizes = []
print("scaled task distribution:")
for task in raw_tasks:
    print([len(task[key]) for key in task.keys()])
    task_sizes.append(sum([len(task[key]) for key in task.keys()]))
task_sizes = np.array(task_sizes)

# split tasks in the dataset for training and testing
idxs = np.arange(0, len(raw_tasks))
train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed)
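# Illustrative sketch: split_dataset is defined elsewhere in the repository; from
# its call sites above it appears to shuffle the indices with the given seed and
# hold out a `rate` fraction. The name below is hypothetical so it does not
# shadow the real implementation, and it is never called here.
def split_dataset_sketch(idxs, rate, seed):
    # Shuffle deterministically, then set aside the first `rate` fraction.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(idxs)
    n_holdout = int(len(shuffled) * rate)
    return shuffled[n_holdout:], shuffled[:n_holdout]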
def train(dataset, args):
    on_gpu = torch.cuda.is_available()
    if on_gpu:
        print("Using gpu")

    # Loading dataset
    time_cutoff = None if args.time_cutoff == "None" else int(args.time_cutoff)
    dataset_builder = DatasetBuilder(dataset, only_binary=args.only_binary,
                                     features_to_consider=args.features,
                                     time_cutoff=time_cutoff, seed=args.seed)
    datasets = dataset_builder.create_dataset(standardize_features=args.standardize,
                                              on_gpu=on_gpu,
                                              oversampling_ratio=args.oversampling_ratio)
    train_data_loader = torch_geometric.data.DataLoader(
        datasets["train"], batch_size=args.batch_size, shuffle=True)
    val_data_loader = torch_geometric.data.DataLoader(
        datasets["val"], batch_size=args.batch_size, shuffle=True)
    test_data_loader = torch_geometric.data.DataLoader(
        datasets["test"], batch_size=args.batch_size, shuffle=True)

    print("Number of node features", dataset_builder.num_node_features)
    print("Dimension of hidden space", args.hidden_dim)

    # Setting up model
    model = GNNStack(dataset_builder.num_node_features, args.hidden_dim,
                     dataset_builder.num_classes, args)
    # model = GNNStack(dataset.num_node_features, 32, dataset.num_classes, args)
    if on_gpu:
        model.cuda()

    # Tensorboard logging
    log_dir = os.path.join("logs", args.exp_name)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    train_writer = SummaryWriter(os.path.join(log_dir, "train"))
    val_writer = SummaryWriter(os.path.join(log_dir, "val"))
    test_writer = SummaryWriter(os.path.join(log_dir, "test"))

    # CSV logging
    csv_logging = []

    # Checkpoints
    checkpoint_dir = os.path.join("checkpoints", args.exp_name)
    checkpoint_path = os.path.join(checkpoint_dir, "model.pt")
    if args.exp_name == "default" or not os.path.isfile(checkpoint_path):
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        epoch_ckp = 0
        global_step = 0
        best_val_acc = 0
    else:
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model_state_dict"])
        epoch_ckp = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        best_val_acc = checkpoint["best_val_acc"]
        print("Restoring previous model at epoch", epoch_ckp)

    # Training phase
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=5e-4)
    for epoch in range(epoch_ckp, epoch_ckp + args.num_epochs):
        model.train()
        epoch_loss = 0
        for batch in train_data_loader:
            # print(batch)
            # import pdb; pdb.set_trace()
            optimizer.zero_grad()
            out = model(batch)
            loss = F.nll_loss(out, batch.y)
            epoch_loss += loss.sum().item()

            # Optimization
            loss.backward()
            optimizer.step()

            # TFBoard logging
            train_writer.add_scalar("loss", loss.mean(), global_step)
            global_step += 1
        print("epoch", epoch, "loss:", epoch_loss / len(train_data_loader))

        if epoch % 1 == 0:
            # Evaluation on the training set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in train_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            train_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            train_writer.add_scalar("Accuracy", train_acc, epoch)
            for i in range(dataset_builder.num_classes):
train_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) train_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Training accuracy: {:.4f}'.format(train_acc)) # Evaluation on the validation set model.eval() correct = 0 n_samples = 0 samples_per_label = np.zeros(dataset_builder.num_classes) pred_per_label = np.zeros(dataset_builder.num_classes) correct_per_label = np.zeros(dataset_builder.num_classes) with torch.no_grad(): for batch in val_data_loader: _, pred = model(batch).max(dim=1) correct += float(pred.eq(batch.y).sum().item()) for i in range(dataset_builder.num_classes): batch_i = batch.y.eq(i) pred_i = pred.eq(i) samples_per_label[i] += batch_i.sum().item() pred_per_label[i] += pred_i.sum().item() correct_per_label[i] += (batch_i * pred_i).sum().item() n_samples += len(batch.y) val_acc = correct / n_samples acc_per_label = correct_per_label / samples_per_label rec_per_label = correct_per_label / pred_per_label val_writer.add_scalar("Accuracy", val_acc, epoch) for i in range(dataset_builder.num_classes): val_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) val_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Validation accuracy: {:.4f}'.format(val_acc)) # Evaluation on the test set model.eval() correct = 0 n_samples = 0 samples_per_label = np.zeros(dataset_builder.num_classes) pred_per_label = np.zeros(dataset_builder.num_classes) correct_per_label = np.zeros(dataset_builder.num_classes) with torch.no_grad(): for batch in test_data_loader: _, pred = model(batch).max(dim=1) correct += float(pred.eq(batch.y).sum().item()) for i in range(dataset_builder.num_classes): batch_i = batch.y.eq(i) pred_i = pred.eq(i) samples_per_label[i] += batch_i.sum().item() pred_per_label[i] += pred_i.sum().item() correct_per_label[i] += (batch_i * pred_i).sum().item() n_samples += len(batch.y) test_acc = correct / n_samples acc_per_label = correct_per_label / samples_per_label rec_per_label = correct_per_label / pred_per_label test_writer.add_scalar("Accuracy", test_acc, epoch) for i in range(dataset_builder.num_classes): test_writer.add_scalar("Accuracy_{}".format(i), acc_per_label[i], epoch) test_writer.add_scalar("Recall_{}".format(i), rec_per_label[i], epoch) print('Test accuracy: {:.4f}'.format(test_acc)) if val_acc > best_val_acc: best_val_acc = val_acc # Saving model if model is better checkpoint = { "epoch": epoch, "model_state_dict": model.state_dict(), "epoch_loss": epoch_loss / len(train_data_loader), "global_step": global_step, "best_val_acc": best_val_acc } torch.save(checkpoint, checkpoint_path) dict_logging = vars(args).copy() dict_logging["train_acc"] = train_acc dict_logging["val_acc"] = val_acc dict_logging["test_acc"] = test_acc csv_logging.append(dict_logging) csv_exists = os.path.exists("results.csv") header = dict_logging.keys() with open("results.csv", "a") as csv_file: writer = csv.DictWriter(csv_file, fieldnames=header) if not csv_exists: writer.writeheader() for dict_ in csv_logging: writer.writerow(dict_) return
        #     }
        #     torch.save(checkpoint, checkpoint_path)
        print("epoch", epoch, "loss:", epoch_loss / len(train_loader))

    return max_running_mean


if __name__ == "__main__":
    args = parser.parse_args()

    # Loading dataset
    dataset_builder = DatasetBuilder(args.dataset, only_binary=True, time_cutoff=1500)
    full_dataset = dataset_builder.create_dataset(dataset_type="sequential", standardize_features=False)
    val_dataset = full_dataset['val']
    if args.debug:
        train_dataset = val_dataset
    else:
        train_dataset = full_dataset['train']
    train_dataset = seq_data_to_dataset(train_dataset, cap_len=args.cap_len, num_features=11, standardize=True)
    val_dataset = seq_data_to_dataset(val_dataset, cap_len=args.cap_len, num_features=11, standardize=True)
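    # Sketch (assumption: seq_data_to_dataset returns a torch Dataset of padded,
    # length-capped sequences). The batching step that presumably follows would
    # look like this; the loader names and the batch size of 32 are illustrative.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)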