def test(): logger.info("Loading Data...") logger.info("Data processing...") test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file) logger.info("Data padding...") test_dataset = dh.MyData(test_data, args.pad_seq_len, device) test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix( args.word2vec_file) criterion = Loss() net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) checkpoint = torch.load(checkpoint_file) net.load_state_dict(checkpoint['model_state_dict']) net.eval() logger.info("Scoring...") true_labels, predicted_scores = [], [] batches = trange(len(test_loader), desc="Batches", leave=True) for batch_cnt, batch in zip(batches, test_loader): x_test_fb_content, x_test_fb_question, x_test_fb_option, \ x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option) for i in y_test_fb[0].tolist(): true_labels.append(i) for j in scores[0].tolist(): predicted_scores.append(j) # Calculate the Metrics test_rmse = mean_squared_error(true_labels, predicted_scores)**0.5 test_r2 = r2_score(true_labels, predicted_scores) test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores) logger.info( "All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}". format(test_pcc, test_doa, test_rmse, test_r2)) logger.info('Test Finished.') logger.info('Creating the prediction file...') dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores) logger.info('All Finished.')
def test(): logger.info("Loading Data...") logger.info("Data processing...") test_data = dh.load_data_and_labels(args.test_file) test_dataset = dh.MyData(test_data.activity, test_data.timestep, test_data.labels) test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=dh.collate_fn) # Load word2vec model COURSE_SIZE = dh.course2vec(args.course2vec_file) criterion = Loss() net = MOOCNet(args, COURSE_SIZE).to(device) checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) checkpoint = torch.load(checkpoint_file) net.load_state_dict(checkpoint['model_state_dict']) net.eval() logger.info("Scoring...") true_labels, predicted_scores, predicted_labels = [], [], [] batches = trange(len(test_loader), desc="Batches", leave=True) for batch_cnt, batch in zip(batches, test_loader): x_test, tsp_test, y_test = create_input_data(batch) logits, scores = net(x_test, tsp_test) for i in y_test.tolist(): true_labels.append(i) for j in scores.tolist(): predicted_scores.append(j) if j >= 0.5: predicted_labels.append(1) else: predicted_labels.append(0) # Calculate the Metrics logger.info('Test Finished.') logger.info('Creating the prediction file...') dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data.id, predictions=predicted_labels) logger.info('All Finished.')
def train(): """Training RMIDP model.""" dh.tab_printer(args, logger) # Load sentences, labels, and training parameters logger.info("Loading data...") logger.info("Data processing...") train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file) val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file) logger.info("Data padding...") train_dataset = dh.MyData(train_data, args.pad_seq_len, device) val_dataset = dh.MyData(val_data, args.pad_seq_len, device) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) # Load word2vec model VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) # Init network logger.info("Init nn...") net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) print("Model's state_dict:") for param_tensor in net.state_dict(): print(param_tensor, "\t", net.state_dict()[param_tensor].size()) criterion = Loss() optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) if OPTION == 'T': timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) logger.info("Writing to {0}\n".format(out_dir)) elif OPTION == 'R': timestamp = input("[Input] Please input the checkpoints model you want to restore: ") while not (timestamp.isdigit() and len(timestamp) == 10): timestamp = input("[Warning] The format of your input is illegal, please re-input: ") out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) logger.info("Writing to {0}\n".format(out_dir)) checkpoint = torch.load(out_dir) net.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) logger.info("Training...") writer = SummaryWriter('summary') def eval_model(val_loader, epoch): """ Evaluate on the validation set. 
""" net.eval() eval_loss = 0.0 true_labels, predicted_scores = [], [] for batch in val_loader: x_val_fb_content, x_val_fb_question, x_val_fb_option, \ x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option) avg_batch_loss = criterion(scores, y_val_fb) eval_loss = eval_loss + avg_batch_loss.item() for i in y_val_fb[0].tolist(): true_labels.append(i) for j in scores[0].tolist(): predicted_scores.append(j) # Calculate the Metrics eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 eval_r2 = r2_score(true_labels, predicted_scores) eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores) eval_loss = eval_loss / len(val_loader) cur_value = eval_rmse logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}" .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2)) writer.add_scalar('validation loss', eval_loss, epoch) writer.add_scalar('validation PCC', eval_pcc, epoch) writer.add_scalar('validation DOA', eval_doa, epoch) writer.add_scalar('validation RMSE', eval_rmse, epoch) writer.add_scalar('validation R2', eval_r2, epoch) return cur_value for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): # Training step batches = trange(len(train_loader), desc="Batches", leave=True) for batch_cnt, batch in zip(batches, train_loader): net.train() x_train_fb_content, x_train_fb_question, x_train_fb_option, \ x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option) avg_batch_loss = criterion(scores, y_train_fb) avg_batch_loss.backward() optimizer.step() # Parameter updating batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item())) logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item())) writer.add_scalar('training loss', avg_batch_loss, batch_cnt) # Evaluation step cur_value = eval_model(val_loader, epoch) saver.handle(cur_value, net, optimizer, epoch) writer.close() logger.info('Training Finished.')
def train(): """Training QuesNet model.""" dh.tab_printer(args, logger) # Load sentences, labels, and training parameters logger.info("Loading data...") logger.info("Data processing...") train_data = dh.load_data_and_labels(args.train_file) val_data = dh.load_data_and_labels(args.validation_file) logger.info("Data padding...") train_dataset = dh.MyData(train_data.activity, train_data.timestep, train_data.labels) val_dataset = dh.MyData(val_data.activity, val_data.timestep, val_data.labels) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=dh.collate_fn) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=dh.collate_fn) # Load word2vec model COURSE_SIZE = dh.course2vec(args.course2vec_file) # Init network logger.info("Init nn...") net = MOOCNet(args, COURSE_SIZE).to(device) # weights_init(model=net) # print_weight(model=net) print("Model's state_dict:") for param_tensor in net.state_dict(): print(param_tensor, "\t", net.state_dict()[param_tensor].size()) criterion = Loss() optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) if OPTION == 'T': timestamp = str(int(time.time())) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) logger.info("Writing to {0}\n".format(out_dir)) elif OPTION == 'R': timestamp = input( "[Input] Please input the checkpoints model you want to restore: ") while not (timestamp.isdigit() and len(timestamp) == 10): timestamp = input( "[Warning] The format of your input is illegal, please re-input: " ) out_dir = os.path.abspath( os.path.join(os.path.curdir, "runs", timestamp)) saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) logger.info("Writing to {0}\n".format(out_dir)) checkpoint = torch.load(out_dir) net.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) logger.info("Training...") writer = SummaryWriter('summary') def eval_model(val_loader, epoch): """ Evaluate on the validation set. 
""" net.eval() eval_loss = 0.0 true_labels, predicted_scores, predicted_labels = [], [], [] for batch in val_loader: x_val, tsp_val, y_val = create_input_data(batch) logits, scores = net(x_val, tsp_val) avg_batch_loss = criterion(scores, y_val) eval_loss = eval_loss + avg_batch_loss.item() for i in y_val.tolist(): true_labels.append(i) for j in scores.tolist(): predicted_scores.append(j) if j >= args.threshold: predicted_labels.append(1) else: predicted_labels.append(0) # Calculate the Metrics eval_acc = accuracy_score(true_labels, predicted_labels) eval_pre = precision_score(true_labels, predicted_labels) eval_rec = recall_score(true_labels, predicted_labels) eval_F1 = f1_score(true_labels, predicted_labels) eval_auc = roc_auc_score(true_labels, predicted_scores) eval_prc = average_precision_score(true_labels, predicted_scores) eval_loss = eval_loss / len(val_loader) cur_value = eval_F1 logger.info( "All Validation set: Loss {0:g} | ACC {1:.4f} | PRE {2:.4f} | REC {3:.4f} | F1 {4:.4f} | AUC {5:.4f} | PRC {6:.4f}" .format(eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc, eval_prc)) writer.add_scalar('validation loss', eval_loss, epoch) writer.add_scalar('validation ACC', eval_acc, epoch) writer.add_scalar('validation PRECISION', eval_pre, epoch) writer.add_scalar('validation RECALL', eval_rec, epoch) writer.add_scalar('validation F1', eval_F1, epoch) writer.add_scalar('validation AUC', eval_auc, epoch) writer.add_scalar('validation PRC', eval_prc, epoch) return cur_value for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): # Training step batches = trange(len(train_loader), desc="Batches", leave=True) for batch_cnt, batch in zip(batches, train_loader): net.train() x_train, tsp_train, y_train = create_input_data(batch) optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 logits, scores = net(x_train, tsp_train) # TODO avg_batch_loss = criterion(scores, y_train) avg_batch_loss.backward() optimizer.step() # Parameter updating batches.set_description("Batches (Loss={:.4f})".format( avg_batch_loss.item())) logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format( epoch + 1, batch_cnt, avg_batch_loss.item())) writer.add_scalar('training loss', avg_batch_loss, batch_cnt) # Evaluation step cur_value = eval_model(val_loader, epoch) saver.handle(cur_value, net, optimizer, epoch) writer.close() logger.info('Training Finished.')