Example #1
def test():
    logger.info("Loading Data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file)
    logger.info("Data padding...")
    test_dataset = dh.MyData(test_data, args.pad_seq_len, device)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)
    VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(
        args.word2vec_file)

    net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE,
                pretrained_word2vec_matrix).to(device)
    checkpoint_file = cm.get_best_checkpoint(CPT_DIR,
                                             select_maximum_value=False)
    checkpoint = torch.load(checkpoint_file)
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    logger.info("Scoring...")
    true_labels, predicted_scores = [], []
    batches = trange(len(test_loader), desc="Batches", leave=True)
    with torch.no_grad():  # no gradients are needed at test time
        for batch_cnt, batch in zip(batches, test_loader):
            x_test_fb_content, x_test_fb_question, x_test_fb_option, \
                x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch
            logits, scores = net(x_test_fb_content, x_test_fb_question,
                                 x_test_fb_option)
            true_labels.extend(y_test_fb[0].tolist())
            predicted_scores.extend(scores[0].tolist())

    # Calculate the Metrics
    test_rmse = mean_squared_error(true_labels, predicted_scores)**0.5
    test_r2 = r2_score(true_labels, predicted_scores)
    test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores)
    logger.info(
        "All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}".
        format(test_pcc, test_doa, test_rmse, test_r2))
    logger.info('Test Finished.')

    logger.info('Creating the prediction file...')
    dh.create_prediction_file(save_dir=SAVE_DIR,
                              identifiers=test_data['f_id'],
                              predictions=predicted_scores)

    logger.info('All Finished.')
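
The PCC and DOA figures above come from dh.evaluation, which is project-specific and not shown in this listing. Below is a minimal sketch of what it plausibly computes, assuming PCC is the Pearson correlation coefficient and DOA the "degree of agreement" (the fraction of label pairs whose predicted scores are ordered the same way as their true labels). The function name matches the call above, but the body and the O(n^2) pair loop are illustrative assumptions, not the repository's actual code.

import numpy as np

def evaluation(true_labels, predicted_scores):
    """Hypothetical sketch of dh.evaluation: returns (PCC, DOA)."""
    y = np.asarray(true_labels, dtype=float)
    s = np.asarray(predicted_scores, dtype=float)

    # Pearson correlation coefficient between true labels and predicted scores
    pcc = np.corrcoef(y, s)[0, 1]

    # Degree of agreement: among pairs with distinct true labels, the
    # fraction whose predicted scores preserve the true ordering
    concordant, total = 0, 0
    n = len(y)
    for i in range(n):
        for j in range(i + 1, n):
            if y[i] == y[j]:
                continue
            total += 1
            if (y[i] - y[j]) * (s[i] - s[j]) > 0:
                concordant += 1
    doa = concordant / total if total else 0.0
    return pcc, doa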
Example #2
def test():
    logger.info("Loading Data...")
    logger.info("Data processing...")

    test_data = dh.load_data_and_labels(args.test_file)

    test_dataset = dh.MyData(test_data.activity, test_data.timestep, test_data.labels)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=dh.collate_fn)

    # Load the course2vec model
    COURSE_SIZE = dh.course2vec(args.course2vec_file)

    net = MOOCNet(args, COURSE_SIZE).to(device)
    checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=True)  # checkpoints are ranked by F1, which is maximized
    checkpoint = torch.load(checkpoint_file)
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    logger.info("Scoring...")
    true_labels, predicted_scores, predicted_labels = [], [], []
    batches = trange(len(test_loader), desc="Batches", leave=True)
    with torch.no_grad():  # no gradients are needed at test time
        for batch_cnt, batch in zip(batches, test_loader):
            x_test, tsp_test, y_test = create_input_data(batch)
            logits, scores = net(x_test, tsp_test)
            true_labels.extend(y_test.tolist())
            for j in scores.tolist():
                predicted_scores.append(j)
                predicted_labels.append(1 if j >= 0.5 else 0)

    # Calculate the Metrics (same suite the eval_model in Example #4 computes)
    test_acc = accuracy_score(true_labels, predicted_labels)
    test_pre = precision_score(true_labels, predicted_labels)
    test_rec = recall_score(true_labels, predicted_labels)
    test_F1 = f1_score(true_labels, predicted_labels)
    test_auc = roc_auc_score(true_labels, predicted_scores)
    test_prc = average_precision_score(true_labels, predicted_scores)
    logger.info("All Test set: ACC {0:.4f} | PRE {1:.4f} | REC {2:.4f} | F1 {3:.4f} | AUC {4:.4f} | PRC {5:.4f}"
                .format(test_acc, test_pre, test_rec, test_F1, test_auc, test_prc))
    logger.info('Test Finished.')

    logger.info('Creating the prediction file...')
    dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data.id, predictions=predicted_labels)

    logger.info('All Finished.')
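
Both MOOCNet examples rely on dh.collate_fn and create_input_data, neither of which appears in the listing. Here is a minimal sketch under the assumption that activity and timestep are variable-length per-student sequences that must be padded to a common length within each batch; the dtypes, shapes, and the module-level device global are assumptions for illustration.

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Hypothetical sketch: pad a list of (activity, timestep, label) triples."""
    activities, timesteps, labels = zip(*batch)
    x = pad_sequence([torch.as_tensor(a, dtype=torch.long) for a in activities],
                     batch_first=True)    # [B, T] padded course-id sequences
    tsp = pad_sequence([torch.as_tensor(t, dtype=torch.float) for t in timesteps],
                       batch_first=True)  # [B, T] padded time steps
    y = torch.as_tensor(labels, dtype=torch.float)
    return x, tsp, y

def create_input_data(batch):
    """Hypothetical sketch: move the collated tensors to the target device."""
    x, tsp, y = batch
    return x.to(device), tsp.to(device), y.to(device)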
Example #3
def train():
    """Training RMIDP model."""
    dh.tab_printer(args, logger)

    # Load sentences, labels, and training parameters
    logger.info("Loading data...")
    logger.info("Data processing...")
    train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
    val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)

    logger.info("Data padding...")
    train_dataset = dh.MyData(train_data, args.pad_seq_len, device)
    val_dataset = dh.MyData(val_data, args.pad_seq_len, device)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    # Load word2vec model
    VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Init network
    logger.info("Init nn...")
    net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)

    print("Model's state_dict:")
    for param_tensor in net.state_dict():
        print(param_tensor, "\t", net.state_dict()[param_tensor].size())

    criterion = Loss()
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda)

    if OPTION == 'T':
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
        logger.info("Writing to {0}\n".format(out_dir))
    elif OPTION == 'R':
        timestamp = input("[Input] Please enter the timestamp of the checkpoint you want to restore: ")
        while not (timestamp.isdigit() and len(timestamp) == 10):
            timestamp = input("[Warning] The input format is invalid, please re-enter: ")
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
        logger.info("Writing to {0}\n".format(out_dir))
        checkpoint_file = cm.get_best_checkpoint(out_dir, select_maximum_value=False)
        checkpoint = torch.load(checkpoint_file)
        net.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    logger.info("Training...")
    writer = SummaryWriter('summary')

    def eval_model(val_loader, epoch):
        """
        Evaluate on the validation set.
        """
        net.eval()
        eval_loss = 0.0
        true_labels, predicted_scores = [], []
        with torch.no_grad():  # no gradients are needed for validation
            for batch in val_loader:
                x_val_fb_content, x_val_fb_question, x_val_fb_option, \
                    x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch

                logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option)
                avg_batch_loss = criterion(scores, y_val_fb)
                eval_loss = eval_loss + avg_batch_loss.item()
                true_labels.extend(y_val_fb[0].tolist())
                predicted_scores.extend(scores[0].tolist())

        # Calculate the Metrics
        eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
        eval_r2 = r2_score(true_labels, predicted_scores)
        eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores)
        eval_loss = eval_loss / len(val_loader)
        cur_value = eval_rmse
        logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}"
                    .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2))
        writer.add_scalar('validation loss', eval_loss, epoch)
        writer.add_scalar('validation PCC', eval_pcc, epoch)
        writer.add_scalar('validation DOA', eval_doa, epoch)
        writer.add_scalar('validation RMSE', eval_rmse, epoch)
        writer.add_scalar('validation R2', eval_r2, epoch)
        return cur_value

    for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
        # Training step
        batches = trange(len(train_loader), desc="Batches", leave=True)
        for batch_cnt, batch in zip(batches, train_loader):
            net.train()
            x_train_fb_content, x_train_fb_question, x_train_fb_option, \
            x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch

            optimizer.zero_grad()   # gradients accumulate across backward() calls unless zeroed
            logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option)
            avg_batch_loss = criterion(scores, y_train_fb)
            avg_batch_loss.backward()
            optimizer.step()    # Parameter updating
            batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item()))
            logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item()))
            writer.add_scalar('training loss', avg_batch_loss.item(),
                              epoch * len(train_loader) + batch_cnt)  # global step across epochs
        # Evaluation step
        cur_value = eval_model(val_loader, epoch)
        saver.handle(cur_value, net, optimizer, epoch)
    writer.close()

    logger.info('Training Finished.')
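
Both the RMIDP/HMIDP test() and train() call dh.load_word2vec_matrix. A minimal sketch, assuming args.word2vec_file is a gensim KeyedVectors model saved with the gensim 4.x API; the real helper may build the matrix differently (e.g. reserving index 0 for padding).

import numpy as np
from gensim.models import KeyedVectors

def load_word2vec_matrix(word2vec_file):
    """Hypothetical sketch: return (vocab size, embedding size, embedding matrix)."""
    kv = KeyedVectors.load(word2vec_file)
    vocab_size = len(kv)              # number of words in the vocabulary
    embedding_size = kv.vector_size   # dimensionality of each word vector
    matrix = np.zeros((vocab_size, embedding_size), dtype=np.float32)
    for word, idx in kv.key_to_index.items():
        matrix[idx] = kv[word]        # row idx holds the pretrained vector of the word
    return vocab_size, embedding_size, matrix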
Example #4
def train():
    """Training QuesNet model."""
    dh.tab_printer(args, logger)

    # Load sentences, labels, and training parameters
    logger.info("Loading data...")
    logger.info("Data processing...")
    train_data = dh.load_data_and_labels(args.train_file)
    val_data = dh.load_data_and_labels(args.validation_file)

    logger.info("Data padding...")
    train_dataset = dh.MyData(train_data.activity, train_data.timestep,
                              train_data.labels)
    val_dataset = dh.MyData(val_data.activity, val_data.timestep,
                            val_data.labels)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=dh.collate_fn)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=dh.collate_fn)

    # Load the course2vec model
    COURSE_SIZE = dh.course2vec(args.course2vec_file)

    # Init network
    logger.info("Init nn...")
    net = MOOCNet(args, COURSE_SIZE).to(device)

    # weights_init(model=net)
    # print_weight(model=net)

    print("Model's state_dict:")
    for param_tensor in net.state_dict():
        print(param_tensor, "\t", net.state_dict()[param_tensor].size())

    criterion = Loss()
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.l2_lambda)

    if OPTION == 'T':
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(
            os.path.join(os.path.curdir, "runs", timestamp))
        saver = cm.BestCheckpointSaver(save_dir=out_dir,
                                       num_to_keep=args.num_checkpoints,
                                       maximize=True)  # F1 is a maximized metric
        logger.info("Writing to {0}\n".format(out_dir))
    elif OPTION == 'R':
        timestamp = input(
            "[Input] Please enter the timestamp of the checkpoint you want to restore: ")
        while not (timestamp.isdigit() and len(timestamp) == 10):
            timestamp = input(
                "[Warning] The input format is invalid, please re-enter: ")
        out_dir = os.path.abspath(
            os.path.join(os.path.curdir, "runs", timestamp))
        saver = cm.BestCheckpointSaver(save_dir=out_dir,
                                       num_to_keep=args.num_checkpoints,
                                       maximize=True)  # F1 is a maximized metric
        logger.info("Writing to {0}\n".format(out_dir))
        checkpoint_file = cm.get_best_checkpoint(out_dir, select_maximum_value=True)
        checkpoint = torch.load(checkpoint_file)
        net.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    logger.info("Training...")
    writer = SummaryWriter('summary')

    def eval_model(val_loader, epoch):
        """
        Evaluate on the validation set.
        """
        net.eval()
        eval_loss = 0.0
        true_labels, predicted_scores, predicted_labels = [], [], []
        with torch.no_grad():  # no gradients are needed for validation
            for batch in val_loader:
                x_val, tsp_val, y_val = create_input_data(batch)
                logits, scores = net(x_val, tsp_val)
                avg_batch_loss = criterion(scores, y_val)
                eval_loss = eval_loss + avg_batch_loss.item()
                true_labels.extend(y_val.tolist())
                for j in scores.tolist():
                    predicted_scores.append(j)
                    predicted_labels.append(1 if j >= args.threshold else 0)

        # Calculate the Metrics
        eval_acc = accuracy_score(true_labels, predicted_labels)
        eval_pre = precision_score(true_labels, predicted_labels)
        eval_rec = recall_score(true_labels, predicted_labels)
        eval_F1 = f1_score(true_labels, predicted_labels)
        eval_auc = roc_auc_score(true_labels, predicted_scores)
        eval_prc = average_precision_score(true_labels, predicted_scores)
        eval_loss = eval_loss / len(val_loader)
        cur_value = eval_F1
        logger.info(
            "All Validation set: Loss {0:g} | ACC {1:.4f} | PRE {2:.4f} | REC {3:.4f} | F1 {4:.4f} | AUC {5:.4f} | PRC {6:.4f}"
            .format(eval_loss, eval_acc, eval_pre, eval_rec, eval_F1, eval_auc,
                    eval_prc))
        writer.add_scalar('validation loss', eval_loss, epoch)
        writer.add_scalar('validation ACC', eval_acc, epoch)
        writer.add_scalar('validation PRECISION', eval_pre, epoch)
        writer.add_scalar('validation RECALL', eval_rec, epoch)
        writer.add_scalar('validation F1', eval_F1, epoch)
        writer.add_scalar('validation AUC', eval_auc, epoch)
        writer.add_scalar('validation PRC', eval_prc, epoch)
        return cur_value

    for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
        # Training step
        batches = trange(len(train_loader), desc="Batches", leave=True)
        for batch_cnt, batch in zip(batches, train_loader):
            net.train()
            x_train, tsp_train, y_train = create_input_data(batch)
            optimizer.zero_grad()  # gradients accumulate across backward() calls unless zeroed
            logits, scores = net(x_train, tsp_train)
            # TODO
            avg_batch_loss = criterion(scores, y_train)
            avg_batch_loss.backward()
            optimizer.step()  # Parameter updating
            batches.set_description("Batches (Loss={:.4f})".format(
                avg_batch_loss.item()))
            logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(
                epoch + 1, batch_cnt, avg_batch_loss.item()))
            writer.add_scalar('training loss', avg_batch_loss.item(),
                              epoch * len(train_loader) + batch_cnt)  # global step across epochs
        # Evaluation step
        cur_value = eval_model(val_loader, epoch)
        saver.handle(cur_value, net, optimizer, epoch)
    writer.close()

    logger.info('Training Finished.')
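
Every example above depends on the checkpoint utilities in cm. A minimal sketch of the contract they appear to implement: handle() saves a checkpoint and keeps only the num_to_keep best ones ranked by the metric value, while get_best_checkpoint() returns the single best file from a directory. The filename scheme, and parsing the metric back out of the filename, are assumptions made for illustration.

import glob
import os
import torch

class BestCheckpointSaver:
    """Hypothetical sketch of cm.BestCheckpointSaver."""
    def __init__(self, save_dir, num_to_keep=5, maximize=True):
        self.save_dir, self.num_to_keep, self.maximize = save_dir, num_to_keep, maximize
        self.kept = []  # list of (metric value, checkpoint path)
        os.makedirs(save_dir, exist_ok=True)

    def handle(self, value, net, optimizer, epoch):
        path = os.path.join(self.save_dir, "model-{0}-{1:.4f}.pth".format(epoch, value))
        torch.save({'model_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': epoch, 'value': value}, path)
        self.kept.append((value, path))
        # Best first: descending when maximizing, ascending when minimizing
        self.kept.sort(key=lambda p: p[0], reverse=self.maximize)
        while len(self.kept) > self.num_to_keep:
            _, worst = self.kept.pop()  # drop the worst-ranked checkpoint
            os.remove(worst)

def get_best_checkpoint(checkpoint_dir, select_maximum_value=True):
    """Hypothetical sketch of cm.get_best_checkpoint: rank files by the metric in their name."""
    files = glob.glob(os.path.join(checkpoint_dir, "*.pth"))
    key = lambda f: float(f.rsplit("-", 1)[-1][:-4])  # strip ".pth", parse the value
    return max(files, key=key) if select_maximum_value else min(files, key=key)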