Example No. 1
def predict():
    test_contents, test_labels = load_corpus('./dataset/test.txt',
                                             word2id,
                                             max_sen_len=50)
    # Load the test set
    test_dataset = TensorDataset(
        torch.from_numpy(test_contents).type(torch.float),
        torch.from_numpy(test_labels).type(torch.long))
    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=2)
    # Load the trained model
    model = TextCNN(config)
    model.load_state_dict(torch.load(config.model_path))
    model.eval()
    model.to(device)

    # Test loop
    count, correct = 0, 0
    for _, (batch_x, batch_y) in enumerate(test_dataloader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        output = model(batch_x)
        # correct += (output.argmax(1) == batch_y).float().sum().item()
        correct += (output.argmax(1) == batch_y).sum().item()
        count += len(batch_x)

    # Print the accuracy
    print('test accuracy is {:.2f}%.'.format(100 * correct / count))
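
Every example on this page assumes a TextCNN class defined elsewhere in its repository. For reference, a minimal Kim-style TextCNN looks roughly like the sketch below; the config field names (vocab_size, embed_dim, num_filters, kernel_sizes, dropout, num_classes) are assumptions, not the fields of any particular example's config. Note that Example 1 casts its inputs to float, so its model evidently consumes pre-vectorized text; this sketch shows the more common word-id variant.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Minimal sketch of a Kim (2014) TextCNN; all field names are illustrative."""

    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(config.embed_dim, config.num_filters, k)
            for k in config.kernel_sizes])           # e.g. [3, 4, 5]
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.kernel_sizes),
                            config.num_classes)

    def forward(self, x):                            # x: (batch, seq_len) word ids
        emb = self.embedding(x).transpose(1, 2)      # (batch, embed_dim, seq_len)
        feats = [F.relu(conv(emb)) for conv in self.convs]
        pooled = [F.max_pool1d(f, f.size(2)).squeeze(2) for f in feats]
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))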
Example No. 2
def test():
    # Configuration file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)

    # Pre-trained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # Model
    model = TextCNN(cf, torch.tensor(embedding_matrix))

    # model.load_state_dict(torch.load("./output/model.bin",map_location='cpu'))
    model.load_state_dict(torch.load("./output/model.bin"))
    # Move the model to the target device
    model.to(device)

    # Run the model in parallel across GPUs when several are available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    
    # Time the test run
    start_time = time.time()

    data_len = len(test_dataloader)

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    for step, batch in enumerate(tqdm(test_dataloader, "batch", total=len(test_dataloader))):
        
        label_id = batch['label_id'].squeeze(1).to(device) 
        segment_ids = batch['segment_ids'].to(device) 
        with torch.no_grad():
            pred = model.get_labels(segment_ids)
        y_pred = np.hstack((y_pred,pred))
        y_test = np.hstack((y_test,label_id.to("cpu").numpy()))

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=get_labels('./data/label')))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
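
Example 2 hands the model a pre-trained matrix via torch.tensor(embedding_matrix). Inside such a model the matrix is typically wired into the embedding layer with nn.Embedding.from_pretrained (a standard PyTorch API); the vocabulary size and dimensions below are made up for illustration:

import torch
import torch.nn as nn

embedding_matrix = torch.randn(10000, 300)     # hypothetical 10k x 300 vectors

# freeze=False keeps the vectors trainable; padding_idx excludes row 0
# from gradient updates.
embedding = nn.Embedding.from_pretrained(embedding_matrix,
                                         freeze=False,
                                         padding_idx=0)

token_ids = torch.tensor([[1, 5, 42, 0, 0]])   # (batch=1, seq_len=5)
vectors = embedding(token_ids)                 # -> (1, 5, 300)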
Example No. 3
def evaluate():
    # test
    model = TextCNN(config)
    model.cuda()
    saved_model = torch.load(config.save_model)
    model.load_state_dict(saved_model["state_dict"])
    print(
        "epoch:%s steps:%s best_valid_acc:%s" %
        (saved_model["epoch"], saved_model["steps"], saved_model["valid_acc"]))

    test_loss, test_acc, cm = test(config.test)
    print(
        f"\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)")

    print_confusion_matrix(cm, list(id2label.values()))
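
print_confusion_matrix is a helper that Example 3 does not show. A plausible minimal version, assuming it simply aligns sklearn's confusion matrix with the label names, would be:

def print_confusion_matrix(cm, label_names):
    """Hypothetical pretty-printer for an sklearn confusion matrix."""
    width = max(len(name) for name in label_names) + 2
    print(' ' * width + ''.join(name.rjust(width) for name in label_names))
    for name, row in zip(label_names, cm):
        print(name.rjust(width) + ''.join(str(int(v)).rjust(width) for v in row))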
Example No. 4
def train():
    train_contents, train_labels = load_corpus('./dataset/train.txt',
                                               word2id,
                                               max_sen_len=50)
    val_contents, val_labels = load_corpus('./dataset/validation.txt',
                                           word2id,
                                           max_sen_len=50)
    # Merge the training and validation sets
    contents = np.vstack([train_contents, val_contents])
    labels = np.concatenate([train_labels, val_labels])
    # Build the training dataset and loader
    train_dataset = TensorDataset(
        torch.from_numpy(contents).type(torch.float),
        torch.from_numpy(labels).type(torch.long))
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=2)
    model = TextCNN(config)
    if config.model_path:
        model.load_state_dict(torch.load(config.model_path))
    model.to(device)
    # Set up the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # Set up the loss function
    criterion = nn.CrossEntropyLoss()
    # Training loop
    for epoch in range(config.epochs):
        for batch_idx, (batch_x, batch_y) in enumerate(train_dataloader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            if batch_idx % 200 == 0 and config.verbose:
                print("Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}".format(
                    epoch + 1, batch_idx * len(batch_x),
                    len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # Save the model
    torch.save(model.state_dict(), './models/model.pth')
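
load_corpus is shared by Examples 1 and 4 but not shown. A hedged sketch that matches the call sites (path, word2id, max_sen_len) follows; the '<label><TAB><space-separated tokens>' file format is an assumption:

import numpy as np

def load_corpus(path, word2id, max_sen_len=50):
    """Hypothetical loader: one tab-separated '<label> <tokens>' sample
    per line, returned as padded id arrays."""
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            ids = [word2id.get(tok, 0) for tok in text.split()][:max_sen_len]
            ids += [0] * (max_sen_len - len(ids))   # pad with id 0
            contents.append(ids)
            labels.append(int(label))
    return np.array(contents), np.array(labels)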
Example No. 5
net = TextCNN(args)

output = None
dl_output = None
ml_output = None

FEATURE_LABEL = [
    "PROJECT_NAME", "BUSINESS_UNIT", "REGION_ID", "REP_OFFICE_ID",
    "CUSTOMER_ID", "PROJECT_LEVEL_NAME", "BUSINESS_GROUP_NAME",
    "DELIVERY_TYPE", "PROJECT_LABEL"
]

# Deep Learning
if args.snapshot is not None:
    net.load_state_dict(torch.load(args.snapshot))

    net.eval()
    feature = []
    for label in FEATURE_LABEL:
        text = getattr(args, label)
        text = text_fields.preprocess(text)
        text = [[text_fields.vocab.stoi[x] for x in text]]
        x = text_fields.tensor_type(text)
        x = autograd.Variable(x, volatile=True)  # pre-0.4 PyTorch inference API
        feature.append(x)

    dl_output = net(feature).int().squeeze(0).tolist()

# Machine Learning
if args.machine_learning_model is not None:
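
Example 5 relies on the pre-0.4 autograd.Variable(..., volatile=True) idiom, which current PyTorch has removed. A sketch of the equivalent inference path on modern PyTorch, keeping the snippet's own names (net, args, text_fields, FEATURE_LABEL), would be:

import torch

net.eval()
feature = []
for label in FEATURE_LABEL:
    text = getattr(args, label)
    text = text_fields.preprocess(text)
    text = [[text_fields.vocab.stoi[x] for x in text]]
    feature.append(torch.tensor(text, dtype=torch.long))

with torch.no_grad():                    # replaces volatile=True
    dl_output = net(feature).int().squeeze(0).tolist()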
Example No. 6
import pickle

import torch
from sanic import Sanic
from sanic.request import Request
from torchtext.data.utils import get_tokenizer

from text_processor import TextProcessor
from torch_config import EMBEDDINGS_DIR

app = Sanic('PyTorch API')

embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)

device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)


@app.post('/game')
async def game(request: Request):
    q = request.form.get('q', None)

    if q is None:
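
The handler above is cut off at the None check. Below is a hedged sketch of how such an endpoint typically finishes; the text_processing.process call and the response shape are assumptions, not the snippet's actual code:

from sanic.response import json as json_response

@app.post('/game_sketch')                # hypothetical sibling endpoint
async def game_sketch(request: Request):
    q = request.form.get('q', None)
    if q is None:
        return json_response({'error': 'missing q'}, status=400)
    token_ids = text_processing.process(q)          # assumed TextProcessor API
    with torch.no_grad():
        logits = model(token_ids.unsqueeze(0))      # add a batch dimension
    return json_response({'class': int(logits.argmax(1))})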
Example No. 7
        i += 1

        msg = ('epoch: %d, [iter: %d / all %d], class_loss: %f, '
               'domain_s_loss: %f, domain_t_loss: %f'
               % (epoch, i, len_dataloader, class_loss.cpu().data.numpy(),
                  domain_s_loss.cpu().data.numpy(),
                  domain_t_loss.cpu().data.numpy()))
        print(msg)
        logging.info(msg)

        checkpoint_path = 'checkpoint/WithoutImage_' + str(epoch + 1) + '.pkl'
        torch.save(model.state_dict(), checkpoint_path)

# test
model = TextCNN(args, W)
model.load_state_dict(torch.load(checkpoint_path))
if torch.cuda.is_available():
    model.cuda()
model.eval()
test_sub = np.zeros((len(label_df['id']), 3), dtype=float)  # np.float was removed in NumPy 1.24
batch = len(label_df['id']) // args.batch_size

for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])

    test_text = test_text.long()
    test_mask = test_mask.float()
    test_outputs, domain_outputs = model(test_text, test_mask)
    if i != batch:
        test_sub[i * args.batch_size:(i + 1) *
                 args.batch_size, :] = to_np(test_outputs)
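
to_var and to_np are not defined in Example 7; in EANN-style repositories they are usually thin wrappers along these lines (a sketch, not the snippet's actual definitions):

import torch

def to_var(x):
    """Move a tensor to the GPU when one is available."""
    return x.cuda() if torch.cuda.is_available() else x

def to_np(x):
    """Detach a tensor and return it as a NumPy array."""
    return x.detach().cpu().numpy()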
Example No. 8
            label = label.squeeze()  # Variable wrappers are no longer needed
            out = model(data)
            l2_loss = config.l2 * torch.sum(
                torch.pow(list(model.parameters())[1], 2))
            loss = criterion(out, label.long()) + l2_loss
            loss_sum += loss.data.item()
            count += 1
            if count % 100 == 0:
                print("epoch", epoch, end='  ')
                print("The loss is: %.5f" % (loss_sum / 100))
                loss_sum = 0
                count = 0
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # save the model in every epoch
        # After each epoch, evaluate on the validation set
        valid_loss, valid_acc = get_test_result(valid_iter, valid_set)
        early_stopping(valid_loss, model)
        print("The valid acc is: %.5f" % valid_acc)
        if early_stopping.early_stop:
            print("Early stopping")
            break
    # Test result for this fold
    model.load_state_dict(torch.load('./checkpoints/checkpoint%d.pt' % i))
    test_loss, test_acc = get_test_result(test_iter, test_set)
    print("The test acc is: %.5f" % test_acc)
    acc += test_acc / 10
# Print the average accuracy over the 10 folds
print("The test acc is: %.5f" % acc)
Example No. 9
def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError as e:
        print(f"Input parameter error: missing config key {e}")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # build torchtext field
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)

    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    pretrained_vector = None  # avoid a NameError when no pretrained file is given
    if (pretrained_model_file is not None) and (pretrained_model_dir
                                                is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[
        -1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)

    model = TextCNN(embed_num,
                    embed_dim,
                    class_num,
                    filter_num,
                    filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))  # class [<unk>,<pad>,'pos','neg']
    if last_model_path is not None:
        # load model
        logging.info(f'load model from  {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        step = 0
        for batch in iter(train_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss.item()  # .item() avoids retaining the autograd graph
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step
        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)

        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy}  speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(
            hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")
        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)

            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
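
The evaluate(model, val_iter, cross_entropy) helper is defined elsewhere in Example 9's repository. Given how it is called, a plausible sketch that matches the torchtext batch layout used above (an assumption, not the repository's code):

import torch

def evaluate(model, val_iter, criterion):
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for batch in iter(val_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            logits = model(feature)
            total_loss += criterion(logits, target).item()
            correct += (logits.argmax(1) == target).sum().item()
            count += target.size(0)
    model.train()
    return total_loss / len(val_iter), correct / count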
Example No. 10

if __name__ == '__main__':

    # load data and model
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/train_data.pkl', 'rb') as fp:
        train_data = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/test_data.pkl', 'rb') as fp:
        test_data = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/word2index.pkl', 'rb') as fp:
        word2index = pickle.load(fp)
    with open('/Users/pengyiliu/Desktop/UoS/Dissertation_Project/Implementation/tag2index.pkl', 'rb') as fp:
        tag2index = pickle.load(fp)

    # build model
    config = Config()
    model = TextCNN(len(word2index), config.word_embedding_dimension, len(tag2index))
    model.load_state_dict(torch.load('save_model.pth'))
    model.eval()

    # test
    print('start testing...')
    result = []
    # for test in train_data:
    for i, batch in enumerate(getBatch(config.batch_size, test_data)):
        data, label = pad_to_batch(batch, word2index, tag2index)

        with torch.no_grad():
            score = model(data)
        pred = torch.topk(score, 3, dim=1)[1].data.tolist()
        # print('pred:', pred)

        target = torch.topk(label, 3, dim=1)
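
The snippet breaks off while comparing top-3 predictions with top-3 target tags. Under the assumption that label is a multi-hot matrix, the overlap between the two index sets can be computed like this (a sketch with made-up shapes):

import torch

score = torch.randn(4, 10)                  # (batch, num_tags) model scores
label = torch.zeros(4, 10)
label[:, :3] = 1                            # hypothetical multi-hot targets

pred_top3 = torch.topk(score, 3, dim=1).indices
true_top3 = torch.topk(label, 3, dim=1).indices

# For each predicted tag, check whether it appears among the true tags.
hits = (pred_top3.unsqueeze(2) == true_top3.unsqueeze(1)).any(2).float()
print('top-3 overlap: %.3f' % hits.mean().item())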
Example No. 11
        if args.gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels).item()
        cum_loss += loss * labels.size(0)
        cum_cnt += labels.size(0)
    model.train()
    return cum_loss / cum_cnt

while True:
    valid_loss = validate(net)
    if args.verbose:
        print('validation loss: %.5f' % (valid_loss))
    if ep == 0 or valid_loss < best_loss:
        best_loss = valid_loss
        best_model.load_state_dict(net.state_dict())
        no_improve_cnt = 0
    else:
        no_improve_cnt += 1
    if no_improve_cnt > 5 or ep > 1000:
        if args.verbose:
            print('final validation: %.5f' % (validate(best_model)))
            print('best validation: %.5f' % (best_loss))
        break
    # Train
    for it, data in enumerate(dataLoader, start=0):
        inputs, labels = data
        if args.gpu:
            inputs = inputs.cuda()
            labels = labels.cuda()
        # NOTE: creating the optimizer here resets its state on every
        # iteration; it is normally constructed once, outside the loops.
        optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
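
As the comment in the loop notes, the optimizer belongs outside the training loops so that Adam's running moment estimates persist across batches. A self-contained sketch of the conventional arrangement, with a stand-in model and data:

import torch
import torch.nn as nn

net = nn.Linear(8, 2)                                # stand-in model
loss_fn = nn.CrossEntropyLoss()
data = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(10)]

# Created once, before the loops, so its internal state accumulates.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

for epoch in range(3):
    for inputs, labels in data:
        optimizer.zero_grad()
        loss = loss_fn(net(inputs), labels)
        loss.backward()
        optimizer.step()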
Example No. 12
            if valid_acc > best_valid_acc:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'valid_acc': valid_acc
                    }, True)

            secs = int(time.time() - start_time)
            mins = secs // 60
            secs = secs % 60
            writer.add_scalars("Loss", {
                'train': train_loss,
                'valid': valid_loss
                }, epoch)
            writer.add_scalars("Acc", {
                'train': train_acc,
                'valid': valid_acc
                }, epoch)

            print("Epoch: %d" % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
            print(f"\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)")
            print(f"\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)")

        # test
        saved_params = torch.load("%s/%s" % (args.save_model, model_name))
        print("epoch:%s best_valid_acc:%s" % (saved_params['epoch'], saved_params['valid_acc']))
        model.load_state_dict(saved_params['state_dict'])
        loss, acc = test(args.test)
        print("test set loss: %s" % loss)
        print("test set acc: %s" % acc)