Example #1
def train(epochs):
    vocab_size = loader.vocab_size
    num_classes = loader.num_classes

    model = TextCNN(vocab_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(epochs):
        print('-' * 40 + ' epoch {} '.format(epoch) + '-' * 40)
        train_iter(model, loader, criterion, optimizer)
        print()
    torch.save(model.state_dict(), 'cnn.state_dict.pth')
    return
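The train_iter helper that Example #1 calls is not shown. A minimal single-epoch loop consistent with the call site might look like this sketch (the loader's train_loader attribute is an assumption):

def train_iter(model, loader, criterion, optimizer):
    # One pass over the training data: forward, loss, backward, update.
    model.train()
    for batch_x, batch_y in loader.train_loader:  # assumed attribute
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()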
Example #2
def train():
    train_contents, train_labels = load_corpus('./dataset/train.txt',
                                               word2id,
                                               max_sen_len=50)
    val_contents, val_labels = load_corpus('./dataset/validation.txt',
                                           word2id,
                                           max_sen_len=50)
    # merge the training and validation sets
    contents = np.vstack([train_contents, val_contents])
    labels = np.concatenate([train_labels, val_labels])
    # build the training dataset
    train_dataset = TensorDataset(
        torch.from_numpy(contents).type(torch.float),
        torch.from_numpy(labels).type(torch.long))
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=2)
    model = TextCNN(config)
    if config.model_path:
        model.load_state_dict(torch.load(config.model_path))
    model.to(device)
    # set up the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    # set up the loss function
    criterion = nn.CrossEntropyLoss()
    # training loop
    for epoch in range(config.epochs):
        for batch_idx, (batch_x, batch_y) in enumerate(train_dataloader):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            loss = criterion(output, batch_y)
            if batch_idx % 200 == 0 and config.verbose:
                print("Train Epoch:{}[{}/{} ({:.0f}%)]\tLoss:{:.6f}".format(
                    epoch + 1, batch_idx * len(batch_x),
                    len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # save the model
    torch.save(model.state_dict(), './models/model.pth')
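load_corpus and word2id come from elsewhere in the project. A plausible minimal version, assuming one "<label>\t<tokenized text>" sample per line (the file format and padding index are assumptions):

def load_corpus(path, word2id, max_sen_len=50):
    # Turn each line into a fixed-length vector of word ids plus a label.
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split('\t', 1)
            ids = [word2id.get(w, 0) for w in text.split()][:max_sen_len]
            ids += [0] * (max_sen_len - len(ids))  # pad with index 0
            contents.append(ids)
            labels.append(int(label))
    return np.array(contents), np.array(labels)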
Example #3
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set; deriving the second length as the
    # remainder keeps the split valid for any corpus size
    n_train = int(len(dataset) * DATA_SPLIT)
    train_set, test_set = torch.utils.data.random_split(
        dataset, [n_train, len(dataset) - n_train])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # swap the two class frequencies so the rarer class gets the larger
    # sampling weight
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters for each filter size
    N_FILTERS = 64

    # kernel sizes of the convolutional layers
    FILTER_SIZES = [2, 3]

    # dropout between the conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # backpropagation and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                for file in glob.glob('models/state_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
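Sequencer is used as a collate_fn but is not defined here. A minimal sketch that pads or truncates every sample in a batch to SEQUENCE_LEN (its real behaviour may differ):

class Sequencer:
    def __init__(self, seq_len):
        self.seq_len = seq_len

    def __call__(self, batch):
        # batch: iterable of (token-id sequence, label) pairs
        xs, ys = [], []
        for ids, label in batch:
            ids = list(ids)[:self.seq_len]
            ids += [0] * (self.seq_len - len(ids))  # pad with index 0
            xs.append(ids)
            ys.append(float(label))
        return torch.tensor(xs, dtype=torch.long), torch.tensor(ys)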
Example #4
        domain_t_loss = criterion(domain_outputs, event_labels)
        err = class_loss + domain_s_loss + domain_t_loss
        err.backward()
        optimizer.step()

        i += 1

        msg = ('epoch: %d, [iter: %d / all %d], class_loss: %f, '
               'domain_s_loss: %f, domain_t_loss: %f'
               % (epoch, i, len_dataloader, class_loss.item(),
                  domain_s_loss.item(), domain_t_loss.item()))
        print(msg)
        logging.info(msg)

        checkpoint_path = 'checkpoint/WithoutImage_' + str(epoch + 1) + '.pkl'
        torch.save(model.state_dict(), checkpoint_path)

# test
model = TextCNN(args, W)
model.load_state_dict(torch.load(checkpoint_path))
if torch.cuda.is_available():
    model.cuda()
model.eval()
test_sub = np.zeros((len(label_df['id']), 3), dtype=np.float64)
batch = len(label_df['id']) // args.batch_size

for i, (test_data, event_labels) in enumerate(test_loader):
    test_text, test_mask = to_var(test_data[0]), to_var(test_data[1])

    test_text = test_text.long()
    test_mask = test_mask.float()
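to_var is a legacy helper (from the pre-0.4 Variable API) not shown here; a modern equivalent is simply a device move (the body is an assumption inferred from usage):

def to_var(x):
    # Move a tensor to the GPU when one is available.
    return x.cuda() if torch.cuda.is_available() else x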
Example #5
def train():
    # configuration file
    cf = Config('./config.yaml')
    # use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)

    # pre-trained embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextCNN(cf, torch.tensor(embedding_matrix))
    # Adam over the trainable parameters only
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

    # move the model to the target device
    model.to(device)

    # run data-parallel when more than one GPU is available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # training
    start_time = time.time()

    total_batch = 0  # total batches seen
    best_acc_val = 0.0  # best validation accuracy
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1000  # stop early after 1000 batches without improvement

    flag = False
    model.train()
    for epoch_id in trange(cf.epoch,desc="Epoch"):
        for step,batch in enumerate(tqdm(train_dataloader,"batch",total=len(train_dataloader))):
            
            label_id = batch['label_id'].squeeze(1).to(device) 
            segment_ids = batch['segment_ids'].to(device) 

            loss = model(segment_ids,label_id)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1 

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train,acc_train = model.get_loss_acc(segment_ids,label_id)
                loss_val,acc_val = evaluate(model,test_dataloader,device)
                
                if acc_val  > best_acc_val:
                    # save the best result so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(),"./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
                
                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
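evaluate and get_time_dif are project helpers. A sketch of evaluate consistent with how it is called above, assuming the model exposes the same get_loss_acc method used in the training loop:

def evaluate(model, dataloader, device):
    # Mean loss and accuracy over a full dataloader, without gradients.
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss, acc = model.get_loss_acc(segment_ids, label_id)
            total_loss += loss.item()
            total_acc += float(acc)
            n_batches += 1
    return total_loss / n_batches, total_acc / n_batches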
Example #6
def train(args, states=None):

    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_embeddings(data_path=config['data'],
                                      label_path=config['labels'])

    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = TextCNN(
        num_classes=config['num_classes'],
        embedding_size=config['embedding_size'],
        num_filters=config['num_filters'],
        dropout_rate=config['dropout'],
    )
    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

    best_metric = 0

    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(
            f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients before each pass
            optimizer.zero_grad()

            # forward
            probs, classes = model(inputs)
            # backprop
            loss = loss_function(probs, labels)
            loss.backward()
            # update/optimize
            optimizer.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir,
                                     step=i,
                                     states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
Example #7
    epoches = 2
    emb_dim = 50  # embedding dimension
    lr = 0.001
    filter_num = 10  # number of convolution kernels per size
    filtersizes = '3,4,5'
    label_size = 4
    dropout = 0.5
    static = True  # use pre-trained embeddings
    fine_tune = False  # whether to fine-tune the pre-trained embeddings

    # load the pre-trained embedding data
    vocab_array, word_to_ix, ix_to_word = get_embedding(glove_file)
    train_iter = get_data_loader(sample_train_file, batch_size, word_to_ix,
                                 sentence_max_size)
    test_iter = get_data_loader(sample_test_file, batch_size, word_to_ix,
                                sentence_max_size)

    # define the model; pass the static / fine_tune flags defined above
    # instead of hard-coded literals
    model = TextCNN(vocab_array, label_size, filter_num, filtersizes,
                    len(vocab_array), emb_dim, static, fine_tune, dropout)

    # training
    logging.info('Start training the model')
    train_model(model, train_iter, epoches, lr)

    # save the model
    torch.save(model.state_dict(), model_param_file)

    logging.info('Start testing the model')
    model_test(model, test_iter)
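get_embedding is not shown; a plausible reader for a GloVe-style text file that produces the (vocab_array, word_to_ix, ix_to_word) triple used above (the file format is an assumption):

import numpy as np

def get_embedding(glove_file):
    # One "word v1 v2 ... vN" entry per line.
    vectors, word_to_ix, ix_to_word = [], {}, {}
    with open(glove_file, encoding='utf-8') as f:
        for ix, line in enumerate(f):
            parts = line.rstrip().split(' ')
            word_to_ix[parts[0]] = ix
            ix_to_word[ix] = parts[0]
            vectors.append([float(v) for v in parts[1:]])
    return np.array(vectors), word_to_ix, ix_to_word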
Example #8
        y_hat = net(X)  # compute predictions
        loss = criterion(y_hat, y)  # compute the loss
        optimizer.zero_grad()  # zero the gradients
        loss.backward()  # backpropagate
        optimizer.step()  # update the parameters
        step += 1

        # evaluation
        if step % args.test_per_step == 0:
            net.eval()
            all_pre = []
            all_label = []

            with torch.no_grad():  # no gradients needed for evaluation
                for X, y in test_iter:
                    X, y = X.to(device), y.to(device)
                    y_hat = net(X)
                    y_pre = torch.argmax(y_hat, dim=-1)
                    all_pre.extend(y_pre.tolist())
                    all_label.extend(y.tolist())
            test_acc_sum = sum(
                int(p == t) for p, t in zip(all_pre, all_label))
            test_acc = test_acc_sum / len(all_label)

            print('train_step %d, loss: %.4f, test_acc: %.4f' %
                  (step, loss.item(), test_acc))

            if test_acc > best_acc:
                best_acc = test_acc
                torch.save(net.state_dict(), './model/best_model.bin')
            net.train()  # back to training mode after evaluation
print('best_acc: ', best_acc)
Example #9
         if args.gpu:
             inputs = inputs.cuda()
             labels = labels.cuda()
         outputs = model(inputs)
         loss = loss_fn(outputs, labels).item()
         cum_loss += loss * labels.size(0)
         cum_cnt += labels.size(0)
     model.train()
     return cum_loss / cum_cnt
 # create the optimizer once, before the loop, so Adam's running state persists
 optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
 while True:
     valid_loss = validate(net)
     if args.verbose:
         print('validation loss: %.5f' % (valid_loss))
     if ep == 0 or valid_loss < best_loss:
         best_loss = valid_loss
         best_model.load_state_dict(net.state_dict())
         no_improve_cnt = 0
     else:
         no_improve_cnt += 1
     if no_improve_cnt > 5 or ep > 1000:
         if args.verbose:
             print('final validation: %.5f' % (validate(best_model)))
             print('best validation: %.5f' % (best_loss))
         break
     # Train
     for it, data in enumerate(dataLoader, start=0):
         inputs, labels = data
         if args.gpu:
             inputs = inputs.cuda()
             labels = labels.cuda()
         optimizer.zero_grad()
         loss = loss_fn(net(inputs), labels)
         loss.backward()
         optimizer.step()
     ep += 1
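Only the tail of the validate function is visible at the top of Example #9. A self-contained version consistent with that tail might be the following sketch (validLoader is an assumed name for a validation DataLoader):

def validate(model):
    # Average per-sample loss over the validation set, in eval mode.
    model.eval()
    cum_loss, cum_cnt = 0.0, 0
    with torch.no_grad():
        for inputs, labels in validLoader:
            if args.gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()
            loss = loss_fn(model(inputs), labels).item()
            cum_loss += loss * labels.size(0)
            cum_cnt += labels.size(0)
    model.train()
    return cum_loss / cum_cnt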
Example #10
    def train(self):
        best_valid_loss = 1e9
        all_valid_loss, all_valid_acc = 0, 0

        # CV loop
        for i in range(self.args.cv_num):
            model = TextCNN(self.vocab_size, self.pad_idx,
                            self.args).to(device)

            # model variations ("rand" is the default mode)
            if self.args.mode == "static":
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
            elif self.args.mode == "non-static":
                model.static_embedding.weight.data.normal_(0, 1)
                model.static_embedding.weight.data.copy_(self.embeddings)
            elif self.args.mode == "multichannel":
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
                model.nonstatic_embedding.weight.data.copy_(self.embeddings)

            optimizer = optim.Adadelta(model.parameters())
            model.train()

            # build the train dataset; fold i serves as the test set
            print(f'>>> fold {i + 1} is held out as the test set')
            dataset = self.dataset_list.copy()
            del dataset[i]  # remove testset
            dataset = functools.reduce(
                lambda x, y: x + y,
                dataset)  # Concatenate datasets consecutively.

            data_loader = DataLoader(dataset=dataset,
                                     batch_size=self.args.batch_size,
                                     shuffle=True,
                                     collate_fn=self.collate_fn)

            for epoch in range(self.args.epochs):  # Epoch loop
                pbar = tqdm(data_loader)

                for text, label in pbar:
                    text = text.to(device)
                    label = label.to(device)

                    optimizer.zero_grad()

                    predictions = model(text).squeeze(1)
                    loss = self.criterion(predictions, label)
                    acc = self._binary_accuracy(predictions, label)

                    loss.backward()
                    optimizer.step()

                    # max_norm_scaling
                    eps = 1e-7
                    param = model.fc.weight
                    norm = torch.norm(param)  # l2_norm
                    if norm > self.args.l2_constraint:
                        param.data *= self.args.l2_constraint / (eps + norm)

                    pbar.set_description(
                        f"loss : {loss.item():.4f}, acc : {acc.item():.4f}")

            valid_loss, valid_acc = self.evaluate(model, i)
            all_valid_loss += valid_loss.item()
            all_valid_acc += valid_acc.item()
            print(
                f'valid loss : {valid_loss.item():.3f}, valid acc : {valid_acc.item():.3f}'
            )

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(
                    model.state_dict(),
                    osp.join(self.args.ck_path, f'{self.args.name}_best.pt'))

            if not self.args.cv:
                return

        print()
        print(f'Final loss : {all_valid_loss / self.args.cv_num:.3f}')
        print(f'Final acc : {all_valid_acc / self.args.cv_num:.3f}')
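self.criterion and _binary_accuracy are defined elsewhere in the class. If the model emits raw logits and the criterion is nn.BCEWithLogitsLoss, the accuracy helper could be as small as this sketch (an assumption about the setup):

def _binary_accuracy(self, predictions, labels):
    # Threshold the sigmoid of the logits at 0.5 and compare to gold labels.
    rounded = torch.round(torch.sigmoid(predictions))
    return (rounded == labels).float().mean()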
Example #11
    return valid_loss / len(test_dataset), valid_acc / len(test_dataset)

if __name__ == "__main__":
    if args.mode == 'train':
        best_valid_acc = 0.0
        for epoch in range(args.epoch):
            start_time = time.time()
            train_loss, train_acc = train(args.train)
            valid_loss, valid_acc = test(args.dev)

            # save best model
            if valid_acc > best_valid_acc:
                save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'valid_acc': valid_acc
                    }, True)

            secs = int(time.time() - start_time)
            mins = secs // 60
            secs = secs % 60
            writer.add_scalars("Loss", {
                'train': train_loss,
                'valid': valid_loss
                }, epoch)
            writer.add_scalars("Acc", {
                'train': train_acc,
                'valid': valid_acc
                }, epoch)
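save_checkpoint is called with (state, is_best) but not defined in the excerpt; a common pattern writes the latest state and copies it aside when it is the best so far (the filenames here are assumptions):

import shutil

import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # Persist the latest training state; keep a copy of the best one.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')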