Exemplo n.º 1
0
def train_model(train_x, train_y, train_m, train_t, test_x, test_y, test_m,
                test_t, batch_size, epochs, n_batches, unknowns):
    """Train a BiLSTM tagger and evaluate it on the test split after each epoch.

    Relies on module-level globals: ``word2id``, ``tag2id``, ``embed_size``,
    ``hidden_size``, ``vectors``, ``train_embedding``, ``test_batch_size``,
    ``weights_init_uniform_rule``, plus the ``ner_lib`` / ``models`` modules
    and the ``test2`` evaluation helper. Requires a CUDA device.

    Args:
        train_x, train_y, train_m, train_t: training sentences, tags, masks
            and the fourth field consumed by ner_lib.generate.
        test_x, test_y, test_m, test_t: the matching test-split arrays.
        batch_size: training mini-batch size.
        epochs: number of passes over n_batches batches.
        n_batches: batches drawn from the generator per epoch.
        unknowns: forwarded to test2 (presumably OOV handling — TODO confirm).

    Returns:
        list with one per-epoch accuracy result (whatever test2 returns).
    """
    gen = ner_lib.generate(train_x, train_y, train_m, train_t, batch_size)
    model = models.BiLSTM(len(word2id),
                          embed_size,
                          hidden_size,
                          len(tag2id),
                          vectors,
                          train_embedding=train_embedding)
    model.apply(weights_init_uniform_rule)
    model.cuda()
    optimizer = t.optim.Adam(model.parameters(), lr=0.01)
    accs = []
    for epoch in range(epochs):
        total_loss = 0.0
        for _ in range(n_batches):
            # The fourth item of the generator tuple (T2S) is unused here.
            x, y, m, _t2s = next(gen)
            x = t.tensor(x, dtype=t.long).cuda()
            y = t.tensor(y, dtype=t.long).cuda()
            m = t.tensor(m).cuda()
            model.zero_grad()
            z = model(x, m)
            # Flatten (batch, seq, n_tags) -> (batch*seq, n_tags) so the
            # per-token loss can be computed in one call.
            z = z.view(-1, len(tag2id))
            y = y.view(-1)
            loss = model.loss_fn(z, y)

            loss.backward()
            optimizer.step()
            # .item() detaches and converts in one step, without keeping the
            # graph (or a numpy round-trip) alive for the running total.
            total_loss += loss.item()

        accuracy = test2(model, test_x, test_y, test_m, test_t,
                         test_batch_size, unknowns)
        print(accuracy)

        accs.append(accuracy)
    return accs
Exemplo n.º 2
0
# Datasets and batched iterators for the token/tag sequence-labelling task.
train_data = TextDataset(f'data/{args.data}_train.jsonl', tokenizer, ['tokens', 'tags'])
test_data = TextDataset(f'data/{args.data}_test.jsonl', tokenizer, ['tokens', 'tags'])

train_iterator = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                            collate_fn=train_data.collate)
test_iterator = DataLoader(test_data, batch_size=args.batch_size, shuffle=False,
                           collate_fn=test_data.collate)

# Vocabulary sizes and padding indices for both fields.
token_vocab = tokenizer.vocabs['tokens']
tag_vocab = tokenizer.vocabs['tags']
token_vocab_size = len(token_vocab.itos)
tag_vocab_size = len(tag_vocab.itos)
token_pad_token = token_vocab.pad_token
token_pad_idx = token_vocab.stoi[token_pad_token]
tag_pad_token = tag_vocab.pad_token
tag_pad_idx = tag_vocab.stoi[tag_pad_token]

# Encoder plus a separate tagging head on top of the hidden states.
model = models.BiLSTM(token_vocab_size, args.embedding_dim, args.hidden_dim,
                      args.n_layers, args.dropout, token_pad_idx)
head = models.TagHead(args.hidden_dim, tag_vocab_size)

# Optionally warm-start the encoder from a saved checkpoint.
if args.load is not None:
    model.load_state_dict(torch.load(args.load))

model = model.cuda()
head = head.cuda()

# One optimiser over encoder and head parameters together.
optimizer = optim.Adam(list(model.parameters()) + list(head.parameters()),
                       lr=args.lr)

# Padding positions must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_idx).cuda()
Exemplo n.º 3
0
    if 'lin' in model_type:
        # The *-lin model variants take [lstm_hidden_size, linear_hidden_size];
        # the linear layer is sampled from the candidate sizes strictly
        # smaller than the LSTM hidden size.
        hidden_sizes = []
        hidden_sizes.append(lstm_hidden_size)
        linear_hidden = [
            x for x in linear_hidden_sizes if x < lstm_hidden_size
        ]
        linear_hidden_ind = np.random.randint(0, len(linear_hidden))
        # BUG FIX: index into the filtered candidate list, not the full
        # linear_hidden_sizes list — the old code could pick a size
        # >= lstm_hidden_size, defeating the filter above.
        linear_hidden_size = linear_hidden[linear_hidden_ind]
        hidden_sizes.append(linear_hidden_size)

    # Initialize model
    model = None
    if model_type == 'bilstm':
        model = models.BiLSTM(input_size, lstm_hidden_size, layers,
                              num_classes, device, rnn_dropout,
                              other_dropout).to(device)
    elif model_type == 'bigru':
        model = models.BiGRU(input_size, lstm_hidden_size, layers, num_classes,
                             device, rnn_dropout, other_dropout).to(device)
    elif model_type == 'bilstm-lin':
        model = models.BiLSTMLin(input_size, hidden_sizes, layers, num_classes,
                                 device, rnn_dropout, other_dropout).to(device)
    elif model_type == 'bigru-lin':
        model = models.BiGRULin(input_size, hidden_sizes, layers, num_classes,
                                device, rnn_dropout, other_dropout).to(device)
    elif model_type == 'bilstm-attn':
        model = models.BiLSTMAttn(input_size, lstm_hidden_size, layers,
                                  num_classes, device, rnn_dropout,
                                  other_dropout).to(device)
    elif model_type == 'bigru-attn':
Exemplo n.º 4
0
def run_rnn_exp(data,
                embedding_matrix,
                token_to_idx,
                seed=0,
                weight_decay=0.0,
                lr=0.001,
                max_len=51,
                batch_size=128,
                idx_to_label=None,
                embedding_freeze=True,
                embedding_normalize=True,
                obj='loss',
                measures=None,
                epoches=50,
                silent=False,
                cuda=-1):
    """Train a 2-layer BiLSTM classifier and report test metrics.

    Trains on data['train'], selects the best epoch on data['dev'] via the
    `obj` measure, and returns the test metrics of that best dev epoch.

    Args:
        data: dict with 'train'/'dev'/'test' keys, each a (texts, labels) pair.
        embedding_matrix: pretrained embedding weights for models.BiLSTM.
        token_to_idx: token -> index mapping for data_helper.map_to_num_rnn.
        seed: RNG seed applied to torch, numpy and random.
        weight_decay, lr: Adam hyper-parameters.
        max_len: maximum sentence length (presumably pad/truncate length in
            data_helper — TODO confirm).
        batch_size: mini-batch size for all three splits.
        idx_to_label: ordered label names; defaults to the 3-way sentiment set.
        embedding_freeze, embedding_normalize: forwarded to models.BiLSTM.
        obj: dev measure used for model selection ('loss' is minimised,
            any other measure is maximised).
        measures: metric names to compute; defaults to loss/macro_f1/acc/avgrecall.
        epoches: number of training epochs.
        silent: suppress all progress printing when True.
        cuda: GPU index, or -1 to stay on CPU.

    Returns:
        (best_dev_obj_value, test_metrics_at_best_dev_epoch,
         final_embedding_weights_as_numpy)
    """
    # Replace the former shared mutable default arguments (anti-pattern).
    if idx_to_label is None:
        idx_to_label = ['negative', 'neutral', 'positive']
    if measures is None:
        measures = ['loss', 'macro_f1', 'acc', 'avgrecall']

    # Seed every RNG we touch for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    def _load_split(name):
        """Map one raw split to (sentences, labels) index sequences."""
        return data_helper.map_to_num_rnn(data[name][0],
                                          data[name][1],
                                          token_to_idx,
                                          idx_to_label=idx_to_label,
                                          max_len=max_len)

    train_list_sentences, train_list_labels = _load_split('train')
    dev_list_sentences, dev_list_labels = _load_split('dev')
    test_list_sentences, test_list_labels = _load_split('test')

    # Weighted sampler to counteract class imbalance in the training data:
    # each example is weighted inversely to its class frequency.
    train_num_count = [0] * len(idx_to_label)
    for label in train_list_labels:
        train_num_count[label] += 1
    if not silent:
        print(train_num_count)
    sample_weights = [len(train_list_labels) / train_num_count[label]
                      for label in train_list_labels]
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        sample_weights, len(sample_weights))

    def _make_iter(sentences, labels, split_sampler=None):
        """Build a DataLoader over one split."""
        return DataLoader(data_helper.BasicDataset(sentences, labels),
                          batch_size=batch_size,
                          sampler=split_sampler,
                          collate_fn=data_helper.rnn_collate_fn_cuda)

    train_iter = _make_iter(train_list_sentences, train_list_labels, sampler)
    dev_iter = _make_iter(dev_list_sentences, dev_list_labels)
    test_iter = _make_iter(test_list_sentences, test_list_labels)

    # BUG FIX: embedding_freeze/embedding_normalize were hard-coded to True
    # (silently ignoring the function parameters) and num_classes was
    # hard-coded to 2 even though the default label set has 3 entries.
    model = models.BiLSTM(embedding_matrix,
                          hidden_size=150,
                          num_layer=2,
                          embedding_freeze=embedding_freeze,
                          embedding_normalize=embedding_normalize,
                          max_norm=5.0,
                          num_classes=len(idx_to_label))
    if cuda != -1:
        model.cuda(cuda)

    # Sum per-example losses; we divide by the example count ourselves.
    # reduction='sum' replaces the long-deprecated size_average=False.
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    optimizer = torch.optim.Adam(model.custom_params,
                                 lr=lr,
                                 weight_decay=weight_decay)

    def _run_split(iterator, train):
        """One full pass over `iterator`; updates the model only when `train`.

        Returns the metrics dict (measures + normalised 'loss') for the pass.
        """
        sum_loss = 0.0
        count = 0
        predict = []
        gold = []
        for batch in iterator:
            n = int(batch['labels'].data.size()[0])
            # Re-initialise both recurrent states for the actual batch size
            # (the last batch may be smaller).
            model.hidden1 = model.init_hidden(batch_size=n)
            model.hidden2 = model.init_hidden(batch_size=n)
            outputs = model(batch['sentence'])
            _, outputs_label = torch.max(outputs, 1)
            predict.extend(int(label) for label in outputs_label.data)
            gold.extend(int(label) for label in batch['labels'].data)
            loss = criterion(outputs, batch['labels'])
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # .item() replaces the torch-0.3-era loss.data[0], which raises
            # on 0-dim tensors in current PyTorch.
            sum_loss += loss.item()
            count += batch['labels'].shape[0]
        result = metrics.evaluation_metrics(gold,
                                            predict,
                                            measures=measures,
                                            idx_to_label=idx_to_label)
        result['loss'] = sum_loss / count
        return result

    def _report(tag, result):
        """Print one tab-separated metrics line unless running silently."""
        if silent:
            return
        line = tag + "\t"
        for key in measures:
            line += "{}={:.4f}\t".format(key, result[key])
        print(line)

    obj_value = 0.0
    final_metrics = {
        'loss': 0.0,
        'macro_f1': 0.0,
        'acc': 0.0,
        'avgrecall': 0.0
    }
    for epoch in range(epoches):
        start_time = time.time()

        model.train()
        train_metrics_result = _run_split(train_iter, train=True)
        _report("[{}/{}]\ntrain".format(epoch + 1, epoches),
                train_metrics_result)

        model.eval()
        with torch.no_grad():  # evaluation never needs gradients
            dev_metrics_result = _run_split(dev_iter, train=False)
            _report("dev", dev_metrics_result)
            test_metrics_result = _run_split(test_iter, train=False)
            _report("test", test_metrics_result)

        if not silent:
            print("cost time:{}".format(time.time() - start_time))

        # Model selection: remember the test metrics of the best dev epoch
        # ('loss' is minimised, any other objective is maximised).
        better = (epoch == 0
                  or (obj == 'loss' and dev_metrics_result[obj] < obj_value)
                  or (obj != 'loss' and dev_metrics_result[obj] > obj_value))
        if better:
            obj_value = dev_metrics_result[obj]
            final_metrics = test_metrics_result

    return obj_value, final_metrics, model.embed.weight.data.cpu().numpy()