Example #1
def __init__(self, opt):
    self.opt = opt
    # Build the tokenizer from the train and test files.
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_length=opt.max_length,
        data_file='{0}_tokenizer.dat'.format(opt.dataset))
    # Build the embedding matrix for the tokenizer vocabulary.
    embedding_matrix = build_embedding_matrix(
        vocab=tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='{0}d_{1}_embedding_matrix.dat'.format(
            str(opt.embed_dim), opt.dataset))
    trainset = SentenceDataset(opt.dataset_file['train'],
                               tokenizer,
                               target_dim=self.opt.polarities_dim)
    testset = SentenceDataset(opt.dataset_file['test'],
                              tokenizer,
                              target_dim=self.opt.polarities_dim)
    # Shuffle only the training loader; keep test order fixed for evaluation.
    self.train_dataloader = DataLoader(dataset=trainset,
                                       batch_size=opt.batch_size,
                                       shuffle=True)
    self.test_dataloader = DataLoader(dataset=testset,
                                      batch_size=opt.batch_size,
                                      shuffle=False)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.device.type == 'cuda':
        print('cuda memory allocated:',
              torch.cuda.memory_allocated(self.opt.device.index))
    self._print_args()
Example #2
File: model.py  Project: chaonan99/tflm
def main():
    import json

    option_file_path = 'dump/sentlm_base/options.json'

    with open(option_file_path, 'r') as fin:
        options = json.load(fin)

    with tf.variable_scope('lm'):
        model = SentenceLanguageModel(options, True)

    init = tf.initializers.global_variables()
    init_state_tensors = [model.init_lstm_state]
    final_state_tensors = [model.final_lstm_state]

    batch_size = options['batch_size']
    max_seq_length = options['unroll_steps']
    max_chars = options['char_cnn']['max_characters_per_token']

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    sess = tf.InteractiveSession(config=config)
    sess.run(init)
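    # Feed all-zero character ids just to materialise the initial LSTM state.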
    feed_dict = {
        model.tokens_characters:
        np.zeros([batch_size, max_seq_length, max_chars], dtype=np.int32)
    }
    init_state_values = sess.run(init_state_tensors, feed_dict=feed_dict)

    from data import SentenceDataset, UnicodeCharsVocabularyPad

    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'data/vocabulary/vocab_bnc_5.txt'

    vocabulary = UnicodeCharsVocabularyPad(vocab_path,
                                           max_word_length=max_chars)
    dataset = SentenceDataset(test_prefix, vocabulary)
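    # Pull a single batch from the iterator as a quick smoke test.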
    a = dataset.iter_batches(batch_size=batch_size, seq_length=max_seq_length)
    b = next(a)

    feed_dict = {
        model.tokens_characters: b['tokens_characters'],
        model.seq_length: b['lengths'],
        model.next_token_id: b['next_token_id']
    }
    total_loss = sess.run(model.total_loss, feed_dict=feed_dict)
    losses = sess.run(model.losses, feed_dict=feed_dict)
    print(f'Loss: {total_loss} (should be around 12)')

    from IPython import embed
    embed()
    import os
    os._exit(1)
Example #3
def main():
    # c=Collection()
    # c.load(Path("./2021/ref/training/medline.1200.es.txt"))
    # pickle_postag(c)

    file = './2021/ref/training/medline.1200.es.txt'
    data = SentenceDataset(
        file,
        transform=sentence_to_tensor,
        target_transform=lambda l: torch.stack(tuple(map(label_to_tensor, l))))
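    # my_collate_fn, MyLSTM, TAGS, DEVICE, learning_rate and criterion are presumably defined elsewhere in this module.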
    data_loader = DataLoader(data,
                             batch_size=4,
                             collate_fn=my_collate_fn,
                             shuffle=True)
    n = MyLSTM(50, 50, len(TAGS), 113, 50)
    n.to(DEVICE)
    optimizer = torch.optim.SGD(n.parameters(), lr=learning_rate)
    metrics = {
        'acc': lambda pred, true: Accuracy()(pred, true),
        'f1': lambda pred, true: F1Score()(
            torch.tensor(pred.argmax(dim=1), dtype=torch.float32),
            torch.tensor(true, dtype=torch.float32)),
    }
    train(data_loader,
          n,
          criterion,
          optimizer,
          5,
          filename='test_lstm.pth',
          metrics=metrics)
Example #4
def main(args):
    config = Config(args)
    options, ckpt_file = load_options_latest_checkpoint(config.save_path)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
        vocab = UnicodeCharsVocabularyPad(args.vocab_file, max_word_length)
    else:
        ## Not tested yet
        vocab = VocabularyPad(args.vocab_file)

    test_path = 'data/Selectional_Restrictions/Pylkkanen2007_processed.txt'
    # test_path = 'data/Selectional_Restrictions/Warren2015_processed.txt'
    # test_path = 'data/CSR/WSC_sent.txt'

    with open(test_path) as f:
        sents = [l.rstrip() for l in f.readlines()]
    num_per_group = 2 if 'WSC' in test_path else 3
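    # positions presumably records, within each group of sentences, the token index where the variants diverge (used for the partial score below).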
    positions = _get_changed_positions(sents, num_per_group)
    data = SentenceDataset(test_path,
                           vocab,
                           test=True,
                           shuffle_on_load=False,
                           tokenizer=nltk.word_tokenize)

    # if options.get('bidirectional'):
    #     data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    # else:
    #     data = LMDataset(test_prefix, vocab, **kwargs)

    all_losses, all_lengths = test(options,
                                   ckpt_file,
                                   data,
                                   batch_size=args.batch_size)

    # Full score
    print('Full probability results')
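    # Per-sentence average loss; within each group, report how often each variant has the highest loss.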
    scores = all_losses.sum(axis=1) / all_lengths
    scores = np.array(scores).reshape(-1, num_per_group)
    res = scores.argmax(axis=1)
    for i in range(num_per_group):
        print(sum(res == i) / len(res))

    # Partial score
    print('Partial probability results')
    seq_mask = sequence_mask(np.array(positions) + 1, options['unroll_steps'])
    partial_losses = seq_mask * all_losses
    loss_mask = partial_losses > 0
    scores = partial_losses.sum(axis=1) / loss_mask.sum(axis=1)
    scores = np.array(scores).reshape(-1, num_per_group)
    res = scores.argmax(axis=1)
    for i in range(num_per_group):
        print(sum(res == i) / len(res))

    from IPython import embed
    embed()
    import os
    os._exit(1)
Example #5
def main():
    import json
    from data import SentenceDataset, VocabularyPad

    option_file_path = 'dump/sentpad_test/options.json'
    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'data/vocabulary/vocab_bnc_5.txt'

    with open(option_file_path, 'r') as fin:
        options = json.load(fin)

    with tf.variable_scope('lm'):
        model = SentenceLanguageModel(options, is_training=False)

    init = tf.initializers.global_variables()
    batch_size = options['batch_size']
    max_seq_length = options['unroll_steps']

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    sess = tf.InteractiveSession(config=config)
    sess.run(init)
    vocabulary = VocabularyPad(vocab_path)
    dataset = SentenceDataset(test_prefix, vocabulary)
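    # Same smoke test as Example #2, but feeding word-level token ids instead of character ids.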
    a = dataset.iter_batches(batch_size=batch_size, seq_length=max_seq_length)
    b = next(a)

    feed_dict = {
        model.token_ids: b['token_ids'],
        model.seq_length: b['lengths'],
        model.next_token_id: b['next_token_id']
    }
    total_loss = sess.run(model.total_loss, feed_dict=feed_dict)
    losses = sess.run(model.losses, feed_dict=feed_dict)
    print(f'Loss: {total_loss} (should be around 12)')

    from IPython import embed
    embed()
    import os
    os._exit(1)
Example #6
File: train.py  Project: chaonan99/tflm
def main(args):
    config = Config(args)
    options = config.get_options()

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
        vocab = UnicodeCharsVocabularyPad(args.vocab_file, max_word_length)
    else:
        vocab = VocabularyPad(args.vocab_file)
    data = SentenceDataset(args.prefix,
                           vocab,
                           test=False,
                           shuffle_on_load=True)
    train(options,
          data,
          int(args.ngpus),
          config.save_path,
          config.save_path,
          config.get_logger(),
          restart_ckpt_file=args.start_from)
Example #7
if __name__ == "__main__":
    cfg = ConfigBinaryClassification()
    cuda = True
    device = torch.device("cuda:1" if cuda else "cpu")

    model_path = "checkpoints/roberta24"

    model = BertForSequenceClassification.from_pretrained(model_path,
                                                          num_labels=2)
    model.to(device)
    model.eval()
    model.zero_grad()

    tokenizer_path = "hfl/chinese-roberta-wwm-ext"
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    train_dataset = SentenceDataset(tokenizer,
                                    cfg.DATA_PATH,
                                    dataset="train",
                                    cuda=False)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
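    # shuffle=False keeps the saved predictions aligned with the original sample order.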
    preds = []
    for tokens, label in train_loader:
        tokens = {key: item.to(device) for key, item in tokens.items()}
        label = label.to(device)

        pred = model(**tokens)[0]
        preds.append(pred.detach().cpu().numpy())
    preds = np.concatenate(preds)
    np.save("checkpoints/PTM-pred.npy", preds)
Example #8
def finetune(args, cfg):
    device = torch.device("cuda:%d" % args.cuda)
    model_config = args.model_config
    tokenizer = BertTokenizer.from_pretrained(model_config)
    train_dataset = SentenceDataset(tokenizer,
                                    cfg.DATA_PATH,
                                    dataset="train",
                                    cuda=False)
    valid_dataset = SentenceDataset(tokenizer,
                                    cfg.DATA_PATH,
                                    dataset="valid",
                                    cuda=False)

    train_loader = DataLoader(train_dataset, batch_size=16)
    valid_loader = DataLoader(valid_dataset, batch_size=16)

    model = BertForSequenceClassification.from_pretrained(
        model_config, num_labels=args.class_num)
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
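    # Standard BERT fine-tuning recipe: exclude biases and LayerNorm weights from weight decay.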
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
    criterion = nn.CrossEntropyLoss()
    #criterion = FocalLoss(classes=3, device=device).to(device)

    for epoch in range(args.epoch_num):
        for tokens, label in tqdm(train_loader):
            model.train()
            optimizer.zero_grad()

            tokens = {key: item.to(device) for key, item in tokens.items()}
            label = label.to(device)

            pred = model(**tokens)[0]
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()

            tokens = {key: item.cpu() for key, item in tokens.items()}
            label = label.cpu()
            del tokens, label

        # Validation pass after each epoch, followed by a checkpoint.
        with torch.no_grad():
            model.eval()
            preds = []
            labels = []
            for tokens, label in tqdm(valid_loader):
                tokens = {key: item.to(device) for key, item in tokens.items()}
                pred = model(**tokens)[0]
                p = pred.argmax(1).cpu().tolist()
                l = label.tolist()
                preds += p
                labels += l
            # classification_report (assumed to be sklearn's) expects (y_true, y_pred).
            report = classification_report(labels, preds)
            print(report)
            model.save_pretrained(
                os.path.join(args.save_dir, args.save_config + str(epoch)))