from collections import Counter
from itertools import chain

# EDU, Relation, node_type_filter, Vocab and Label are project-local helpers.
def build_vocab(dataset):
    word_freq = Counter()
    pos_freq = Counter()
    nuc_freq = Counter()
    rel_freq = Counter()
    for paragraph in chain(*dataset):
        for node in paragraph.iterfind(filter=node_type_filter([EDU, Relation])):
            if isinstance(node, EDU):
                word_freq.update(node.words)
                pos_freq.update(node.tags)
            elif isinstance(node, Relation):
                nuc_freq[node.nuclear] += 1  # nuclearity label
                rel_freq[node.ftype] += 1    # relation-type label

    word_vocab = Vocab("word", word_freq)
    pos_vocab = Vocab("part of speech", pos_freq)
    nuc_label = Label("nuclear", nuc_freq)
    rel_label = Label("relation", rel_freq)
    return word_vocab, pos_vocab, nuc_label, rel_label
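
These snippets rely on project-local Vocab and Label classes whose definitions
are not part of this listing. Below is a minimal sketch of the behaviour the
snippets appear to assume; the reserved tokens, the min_freq parameter and the
method names are guesses, not the project's actual API:

class Vocab:
    """Sketch of the assumed Vocab helper: token <-> id mapping with
    reserved padding/unknown entries. The real class likely also handles
    frequency cutoffs and pretrained embeddings."""
    PAD, UNK = "<pad>", "<unk>"

    def __init__(self, name, counter, min_freq=1):
        self.name = name
        self.itos = [self.PAD, self.UNK] + [
            tok for tok, freq in counter.most_common() if freq >= min_freq
        ]
        self.stoi = {tok: i for i, tok in enumerate(self.itos)}

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, token):
        # unseen tokens fall back to the unknown id
        return self.stoi.get(token, self.stoi[self.UNK])


class Label:
    """Sketch of the assumed Label helper: a closed label set, so no
    padding/unknown entries are reserved."""

    def __init__(self, name, counter):
        self.name = name
        self.itos = [lab for lab, _ in counter.most_common()]
        self.stoi = {lab: i for i, lab in enumerate(self.itos)}

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, label):
        return self.stoi[label]
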
Example #2
from collections import Counter
from itertools import chain

def build_vocab(instances):
    words_counter = Counter()
    poses_counter = Counter()
    trans_counter = Counter()
    for words, poses, trans in instances:
        words_counter.update(chain(*words))
        poses_counter.update(chain(*poses))
        trans_counter.update(trans)
    word_vocab = Vocab("word", words_counter)
    pos_vocab = Vocab("part of speech", poses_counter)
    trans_label = Label("transition", trans_counter)
    return word_vocab, pos_vocab, trans_label
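
A toy run of the variant above, using the Vocab/Label sketch from earlier.
The instance layout (per-sentence word and POS sequences plus a sequence of
transition actions) is inferred from how the loop unpacks instances, so treat
the data below as hypothetical:

instances = [
    ([["the", "cat"], ["sat"]], [["DT", "NN"], ["VBD"]], ["SHIFT", "REDUCE"]),
    ([["a", "dog"]], [["DT", "NN"]], ["SHIFT"]),
]
word_vocab, pos_vocab, trans_label = build_vocab(instances)
print(len(word_vocab))   # 5 distinct words + <pad>/<unk> = 7 under the sketch
print(trans_label.stoi)  # {'SHIFT': 0, 'REDUCE': 1}
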
Example #3
from collections import Counter
from itertools import chain

def build_vocab(trees, trans):
    trans_label = Label("transition", Counter(chain(*trans)))

    words_counter = Counter()
    poses_counter = Counter()
    for tree in trees:
        edus = list(tree.edus())
        # plain attribute access; getattr with a literal name was redundant
        words = [edu.words for edu in edus]
        poses = [edu.tags for edu in edus]
        words_counter.update(chain(*words))
        poses_counter.update(chain(*poses))
    word_vocab = Vocab("word", words_counter)
    pos_vocab = Vocab("part of speech", poses_counter)
    return word_vocab, pos_vocab, trans_label
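
This variant only needs tree objects whose edus() method yields items with
words and tags attributes; a hypothetical stand-in (ToyEDU and ToyTree are
invented names) to exercise it together with the sketch classes above:

from collections import namedtuple

ToyEDU = namedtuple("ToyEDU", ["words", "tags"])

class ToyTree:
    def __init__(self, edus):
        self._edus = edus

    def edus(self):
        return iter(self._edus)

trees = [ToyTree([ToyEDU(["他", "来", "了"], ["PN", "VV", "AS"])])]
trans = [["SHIFT", "REDUCE"]]
word_vocab, pos_vocab, trans_label = build_vocab(trees, trans)
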
Example #4

import random
from collections import Counter
from itertools import chain

import numpy as np
import torch
from torch import optim

# Note: build_vocab here is the segmenter's own variant, which returns only
# (word_vocab, pos_vocab); it is not the three-value version defined above.
# CDTB, gen_train_instances, numericalize, RNNSegmenterModel, gen_batch_iter,
# evaluate, gen_edu_report, Label and logger are project-local helpers.
def main(args):
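    # seed Python, PyTorch and NumPy RNGs so runs are reproducible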
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)

    logger.info("args:" + str(args))
    # load dataset
    cdtb = CDTB(args.data,
                "TRAIN",
                "VALIDATE",
                "TEST",
                ctb_dir=args.ctb_dir,
                preprocess=True,
                cache_dir=args.cache_dir)
    word_vocab, pos_vocab = build_vocab(cdtb.train)
    instances, tags = gen_train_instances(cdtb.train)
    tag_label = Label("tag", Counter(chain(*tags)))
    trainset = numericalize(instances, tags, word_vocab, pos_vocab, tag_label)

    # build model
    model = RNNSegmenterModel(hidden_size=args.hidden_size,
                              dropout=args.dropout,
                              rnn_layers=args.rnn_layers,
                              word_vocab=word_vocab,
                              pos_vocab=pos_vocab,
                              tag_label=tag_label,
                              pos_size=args.pos_size,
                              pretrained=args.pretrained,
                              w2v_freeze=args.w2v_freeze,
                              use_gpu=args.use_gpu)
    if args.use_gpu:
        model.cuda()
    logger.info(model)

    # train
    step = 0
    best_model_f1 = 0
    wait_count = 0
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.l2)
    # validation F1 is maximised, so use mode="max"; the default "min" would
    # treat a rising F1 as "no improvement" and keep cutting the LR
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode="max",
                                                     factor=0.1,
                                                     patience=3)
    for nepoch in range(1, args.epoch + 1):
        batch_iter = gen_batch_iter(trainset,
                                    args.batch_size,
                                    use_gpu=args.use_gpu)
        for nbatch, (inputs, target) in enumerate(batch_iter, start=1):
            step += 1
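            # one training step: forward pass, loss, backward pass, update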
            model.train()
            optimizer.zero_grad()
            loss = model.loss(inputs, target)
            loss.backward()
            optimizer.step()
            if nbatch % args.log_every == 0:  # enumerate starts at 1
                logger.info(
                    "step %d, patience %d, lr %f, epoch %d, batch %d, train loss %.4f"
                    % (step, wait_count, get_lr(optimizer), nepoch, nbatch,
                       loss.item()))
        # model selection
        score = evaluate(cdtb.validate, model)
        f1 = score[-1]
        scheduler.step(f1)  # passing the epoch explicitly is deprecated
        logger.info("evaluation score:")
        logger.info("\n" + gen_edu_report(score))
        if f1 > best_model_f1:
            wait_count = 0
            best_model_f1 = f1
            logger.info("save new best model to %s" % args.model_save)
            with open(args.model_save, "wb+") as model_fd:
                torch.save(model, model_fd)
            logger.info("test on new best model...")
            test_score = evaluate(cdtb.test, model)
            logger.info("test score:")
            logger.info("\n" + gen_edu_report(test_score))
        else:
            wait_count += 1
            if wait_count > args.patient:
                logger.info("early stopping...")
                break

    with open(args.model_save, "rb") as model_fd:
        best_model = torch.load(model_fd)
    test_score = evaluate(cdtb.test, best_model)
    logger.info("test score on final best model:")
    logger.info("\n" + gen_edu_report(test_score))