Example #1
    def __init__(self, *paths):
        # Read every file into a list of sentences; each path becomes one
        # parallel "column" of the dataset.
        self.data = [
            list(U.FileReader(path).sents()) for path in paths
        ]

        # The files must be sentence-aligned, i.e. contain the same number
        # of sentences.
        assert all(len(d) == len(self.data[0]) for d in self.data), \
            "Not all files have the same length."
Example #2
def create_dataloader(dataset):
    # Wrap the dataset in a batching data loader. input_vocabs, label_vocab,
    # batch_size and shuffle are module-level names defined in the
    # __main__ block below.
    return D.MultiSentWordDataLoader(dataset=dataset,
                                     input_vocabs=input_vocabs,
                                     label_vocabs=label_vocab,
                                     batch_size=batch_size,
                                     shuffle=shuffle,
                                     tensor_lens=True,
                                     num_workers=1,
                                     pin_memory=True)


if __name__ == "__main__":
    # Build the input vocabulary from the (single) input file, reserving
    # padding and unknown-word symbols.
    input_vocabs = []
    input = input_path
    vocab = utils.Vocabulary()
    words = utils.FileReader(input).words()
    vocab.add("<pad>")
    vocab.add("<unk>")
    utils.populate_vocab(words, vocab)
    input_vocabs.append(vocab)
    # print(input_vocabs)  # -> [<utils.Vocabulary object at 0x7fa839f5a0b8>]

    # Build the label vocabulary, adding START and END tags.
    label_vocab = utils.Vocabulary()
    words = utils.FileReader(label_path).words()
    label_vocab.add("START")
    label_vocab.add("END")
    utils.populate_vocab(words, label_vocab)

    crf = M.CRF(len(label_vocab))
    model = M.LSTMCRF(crf=crf,
                      vocab_sizes=[len(v) for v in input_vocabs],
                      word_dims=word_dim,
                      hidden_dim=lstm_dim,
                      layers=lstm_layers,
                      dropout_prob=dropout_prob,
                      bidirectional=bidirectional)
Example #3
import logging
import os
import pickle
import shutil

import numpy as np

# utils, D (datasets), M (models), LSTMCRFTrainer and check_arguments come
# from project-local modules that are not shown in this excerpt.


def main(args):
    logging.basicConfig(level=logging.INFO)
    check_arguments(args)

    logging.info("Creating vocabulary...")
    input_vocabs = []

    for input in args.input_path:
        vocab = utils.Vocabulary()
        words = utils.FileReader(input).words()
        vocab.add("<pad>")
        vocab.add("<unk>")
        utils.populate_vocab(words, vocab)
        input_vocabs.append(vocab)
    # print(input_vocabs)  # -> [<utils.Vocabulary object at 0x7fa839f5a0b8>]

    label_vocab = utils.Vocabulary()
    words = utils.FileReader(args.label_path).words()
    label_vocab.add("START")
    label_vocab.add("END")
    utils.populate_vocab(words, label_vocab)

    for i, input_vocab in enumerate(input_vocabs):
        vocab_path = os.path.join(args.save_dir,
                                  "vocab-input{}.pkl".format(i + 1))
        pickle.dump(input_vocab, open(vocab_path, "wb"))
    vocab_path = os.path.join(args.save_dir, "vocab-label.pkl")
    pickle.dump(label_vocab, open(vocab_path, "wb"))

    logging.info("Initializing model...")
    crf = M.CRF(len(label_vocab))
    print("args.word_dim ==", args.word_dim, type(args.word_dim))
    model = M.LSTMCRF(
        crf=crf,
        vocab_sizes=[len(v) for v in input_vocabs],
        word_dims=args.word_dim,
        hidden_dim=args.lstm_dim,
        layers=args.lstm_layers,
        dropout_prob=args.dropout_prob,
        bidirectional=args.bidirectional
    )
    model.reset_parameters()
    if args.gpu:
        gpu_main = args.gpu[0]
        model = model.cuda(gpu_main)
    params = sum(np.prod(p.size()) for p in model.parameters())
    logging.info("Number of parameters: {}".format(params))

    logging.info("Loading word embeddings...")
    # for vocab, we_type, we_path, we_freeze, emb in \
    #         zip(input_vocabs, args.wordembed_type, args.wordembed_path,
    #             args.wordembed_freeze, model.embeddings):
    #     if we_type == "glove":
    #         assert we_path is not None
    #         load_glove_embeddings(emb, vocab, we_path)
    #     elif we_type == "fasttext":
    #         assert we_path is not None
    #         assert args.fasttext_path is not None
    #         load_fasttext_embeddings(emb, vocab,
    #                                  fasttext_path=args.fasttext_path,
    #                                  embedding_path=we_path)
    #     elif we_type == "none":
    #         pass
    #     else:
    #         raise ValueError("Unrecognized word embedding "
    #                          "type: {}".format(we_type))
    #
    #     if we_freeze:
    #         emb.weight.requires_grad = False

    # Copy the configuration file to the save directory if one was specified.
    if args.config:
        config_path = os.path.join(args.save_dir, os.path.basename(args.config))
        shutil.copy(args.config, config_path)

    def create_dataloader(dataset):
        return D.MultiSentWordDataLoader(
            dataset=dataset,
            input_vocabs=input_vocabs,
            label_vocabs=label_vocab,
            batch_size=args.batch_size,
            shuffle=args.shuffle,
            tensor_lens=True,
            num_workers=len(args.gpu) if args.gpu is not None else 1,
            pin_memory=True
        )

    dataset = D.MultiSentWordDataset(*args.input_path, args.label_path)
    test_dataset = D.MultiSentWordDataset(*args.test_input_path, args.test_label_path)

    if args.val:
        vr = args.val_ratio
        val_dataset, _ = dataset.split(vr, 1-vr, shuffle=args.shuffle)
    else:
        val_dataset = None

    train_dataset = dataset
    train_dataloader = create_dataloader(train_dataset)
    test_dataloader = create_dataloader(test_dataset)

    if val_dataset is not None:
        val_dataloader = create_dataloader(val_dataset)
    else:
        val_dataloader = None
    print(input_vocabs, type(input_vocabs))

    logging.info("Beginning training...")
    trainer = LSTMCRFTrainer(
        sargs=args,
        input_vocabs=input_vocabs,
        label_vocab=label_vocab,
        val_data=val_dataloader,
        model=model,
        epochs=args.epochs,
        gpus=args.gpu
    )

    trainer.train(train_dataloader, data_size=len(train_dataset))
    # trainer.validate()
    logging.info("Beginning testing...")
    # trainer.test(train_dataloader, data_size=len(train_dataset))
    #trainer.test(test_dataloader, data_size=len(test_dataset))
    logging.info("Done!")