Example #1
    def __init__(self, vocab, config):
        word2id = vocab.word2idx
        super(Model, self).__init__()
        vocab_num = len(word2id)
        self.word2id = word2id
        self.config = config
        self.char_dict = preprocess.get_char_dict('data/char_vocab.english.txt')
        self.genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
        self.device = torch.device("cuda:" + config.cuda)

        self.emb = nn.Embedding(vocab_num, 350)

        emb1 = EmbedLoader().load_with_vocab(config.glove, vocab, normalize=False)
        emb2 = EmbedLoader().load_with_vocab(config.turian, vocab, normalize=False)
        pre_emb = np.concatenate((emb1, emb2), axis=1)
        pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)
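        # row-wise L2 normalization of the concatenated vectors; 1e-12 guards against all-zero rows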

        if pre_emb is not None:
            self.emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
            for param in self.emb.parameters():
                param.requires_grad = False
        self.emb_dropout = nn.Dropout(inplace=True)


        if config.use_elmo:
            self.elmo = ElmoEmbedder(options_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                                     weight_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
                                     cuda_device=int(config.cuda))
            print("elmo load over.")
            self.elmo_args = torch.randn((3), requires_grad=True).to(self.device)

        self.char_emb = nn.Embedding(len(self.char_dict), config.char_emb_size)
        self.conv1 = nn.Conv1d(config.char_emb_size, 50, 3)
        self.conv2 = nn.Conv1d(config.char_emb_size, 50, 4)
        self.conv3 = nn.Conv1d(config.char_emb_size, 50, 5)

        self.feature_emb = nn.Embedding(config.span_width, config.feature_size)
        self.feature_emb_dropout = nn.Dropout(p=0.2, inplace=True)

        self.mention_distance_emb = nn.Embedding(10, config.feature_size)
        self.distance_drop = nn.Dropout(p=0.2, inplace=True)

        self.genre_emb = nn.Embedding(7, config.feature_size)
        self.speaker_emb = nn.Embedding(2, config.feature_size)

        self.bilstm = VarLSTM(input_size=350 + 150 * config.use_CNN + config.use_elmo * 1024,
                              hidden_size=200,
                              bidirectional=True,
                              batch_first=True,
                              hidden_dropout=0.2)
        # self.bilstm = nn.LSTM(input_size=500, hidden_size=200, bidirectional=True, batch_first=True)
        self.h0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.c0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.bilstm_drop = nn.Dropout(p=0.2, inplace=True)

        self.atten = ffnn(input_size=400, hidden_size=config.atten_hidden_size, output_size=1)
        self.mention_score = ffnn(input_size=1320, hidden_size=config.mention_hidden_size, output_size=1)
        self.sa = ffnn(input_size=3980+40*config.use_metadata, hidden_size=config.sa_hidden_size, output_size=1)
        self.mention_start_np = None
        self.mention_end_np = None
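
For reference, a condensed, standalone sketch of just the embedding-loading pattern used in this example; the stand-in vocabulary, the GloVe/Turian file paths, and the import lines are assumptions, not taken from the original source:

import numpy as np
import torch
import torch.nn as nn
from fastNLP import Vocabulary
from fastNLP.io import EmbedLoader

# a tiny stand-in vocabulary; in the example above `vocab` is supplied by the caller
vocab = Vocabulary()
vocab.add_word_lst("the model loads pretrained word vectors".split())
vocab.build_vocab()

# load two pretrained embeddings aligned to the same vocabulary (paths are placeholders)
emb1 = EmbedLoader().load_with_vocab('glove.840B.300d.txt', vocab, normalize=False)
emb2 = EmbedLoader().load_with_vocab('turian.50d.txt', vocab, normalize=False)
pre_emb = np.concatenate((emb1, emb2), axis=1)
pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)  # unit-length rows

emb = nn.Embedding(len(vocab), pre_emb.shape[1])
emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
emb.weight.requires_grad = False  # keep the pretrained vectors frozen
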
Example #2
    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataBundle(datasets=self.load(paths))
        _train_ds = ([info.datasets[name] for name in train_ds]
                     if train_ds else info.datasets.values())
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name,
                                new_field_name=input_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)
        info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)
        return info
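
A hypothetical call of this process method; `loader` stands for an instance of whatever class defines it, and the path dict is a placeholder:

bundle = loader.process({'train': 'train.txt', 'dev': 'dev.txt'})
train_ds = bundle.datasets['train']   # 'words' and 'target' are already indexed
src_vocab = bundle.vocabs['words']
tgt_vocab = bundle.vocabs['target']
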
Example #3
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1,
                           unknown='unk',
                           padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)

        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review

        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)

        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)

    return glove_data, matrix, len(glove_data[0]['reviewText'])
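
A hypothetical call of word_to_id; the cache paths, GloVe file, and review file below are placeholders, not taken from the original source:

data, glove_matrix, review_len = word_to_id(
    glove_data='cache/glove_data.pkl',
    glove_matrix='cache/glove_matrix.pkl',
    vocab_dict_path='glove.6B.300d.txt',
    file_path='reviews.json')
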
Example #4
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars
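        # e.g. wordtochar(['New', 'York']) -> ['n', 'e', 'w', '', 'y', 'o', 'r', 'k'];
        # the empty string marks each word boundary and the trailing one is popped off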

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(
            0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #5
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    #ds = DataSet.read_pos(path, headers=('sentence', 'label'), sep='\t')

    #ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    #ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='word_seq_origin_len',
             is_input=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000,
                           min_freq=2,
                           unknown='<unk>',
                           padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq',
             is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth',
             is_input=True,
             is_target=True)

    if not load_glove:
        print(f"successful load dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove',
                                                vocab)

    print(f"successful load dataset and embedding from {path}")

    return ds, embedding, (vocab, vocab_label)
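
A hypothetical invocation, with a placeholder data directory and GloVe path:

ds, embedding, (vocab, vocab_label) = load_conll_with_glove(
    'data/pos', data_path='train.pos', glove_path='glove.6B.300d.txt')
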
Example #6
def train(path):
    # test saving pipeline
    save_pipe(path)
    embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'],
                                            emb_file_name, word_v)
    embed = torch.tensor(embed, dtype=torch.float32)

    # embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)
    # embed = torch.tensor(embed, dtype=torch.float32)
    # model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=True)
    model.word_embedding.padding_idx = word_v.padding_idx
    model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
    model.pos_embedding.padding_idx = pos_v.padding_idx
    model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)

    class MyCallback(Callback):
        def on_step_end(self, optimizer):
            step = self.trainer.step
            # learning rate decay
            if step > 0 and step % 1000 == 0:
                for pg in optimizer.param_groups:
                    pg['lr'] *= 0.93
                print('decay lr to {}'.format(
                    [pg['lr'] for pg in optimizer.param_groups]))

            if step == 3000:
                # start training embedding
                print('start training embedding at {}'.format(step))
                model = self.trainer.model
                for m in model.modules():
                    if isinstance(m, torch.nn.Embedding):
                        m.weight.requires_grad = True

    # Trainer
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=ParserLoss(),
                      metrics=ParserMetric(),
                      metric_key='UAS',
                      **train_args.data,
                      optimizer=fastNLP.Adam(**optim_args.data),
                      save_path=path,
                      callbacks=[MyCallback()])

    # Start training
    try:
        trainer.train()
        print("Training finished!")
    finally:
        # save pipeline
        save_pipe(path)
        print('pipe saved')
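
For scale, the schedule in MyCallback multiplies the learning rate by 0.93 every 1000 steps, so its cumulative effect can be checked directly:

print(0.93 ** 10)  # ≈ 0.484 of the initial learning rate after 10,000 steps
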
Example #7
def load_dataset_with_glove(data_dir,
                            data_path='mr.task.train',
                            glove_path="",
                            load_glove=True,
                            vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')

    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(),
             new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='mask',
             is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000,
                           min_freq=2,
                           unknown='<unk>',
                           padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data',
             is_input=True)

    if not load_glove:
        print(f"successful load dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove',
                                                vocab)

    print(f"successful load dataset and embedding from {path}")

    return ds, embedding, vocab
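
A hypothetical invocation, with a placeholder data directory and GloVe path:

ds, embedding, vocab = load_dataset_with_glove(
    'data/mr', data_path='mr.task.train', glove_path='glove.6B.300d.txt')
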
Example #8
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
        
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--method",
        default='cnn',
        help="train model and test it",
        choices=['cnn', 'cnn_glove', 'rnn', 'rnn_maxpool', 'rnn_avgpool'])
    parser.add_argument("--dataset",
                        default='1',
                        help="1: small dataset; 2: big dataset",
                        choices=['1', '2'])
    args = parser.parse_args()

    # Hyperparameters
    embedding_dim = 256
    batch_size = 32
    # RNN
    hidden_dim = 256
    # CNN
    kernel_sizes = (3, 4, 5)
    num_channels = (120, 160, 200)
    acti_function = 'relu'

    learning_rate = 1e-3
    train_patience = 8
    cate_num = 4

    # GloVe
    embedding_file_path = "glove.6B.100d.txt"

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    vocab = read_vocab("vocab.txt")
    print("vocabulary length:", len(vocab))
    train_data = DataSet().load("train_set")
    dev_data = DataSet().load("dev_set")
    test_data = DataSet().load("test_set")

    if args.dataset == '1':
        cate_num = 4
        num_channels = (48, 48, 48)
        embedding_dim = 128
        hidden_dim = 128
    elif args.dataset == '2':
        cate_num = 20

    if args.method == 'cnn':
        model = TextCNN(vocab_size=len(vocab),
                        embedding_dim=embedding_dim,
                        kernel_sizes=kernel_sizes,
                        num_channels=num_channels,
                        num_classes=cate_num,
                        activation=acti_function)
    elif args.method == 'cnn_glove':
        glove_embedding = EmbedLoader.load_with_vocab(embedding_file_path,
                                                      vocab)
        embedding_dim = glove_embedding.shape[1]
        print("GloVe embedding_dim:", embedding_dim)

        model = TextCNN_glove(vocab_size=len(vocab),
                              embedding_dim=embedding_dim,
                              kernel_sizes=kernel_sizes,
                              num_channels=num_channels,
                              num_classes=cate_num,
                              activation=acti_function)
        model.embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.weight.requires_grad = False
        model.embedding.weight.requires_grad = True
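        # two copies of the GloVe weights: constant_embedding stays frozen while
        # embedding is fine-tuned along with the rest of the model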

    elif args.method == 'rnn':
        embedding_dim = 128
        hidden_dim = 128
        model = BiRNNText(vocab_size=len(vocab),
                          embedding_dim=embedding_dim,
                          output_dim=cate_num,
                          hidden_dim=hidden_dim)
    elif args.method == 'rnn_maxpool':
        model = BiRNNText_pool(vocab_size=len(vocab),
                               embedding_dim=embedding_dim,
                               output_dim=cate_num,
                               hidden_dim=hidden_dim,
                               pool_name="max")
    elif args.method == 'rnn_avgpool':
        model = BiRNNText_pool(vocab_size=len(vocab),
                               embedding_dim=embedding_dim,
                               output_dim=cate_num,
                               hidden_dim=hidden_dim,
                               pool_name="avg")

    tester = Tester(test_data, model, metrics=AccuracyMetric())

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET),
        metrics=AccuracyMetric(),
        n_epochs=80,
        batch_size=batch_size,
        print_every=10,
        validate_every=-1,
        dev_data=dev_data,
        optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate),
        check_code_level=2,
        metric_key='acc',
        use_tqdm=True,
        callbacks=[EarlyStopCallback(train_patience)],
        device=device,
    )

    trainer.train()
    tester.test()
Example #10
    tag_v = Vocabulary(need_default=False)
    train_data = loader.load(os.path.join(datadir, train_data_name))
    dev_data = loader.load(os.path.join(datadir, dev_data_name))
    test_data = loader.load(os.path.join(datadir, test_data_name))
    train_data.update_vocab(word_seq=word_v, pos_seq=pos_v, head_labels=tag_v)
    datasets = (train_data, dev_data, test_data)
    save_data(processed_datadir,
              word_v=word_v,
              pos_v=pos_v,
              tag_v=tag_v,
              train_data=train_data,
              dev_data=dev_data,
              test_data=test_data)

embed, _ = EmbedLoader.load_embedding(
    model_args['word_emb_dim'], emb_file_name, 'glove', word_v,
    os.path.join(processed_datadir, 'word_emb.pkl'))

print(len(word_v))
print(embed.size())

# Model
model_args['word_vocab_size'] = len(word_v)
model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)

model = BiaffineParser(**model_args.data)
model.reset_parameters()
datasets = (train_data, dev_data, test_data)
for ds in datasets:
    ds.index_field("word_seq", word_v).index_field("pos_seq",
Example #11
    def process(self,
                paths: Union[str, Dict[str, str]],
                char_vocab_opt: VocabularyOption = None,
                char_embed_opt: EmbeddingOption = None,
                bigram_vocab_opt: VocabularyOption = None,
                bigram_embed_opt: EmbeddingOption = None,
                L: int = 4):
        """
        支持的数据格式为一行一个sample,并且用空格隔开不同的词语。例如

        Option::

            共同  创造  美好  的  新  世纪  ——  二○○一年  新年  贺词
            (  二○○○年  十二月  三十一日  )  (  附  图片  1  张  )
            女士  们  ,  先生  们  ,  同志  们  ,  朋友  们  :

        paths支持两种格式,第一种是str,第二种是Dict[str, str].

        Option::

            # 1. str类型
            # 1.1 传入具体的文件路径
            data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # 将读取data.txt的内容
            # 包含以下的内容data.vocabs['chars']:Vocabulary对象,
            #             data.vocabs['target']: Vocabulary对象,根据encoding_type可能会没有该值
            #             data.embeddings['chars']: Embedding对象. 只有提供了预训练的词向量的路径才有该项
            #             data.datasets['train']: DataSet对象
            #                   包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            # 1.2 传入一个目录, 里面必须包含train.txt文件
            data = SigHanLoader('bmes').process('path/to/cws/') #将尝试在该目录下读取 train.txt, test.txt以及dev.txt
            # 包含以下的内容data.vocabs['chars']: Vocabulary对象
            #             data.vocabs['target']:Vocabulary对象
            #             data.embeddings['chars']: 仅在提供了预训练embedding路径的情况下,为Embedding对象;
            #             data.datasets['train']: DataSet对象
            #                    包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            #             data.datasets['dev']: DataSet对象,如果文件夹下包含了dev.txt;内容与data.datasets['train']一样

            # 2. dict类型, key是文件的名称,value是对应的读取路径. 必须包含'train'这个key
            paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'}
            data = SigHanLoader(paths).process(paths)
            # 结果与传入目录时是一致的,但是可以传入多个数据集。data.datasets中的key将与这里传入的一致

        :param paths: 支持传入目录,文件路径,以及dict。
        :param char_vocab_opt: 用于构建chars的vocabulary参数,默认为min_freq=2
        :param char_embed_opt: 用于读取chars的Embedding的参数,默认不读取pretrained的embedding
        :param bigram_vocab_opt: 用于构建bigram的vocabulary参数,默认不使用bigram, 仅在指定该参数的情况下会带有bigrams这个field。
            为List[int], 每个instance长度与chars一样, abcde的bigram为ab bc cd de e<eos>
        :param bigram_embed_opt: 用于读取预训练bigram的参数,仅在传入bigram_vocab_opt有效
        :param L: 当target_type为shift_relay时传入的segment长度
        :return:
        """
        # check_dataloader_paths is the recommended way to validate the given paths
        paths = check_dataloader_paths(paths)
        datasets = {}
        data = DataBundle()
        bigram = bigram_vocab_opt is not None
        for name, path in paths.items():
            dataset = self.load(path, bigram=bigram)
            datasets[name] = dataset
        input_fields = []
        target_fields = []
        # build the character vocabulary
        char_vocab = Vocabulary(
            min_freq=2) if char_vocab_opt is None else Vocabulary(
                **char_vocab_opt)
        char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
        char_vocab.index_dataset(*datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name='chars')
        data.vocabs[Const.CHAR_INPUT] = char_vocab
        input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
        target_fields.append(Const.TARGET)
        # build the target vocabulary
        if self.target_type == 'bmes':
            target_vocab = Vocabulary(unknown=None, padding=None)
            target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 +
                                      ['S'])
            target_vocab.index_dataset(*datasets.values(), field_name='target')
            data.vocabs[Const.TARGET] = target_vocab
        if char_embed_opt is not None:
            char_embed = EmbedLoader.load_with_vocab(**char_embed_opt,
                                                     vocab=char_vocab)
            data.embeddings['chars'] = char_embed
        if bigram:
            bigram_vocab = Vocabulary(**bigram_vocab_opt)
            bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
            bigram_vocab.index_dataset(*datasets.values(),
                                       field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            if bigram_embed_opt is not None:
                bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt,
                                                           vocab=bigram_vocab)
                data.embeddings['bigrams'] = bigram_embed
            input_fields.append('bigrams')
        if self.target_type == 'shift_relay':
            func = partial(self._clip_target, L=L)
            for name, dataset in datasets.items():
                res = dataset.apply_field(func, field_name='target')
                relay_target = [res_i[0] for res_i in res]
                relay_mask = [res_i[1] for res_i in res]
                dataset.add_field('relay_target',
                                  relay_target,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
                dataset.add_field('relay_mask',
                                  relay_mask,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
        if self.target_type == 'shift_relay':
            input_fields.extend(['end_seg_mask'])
            target_fields.append('start_seg_mask')
        # add each dataset to the DataBundle
        for name, dataset in datasets.items():
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)
            data.datasets[name] = dataset

        return data
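
A hypothetical invocation following the docstring above; the constructor argument and paths are placeholders:

data = SigHanLoader('bmes').process({'train': '/path/to/train/train.txt'})
chars_vocab = data.vocabs['chars']
train_ds = data.datasets['train']   # fields include raw_chars, chars and target
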
Example #12
def main():
    parser = argparse.ArgumentParser(description='Summarization Model')

    # Where to find data
    parser.add_argument(
        '--data_path',
        type=str,
        default='/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl',
        help='Path expression to pickle datafiles.')
    parser.add_argument(
        '--valid_path',
        type=str,
        default='/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl',
        help='Path expression to pickle valid datafiles.')
    parser.add_argument('--vocab_path',
                        type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/vocab',
                        help='Path expression to text vocabulary file.')

    # Important settings
    parser.add_argument('--mode',
                        choices=['train', 'test'],
                        default='train',
                        help='must be one of train/test')
    parser.add_argument('--embedding',
                        type=str,
                        default='glove',
                        choices=['word2vec', 'glove', 'elmo', 'bert'],
                        help='must be one of word2vec/glove/elmo/bert')
    parser.add_argument('--sentence_encoder',
                        type=str,
                        default='transformer',
                        choices=['bilstm', 'deeplstm', 'transformer'],
                        help='must be one of LSTM/Transformer')
    parser.add_argument('--sentence_decoder',
                        type=str,
                        default='SeqLab',
                        choices=['PN', 'SeqLab'],
                        help='must be one of PN/SeqLab')
    parser.add_argument(
        '--restore_model',
        type=str,
        default='None',
        help=
        'Restore model for further training. [bestmodel/bestFmodel/earlystop/None]'
    )

    # Where to save output
    parser.add_argument('--save_root',
                        type=str,
                        default='save/',
                        help='Root directory for all model.')
    parser.add_argument('--log_root',
                        type=str,
                        default='log/',
                        help='Root directory for all logging.')

    # Hyperparameters
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='use cuda')
    parser.add_argument(
        '--vocab_size',
        type=int,
        default=100000,
        help=
        'Size of vocabulary. These will be read from the vocabulary file in order. If the vocabulary file contains fewer words than this number, or if this number is set to 0, will take all words in the vocabulary file.'
    )
    parser.add_argument('--n_epochs',
                        type=int,
                        default=20,
                        help='Number of epochs [default: 20]')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Mini batch size [default: 32]')

    parser.add_argument('--word_embedding',
                        action='store_true',
                        default=True,
                        help='whether to use Word embedding')
    parser.add_argument('--embedding_path',
                        type=str,
                        default='/remote-home/dqwang/Glove/glove.42B.300d.txt',
                        help='Path expression to external word embedding.')
    parser.add_argument('--word_emb_dim',
                        type=int,
                        default=300,
                        help='Word embedding size [default: 300]')
    parser.add_argument(
        '--embed_train',
        action='store_true',
        default=False,
        help='whether to train Word embedding [default: False]')
    parser.add_argument('--min_kernel_size',
                        type=int,
                        default=1,
                        help='kernel min length for CNN [default:1]')
    parser.add_argument('--max_kernel_size',
                        type=int,
                        default=7,
                        help='kernel max length for CNN [default:7]')
    parser.add_argument('--output_channel',
                        type=int,
                        default=50,
                        help='output channel: repeated times for one kernel')
    parser.add_argument('--use_orthnormal_init',
                        action='store_true',
                        default=True,
                        help='use orthonormal init for lstm [default: true]')
    parser.add_argument(
        '--sent_max_len',
        type=int,
        default=100,
        help='max length of sentences (max source text sentence tokens)')
    parser.add_argument(
        '--doc_max_timesteps',
        type=int,
        default=50,
        help='max length of documents (max timesteps of documents)')
    parser.add_argument('--save_label',
                        action='store_true',
                        default=False,
                        help='require multihead attention')

    # Training
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate')
    parser.add_argument('--lr_descent',
                        action='store_true',
                        default=False,
                        help='learning rate descent')
    parser.add_argument('--grad_clip',
                        action='store_true',
                        default=False,
                        help='for gradient clipping')
    parser.add_argument(
        '--max_grad_norm',
        type=float,
        default=10,
        help='for gradient clipping max gradient normalization')

    # test
    parser.add_argument('-m',
                        type=int,
                        default=3,
                        help='decode summary length')
    parser.add_argument(
        '--test_model',
        type=str,
        default='evalbestmodel',
        help=
        'choose different model to test [evalbestmodel/evalbestFmodel/trainbestmodel/trainbestFmodel/earlystop]'
    )
    parser.add_argument('--use_pyrouge',
                        action='store_true',
                        default=False,
                        help='use_pyrouge')

    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    torch.set_printoptions(threshold=50000)

    # File paths
    DATA_FILE = args.data_path
    VALID_FILE = args.valid_path
    VOCAL_FILE = args.vocab_path
    LOG_PATH = args.log_root

    # # train_log setting
    if not os.path.exists(LOG_PATH):
        if args.mode == "train":
            os.makedirs(LOG_PATH)
        else:
            raise Exception(
                "[Error] Logdir %s doesn't exist. Run in train mode to create it."
                % (LOG_PATH))
    nowTime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    log_path = os.path.join(LOG_PATH, args.mode + "_" + nowTime)
    # logger = _init_logger(path=log_path)
    # file_handler = logging.FileHandler(log_path)
    # file_handler.setFormatter(formatter)
    # logger.addHandler(file_handler)

    logger.info("Pytorch %s", torch.__version__)

    # dataset
    hps = args
    dbPipe = ExtCNNDMPipe(vocab_size=hps.vocab_size,
                          vocab_path=VOCAL_FILE,
                          sent_max_len=hps.sent_max_len,
                          doc_max_timesteps=hps.doc_max_timesteps)
    if hps.mode == 'test':
        hps.recurrent_dropout_prob = 0.0
        hps.atten_dropout_prob = 0.0
        hps.ffn_dropout_prob = 0.0
        logger.info(hps)
        paths = {"test": DATA_FILE}
        db = dbPipe.process_from_file(paths)
    else:
        paths = {"train": DATA_FILE, "valid": VALID_FILE}
        db = dbPipe.process_from_file(paths)

    # embedding
    if args.embedding == "glove":
        vocab = db.get_vocab("vocab")
        embed = torch.nn.Embedding(len(vocab), hps.word_emb_dim)
        if hps.word_embedding:
            embed_loader = EmbedLoader()
            pretrained_weight = embed_loader.load_with_vocab(
                hps.embedding_path, vocab)  # words not found in the file keep a random init
            embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
            embed.weight.requires_grad = hps.embed_train
    else:
        logger.error("[ERROR] embedding To Be Continued!")
        sys.exit(1)

    # model
    if args.sentence_encoder == "transformer" and args.sentence_decoder == "SeqLab":
        model_param = json.load(open("config/transformer.config", "rb"))
        hps.__dict__.update(model_param)
        model = TransformerModel(hps, embed)
    elif args.sentence_encoder == "deeplstm" and args.sentence_decoder == "SeqLab":
        model_param = json.load(open("config/deeplstm.config", "rb"))
        hps.__dict__.update(model_param)
        model = SummarizationModel(hps, embed)
    else:
        logger.error("[ERROR] Model To Be Continued!")
        sys.exit(1)
    if hps.cuda:
        model = model.cuda()
        logger.info("[INFO] Use cuda")

    logger.info(hps)

    if hps.mode == 'train':
        db.get_dataset("valid").set_target("text", "summary")
        setup_training(model, db.get_dataset("train"), db.get_dataset("valid"),
                       hps)
    elif hps.mode == 'test':
        logger.info("[INFO] Decoding...")
        db.get_dataset("test").set_target("text", "summary")
        run_test(model, db.get_dataset("test"), hps, limited=hps.limited)
    else:
        logger.error("The 'mode' flag must be one of train/eval/test")
        raise ValueError("The 'mode' flag must be one of train/eval/test")
Example #13
# In[5]:

# 1. get dataset
dataset = load_data('data/train.tsv', 1)
train_dataset, val_dataset = dataset.split(0.1)
test_dataset = load_data('data/test.tsv', 0)
print("train_dataset size: ", train_dataset.get_length())
print("val_dataset size: ", val_dataset.get_length())
print("test_dataset size: ", test_dataset.get_length())

# In[6]:

# 2. get vocabulary
if use_pretrain:
    loader = EmbedLoader()
    pre_embed, vocab = loader.load_without_vocab(embed_path, normalize=False)
    embedding_size = pre_embed.shape[1]
else:
    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
print("vocabulary size: ", len(vocab))

# In[7]:

# 3. word to index
vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
vocab.index_dataset(val_dataset, field_name='words', new_field_name='words')
vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

# ### 3. Build CNN model
Example #14
    def test_case(self):
        vocab = Vocabulary()
        vocab.update(["the", "in", "I", "to", "of", "hahaha"])
        embedding = EmbedLoader().fast_load_embedding(
            50, "test/data_for_tests/glove.6B.50d_test.txt", vocab)
        self.assertEqual(tuple(embedding.shape), (len(vocab), 50))