Example #1
def _get_vocab(data_list):
    vocab = Vocabulary(unknown=unk_str, padding=pad_str)
    for l in data_list:
        vocab.add_word_lst(l)
    vocab.build_vocab()
    print('vocab', len(vocab))
    return vocab
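
A minimal, hypothetical usage sketch for the helper above; it assumes unk_str and pad_str are defined in the same module (for example '<unk>' and '<pad>') and that Vocabulary is imported from fastNLP. to_index falls back to the unknown index for out-of-vocabulary words:

vocab = _get_vocab([['hello', 'world'], ['hello', 'fastNLP']])
print(vocab.to_index('hello'))            # index of a known word
print(vocab.to_index('missing'))          # equals vocab.unknown_idx for OOV words
print(vocab.to_word(vocab.padding_idx))   # the padding string, i.e. pad_str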
Example #2
    def test_same_vector5(self):
        # check that the word vectors stay consistent when min_freq is used
        word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
        no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=False,
                                min_freq=2)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        min_freq_embed = StaticEmbedding(min_freq_vocab,
                                         model_dir_or_name='en-glove-6B-100d',
                                         lower=False)
        min_freq_words = torch.LongTensor(
            [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
        min_freq_words = min_freq_embed(min_freq_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], min_freq_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(
                    min_freq_embed.embed_size)
Example #3
    def test_save_and_load(self):
        fp = 'vocab_save_test.txt'
        try:
            # check that word2idx is unchanged and that no_create_entry is preserved
            words = list('abcdefaddfdkjfe')
            no_create_entry = list('12342331')
            unk = '[UNK]'
            vocab = Vocabulary(unknown=unk, max_size=500)

            vocab.add_word_lst(words)
            vocab.add_word_lst(no_create_entry, no_create_entry=True)
            vocab.save(fp)

            new_vocab = Vocabulary.load(fp)

            for word, index in vocab:
                self.assertEqual(new_vocab.to_index(word), index)
            for word in no_create_entry:
                self.assertTrue(new_vocab._is_word_no_create_entry(word))
            for word in words:
                self.assertFalse(new_vocab._is_word_no_create_entry(word))
            for idx in range(len(vocab)):
                self.assertEqual(vocab.to_word(idx), new_vocab.to_word(idx))
            self.assertEqual(vocab.unknown, new_vocab.unknown)
        finally:
            # clean up the temporary vocab file whether or not the test failed
            import os
            if os.path.exists(fp):
                os.remove(fp)
Example #4
    def test_same_vector4(self):
        # verify lower together with min_freq
        word_lst = ["The", "the", "the", "The", "a", "A"]
        no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor(
            [[lowered_vocab.to_index(word.lower()) for word in all_words]])
        lowered_words = lowered_embed(lowered_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], lowered_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #5
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab,
                                model_dir_or_name=None,
                                embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
        src_seq_len = torch.LongTensor([3, 2])
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            for attention in [True, False]:
                with self.subTest(bind_decoder_input_output_embed=flag,
                                  attention=attention):
                    decoder = LSTMSeq2SeqDecoder(
                        embed=embed,
                        num_layers=2,
                        hidden_size=10,
                        dropout=0.3,
                        bind_decoder_input_output_embed=flag,
                        attention=attention)
                    state = decoder.init_state(encoder_output, encoder_mask)
                    output = decoder(tgt_words_idx, state)
                    self.assertEqual(tuple(output.size()), (2, 4, len(vocab)))
Example #6
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        src_seq_len = torch.LongTensor([3, 2])
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            with self.subTest(bind_decoder_input_output_embed=flag):
                decoder = TransformerSeq2SeqDecoder(
                    embed=embed,
                    pos_embed=None,
                    d_model=10,
                    num_layers=2,
                    n_head=5,
                    dim_ff=20,
                    dropout=0.1,
                    bind_decoder_input_output_embed=flag)
                state = decoder.init_state(encoder_output, encoder_mask)
                output = decoder(tokens=torch.randint(0,
                                                      len(vocab),
                                                      size=(2, 4)),
                                 state=state)
                self.assertEqual(output.size(), (2, 4, len(vocab)))
Example #7
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1,
                           unknown='unk',
                           padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)

        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review

        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)

        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)

    return glove_data, matrix, len(glove_data[0]['reviewText'])
Example #8
    def test_search(self):
        """Semantic search. TypeError: expected dimension <= 2 array or matrix
        """
        print('{} test_search {}'.format('-' * 15, '-' * 15))
        texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
        # vectorize the texts
        vocab = Vocabulary()
        for text in texts:
            vocab.add_word_lst(list(text))
        print(len(vocab))
        embed = StaticEmbedding(
            vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
        texts_to_id = [[vocab.to_index(word) for word in list(text)]
                       for text in texts]
        words = torch.LongTensor(texts_to_id)  # convert the texts to indices
        features_vec = embed(words)
        print(features_vec.shape)
        # build the search index!
        cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
        search_texts = ['朱日和站', '温都尔站', '国电站']
        for text in search_texts:
            texts_to_id = [[vocab.to_index(word) for word in list(text)]]
            words = torch.LongTensor(texts_to_id)  # convert the text to indices
            features_vec = embed(words)
            search_features_vec = features_vec.detach().numpy()
            search_result = cp.search(search_features_vec,
                                      k=2,
                                      k_clusters=2,
                                      return_distance=True)
            print('text:{}'.format(text))
            print('search_result:{}'.format(search_result))
Example #9
    def test_same_vector3(self):
        # verify lower
        word_lst = ["The", "the"]
        no_create_word_lst = ['of', 'Of', 'With', 'with']
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor(
            [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor([[
            lowered_vocab.to_index(word)
            for word in lowered_word_lst + lowered_no_create_word_lst
        ]])
        lowered_words = lowered_embed(lowered_words)

        all_words = word_lst + no_create_word_lst

        for idx, (word_i, word_j) in enumerate(zip(words[0],
                                                   lowered_words[0])):
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #10
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))

    return vocabulary
Example #11
    def test_Index2WordProcessor(self):
        vocab = Vocabulary()
        vocab.add_word_lst(["a", "b", "c", "d", "e"])
        proc = Index2WordProcessor(vocab, "tag_id", "tag")
        data_set = DataSet(
            [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
        data_set = proc(data_set)
        self.assertTrue("tag" in data_set)
Example #12
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet = "0123456789,"
    char_list = [c for c in alphabet]
    vocabulary = Vocabulary(padding='<pad>', unknown='<unk>')
    vocabulary.add_word_lst(char_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx, 'unk', vocabulary.unknown_idx)
    return vocabulary
Example #13
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    return embed, encoder_output, encoder_mask
Example #14
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    vocabulary = Vocabulary(min_freq=min_freq, padding='<pad>', unknown='<unk>')
    for filename in data:
        for value in data[filename]:
            for word_list in data[filename][value]['data']:
                vocabulary.add_word_lst(word_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx, 'unk', vocabulary.unknown_idx)
    return vocabulary
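
A quick sketch of the effect of min_freq in the vocabulary built above (assuming the fastNLP defaults): words whose count stays below min_freq are dropped when build_vocab() runs and map to the unknown index afterwards.

from fastNLP import Vocabulary

v = Vocabulary(min_freq=2, padding='<pad>', unknown='<unk>')
v.add_word_lst(['rare', 'common', 'common'])
v.build_vocab()
assert v.to_index('common') != v.unknown_idx
assert v.to_index('rare') == v.unknown_idx  # seen only once, below min_freq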
Example #15
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = torch.LongTensor([[3, 1, 2], [1, 2, 0]])
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    tgt_seq_len = torch.LongTensor([4, 2])

    return embed, src_words_idx, tgt_words_idx, src_seq_len, tgt_seq_len
Example #16
def load_dataset(
    data_dir='/remote-home/ygxu/workspace/Product_all',
    data_path='mr.task.train',
    # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
    bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):

    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')

    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')

    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq',
             is_target=True)

    def transfer_bert_to_fastnlp(ins):
        bert_text = ins['bert_tokenize_list']
        return ("[CLS] " + " ".join(bert_text)).strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])

    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x:
             [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words',
             is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks',
             is_input=True)

    return ds
Example #17
def read_vocab(file_name):
    # read the vocab file
    with open(file_name) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # instantiate a Vocabulary
    vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    # add the word list to the Vocabulary
    vocab.add_word_lst(vocabs)
    # build the vocabulary
    vocab.build_vocab()
    return vocab
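
A hypothetical usage example for read_vocab (the file name and words below are made up): each line of the file becomes one vocabulary entry, and words not found in the file fall back to the '<unk>' index.

vocab = read_vocab('vocab.txt')  # hypothetical path
ids = [vocab.to_index(w) for w in ['the', 'definitely-unseen-word']]
print(ids)  # the second id equals vocab.unknown_idx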
Example #18
    def test_rebuild(self):
        # test that adding new words after build keeps the original word order
        vocab = Vocabulary()
        text = [str(idx) for idx in range(10)]
        vocab.update(text)
        for i in text:
            self.assertEqual(int(i) + 2, vocab.to_index(i))
        indexes = []
        for word, index in vocab:
            indexes.append((word, index))
        vocab.add_word_lst([str(idx) for idx in range(10, 13)])
        for idx, pair in enumerate(indexes):
            self.assertEqual(pair[1], vocab.to_index(pair[0]))
        for i in range(13):
            self.assertEqual(int(i) + 2, vocab.to_index(str(i)))
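
The int(i) + 2 offset in the assertions above comes from Vocabulary's defaults: index 0 is reserved for the padding token and index 1 for the unknown token, so the first added word receives index 2. A minimal check, as a sketch:

from fastNLP import Vocabulary

v = Vocabulary()            # padding='<pad>' at index 0, unknown='<unk>' at index 1 by default
v.add_word_lst(['first'])
assert v.to_index('first') == 2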
Example #19
    def test_fit(self):
        """Text encoding.
        """
        print('{} test_fit {}'.format('-' * 15, '-' * 15))
        texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
        vocab = Vocabulary()
        for text in texts:
            vocab.add_word_lst(list(text))
        print(len(vocab))
        embed = StaticEmbedding(
            vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
        texts_to_id = [[vocab.to_index(word) for word in list(text)]
                       for text in ['朱日和', '东台变']]
        print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
        words = torch.LongTensor(texts_to_id)  # convert the texts to indices
        print(embed(words).size())  # torch.Size([2, 3, 100])
Example #20
def get_vocab(trainset, testset):
    # build the vocabularies and the word-to-index mappings
    # tok
    tok_vocab = Vocabulary()
    tok_vocab.from_dataset(trainset,
                           field_name="tok",
                           no_create_entry_dataset=testset)
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="tok",
                            new_field_name="chars")
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="asp",
                            new_field_name="aspect")
    # deprel
    dep_vocab = Vocabulary()
    dep_vocab.from_dataset(trainset, field_name="deprel")
    dep_vocab.index_dataset(trainset,
                            testset,
                            field_name="deprel",
                            new_field_name="depidx")
    # pol(target)
    pol_vocab = Vocabulary(padding=None, unknown=None)
    pol_vocab.from_dataset(trainset, field_name="pol")
    pol_vocab.index_dataset(trainset,
                            testset,
                            field_name="pol",
                            new_field_name="target")
    # pos
    pos_vocab = Vocabulary()
    pos_vocab.from_dataset(trainset, field_name="pos")
    pos_vocab.index_dataset(trainset,
                            testset,
                            field_name="pos",
                            new_field_name="posidx")
    # post
    max_len = max(max(trainset["seq_len"]), max(testset["seq_len"]))
    post_vocab = Vocabulary()
    post_vocab.add_word_lst(list(range(-max_len, max_len)))
    post_vocab.index_dataset(trainset,
                             testset,
                             field_name="post",
                             new_field_name="postidx")
    return tok_vocab, pos_vocab, post_vocab, trainset, testset
Example #21
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append((Instance(content=ans,
                                 target=int(train_data.target[i]))))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words',
                  is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()
    # index the sentences with Vocabulary.to_index(word)
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)
Example #22
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = [[3, 1, 2], [1, 2]]
    # tgt_words_idx = [[1, 2, 3, 4], [2, 3]]
    src_seq_len = [3, 2]
    # tgt_seq_len = [4, 2]

    ds = DataSet({
        'src_tokens': src_words_idx,
        'src_seq_len': src_seq_len,
        'tgt_tokens': src_words_idx,
        'tgt_seq_len': src_seq_len
    })

    ds.set_input('src_tokens', 'tgt_tokens', 'src_seq_len')
    ds.set_target('tgt_seq_len', 'tgt_tokens')

    return embed, ds
Example #23
def get_data(filepath):
    data = np.load(filepath, allow_pickle=True)
    data, _, ix2word = (data['data'], data['word2ix'].item(),
                        data['ix2word'].item())
    wordlist = []
    for d in data:
        for ix in d:
            wordlist.append(ix2word[ix])
    vocab = Vocabulary(min_freq=10, padding="</s>")
    vocab.add_word_lst(wordlist)
    vocab.build_vocab()
    # vocab = Vocabulary(min_freq=10, padding="</s>").add_word_lst(wordlist).build_vocab()
    vocab_size = len(vocab.word2idx)
    for d in data:
        for i in range(len(d)):
            # d[i] = vocab[vocab.to_word(d[i])]
            if d[i] >= vocab_size:
                d[i] = vocab["<unk>"]

    print(vocab_size)

    return data, vocab
Example #24
    def test1(self):
        # test that the confusion matrix can be printed
        from fastNLP import Vocabulary
        from fastNLP.core.utils import ConfusionMatrix
        import numpy as np
        vocab = Vocabulary(unknown=None, padding=None)
        vocab.add_word_lst(list('abcdef'))
        confusion_matrix = ConfusionMatrix(vocab)
        for _ in range(3):
            length = np.random.randint(1, 5)
            pred = np.random.randint(0, 3, size=(length,))
            target = np.random.randint(0, 3, size=(length,))
            confusion_matrix.add_pred_target(pred, target)
        print(confusion_matrix)

        # test print_ratio
        confusion_matrix = ConfusionMatrix(vocab, print_ratio=True)
        for _ in range(3):
            length = np.random.randint(1, 5)
            pred = np.random.randint(0, 3, size=(length,))
            target = np.random.randint(0, 3, size=(length,))
            confusion_matrix.add_pred_target(pred, target)
        print(confusion_matrix)
Example #25
    def test_no_entry(self):
        # build the vocabulary first, then vary no_create_entry and check it is tracked correctly
        text = [
            "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
            "well", "in", "works", "well", "in", "most", "cases", "scales",
            "well"
        ]
        vocab = Vocabulary()
        vocab.add_word_lst(text)

        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))
        vocab.add_word('FastNLP', no_create_entry=True)
        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))

        vocab.add_word('fastnlp', no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('fastnlp'))
        vocab.add_word('fastnlp', no_create_entry=False)
        self.assertFalse(vocab._is_word_no_create_entry('fastnlp'))

        vocab.add_word_lst(['1'] * 10, no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('1'))
        vocab.add_word('1')
        self.assertFalse(vocab._is_word_no_create_entry('1'))
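
The test above exercises the no_create_entry bookkeeping: words added with no_create_entry=True (typically words seen only in dev/test data) still receive indices, but they are flagged so that embeddings such as StaticEmbedding can avoid creating fresh trainable vectors for them; adding the same word again without the flag clears it. A minimal sketch:

from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst(['train_word'])
vocab.add_word_lst(['test_word'], no_create_entry=True)
print(vocab.to_index('test_word'))                   # the word still gets an index
print(vocab._is_word_no_create_entry('test_word'))   # True
print(vocab._is_word_no_create_entry('train_word'))  # False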
Example #26
def equip_chinese_ner_with_skip(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                word_min_freq=1, only_train_min_freq=0):
    from utils_ import Trie, get_skip_path
    from functools import partial
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    # for k,v in datasets.items():
    #     v.apply_field(partial(get_skip_path,w_trie=w_trie),'chars','skips')

    def skips2skips_l2r(chars, w_trie):
        '''
        :param chars: character sequence of the sentence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_l2r, for each position the [start, word] pairs of lexicon words ending there
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[e].append([s, w])

        return result

    def skips2skips_r2l(chars, w_trie):
        '''
        :param chars: character sequence of the sentence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_r2l, for each position the [end, word] pairs of lexicon words starting there
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[s].append([e, w])

        return result

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_l2r, w_trie=w_trie), 'chars', 'skips_l2r')

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_r2l, w_trie=w_trie), 'chars', 'skips_r2l')

    # print(v['skips_l2r'][0])
    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab
    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_l2r_word', new_field_name='skips_l2r_word')

        v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')

        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    return datasets, vocabs, embeddings
Example #27
    def process(self, paths, config, load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """

        vocab_size = config.vocab_size

        def _merge_abstracts(abstracts):
            merged = []
            for abstract in abstracts:
                merged.extend(abstract[:self.max_concat_len] + [SEP])
            if len(abstracts) == 0:
                assert merged == []
            return merged[:-1]

        def _pad_graph_inputs(graph_inputs):
            pad_text_wd = []
            max_len = config.max_graph_enc_steps

            for graph_input in graph_inputs:
                if len(graph_input) < max_len:
                    pad_num = max_len - len(graph_input)
                    graph_input.extend([PAD_TOKEN] * pad_num)
                else:
                    graph_input = graph_input[:max_len]
                pad_text_wd.append(graph_input)

            if len(pad_text_wd) == 0:
                pad_text_wd.append([PAD_TOKEN] * max_len)

            return pad_text_wd

        def _get_nbr_input_len(input_wd):
            enc_len = [
                min(len(text), config.max_graph_enc_steps) for text in input_wd
            ]
            if len(enc_len) == 0:
                enc_len = [0]
            return enc_len

        def _pad_article(text_wd):
            token_num = len(text_wd)
            max_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_len += self.max_concat_len * self.max_concat_num
            if token_num < max_len:
                padding = [PAD_TOKEN] * (max_len - token_num)
                article = text_wd + padding
            else:
                article = text_wd[:max_len]
            return article

        def _split_list(input_list):
            return [text.split() for text in input_list]

        def sent_tokenize(abstract):
            abs_list = abstract.split(".")
            return [(abst + ".") for abst in abs_list[:-1]]

        def _article_token_mask(text_wd):
            max_enc_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_enc_len += self.max_concat_len * self.max_concat_num
            token_num = len(text_wd)
            if token_num < max_enc_len:
                mask = [1] * token_num + [0] * (max_enc_len - token_num)
            else:
                mask = [1] * max_enc_len
            return mask

        def generate_article_input(text, abstracts):
            if config.neighbor_process == "sep":
                text_wd = text.split()[:config.max_enc_steps]
                text_wd.append(SEP)
                abstracts_wd = _merge_abstracts(abstracts)
                return text_wd + abstracts_wd
            else:
                return text.split()

        def generate_graph_inputs(graph_struct):

            graph_inputs_ = [
                graph_strut_dict[pid][config.graph_input_type]
                for pid in graph_struct
            ]
            return _split_list(graph_inputs_[1:])

        def generate_graph_structs(paper_id):
            sub_graph_dict = {}
            sub_graph_set = []

            n_hop = config.n_hop
            max_neighbor_num = config.max_neighbor_num
            k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
            for sub_g in k_nbrs:
                sub_graph_set += sub_g

            for node in sub_graph_set:
                sub_graph_dict[node] = []

            for sub_g in k_nbrs:
                for centre_node in sub_g:
                    nbrs = graph_strut_dict[centre_node]['references']
                    c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                    sub_graph_dict[centre_node].extend(c_nbrs)
                    for c_nbr in c_nbrs:
                        sub_graph_dict[c_nbr].append(centre_node)
            # in python 3.6, the first in subgraph dict is source paper
            return sub_graph_dict

        def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
            sub_graph = [[] for _ in range(n_hop + 1)]
            level = 0
            visited = set()
            q = deque()
            q.append([paper_id, level])
            curr_node_num = 0
            while len(q) != 0:
                paper_first = q.popleft()
                paper_id_first, level_first = paper_first
                if level_first > n_hop:
                    return sub_graph
                sub_graph[level_first].append(paper_id_first)
                curr_node_num += 1
                if curr_node_num > max_neighbor:
                    return sub_graph
                visited.add(paper_id_first)
                for pid in graph_strut_dict[paper_id_first]["references"]:
                    if pid not in visited and pid in graph_strut_dict:
                        q.append([pid, level_first + 1])
                        visited.add(pid)

            return sub_graph

        def generate_dgl_graph(paper_id, graph_struct, nodes_num):
            g = dgl.DGLGraph()
            assert len(graph_struct) == nodes_num

            g.add_nodes(len(graph_struct))
            pid2idx = {}
            for index, key_node in enumerate(graph_struct):
                pid2idx[key_node] = index
            assert pid2idx[paper_id] == 0

            for index, key_node in enumerate(graph_struct):
                neighbor = [pid2idx[node] for node in graph_struct[key_node]]
                # add self loop
                neighbor.append(index)
                key_nodes = [index] * len(neighbor)
                g.add_edges(key_nodes, neighbor)
            return g

        train_ds = None
        dataInfo = self.load(paths)

        # pop nodes in train graph in inductive setting
        if config.mode == "test" and self.setting == "inductive":
            dataInfo.datasets.pop("train")

        graph_strut_dict = {}
        for key, ds in dataInfo.datasets.items():
            for ins in ds:
                graph_strut_dict[ins["paper_id"]] = ins

        logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

        for key, ds in dataInfo.datasets.items():
            # process summary
            ds.apply(lambda x: x['abstract'].split(),
                     new_field_name='summary_wd')
            ds.apply(lambda x: sent_tokenize(x['abstract']),
                     new_field_name='abstract_sentences')
            # generate graph

            ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                     new_field_name="graph_struct")
            ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                     new_field_name='graph_inputs_wd')

            ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                     new_field_name="nodes_num")
            # pad input
            ds.apply(lambda x: generate_article_input(x['introduction'], x[
                "graph_inputs_wd"]),
                     new_field_name='input_wd')
            ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                     new_field_name="enc_len_mask")
            ds.apply(lambda x: sum(x["enc_len_mask"]),
                     new_field_name="enc_len")
            ds.apply(lambda x: _pad_article(x["input_wd"]),
                     new_field_name="pad_input_wd")

            ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                     new_field_name="nbr_inputs_len")

            ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                     new_field_name="pad_graph_inputs_wd")
            if key == "train":
                train_ds = ds

        vocab_dict = {}
        if not load_vocab_file:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Lack train file to build vocabulary!")

            vocabs = Vocabulary(max_size=config.vocab_size - 2,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.from_dataset(train_ds,
                                field_name=["input_wd", "summary_wd"])
            vocabs.add_word(START_DECODING)
            vocabs.add_word(STOP_DECODING)
            vocab_dict["vocab"] = vocabs
            # save vocab
            with open(os.path.join(config.train_path, "vocab"),
                      "w",
                      encoding="utf8") as f:
                for w, idx in vocabs:
                    f.write(str(w) + "\t" + str(idx) + "\n")
            logger.info(
                "build new vocab ends.. please reRun the code with load_vocab = True"
            )
            exit(0)
        else:

            logger.info("[INFO] Load existing vocab from %s!" %
                        config.vocab_path)
            word_list = []
            cnt = 3  # pad and unk
            if config.neighbor_process == "sep":
                cnt += 1

            with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.add_word_lst(word_list)
            vocabs.add(START_DECODING)
            vocabs.add(STOP_DECODING)
            if config.neighbor_process == "sep":
                vocabs.add(SEP)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size
        dataInfo.set_vocab(vocabs, "vocab")

        for key, dataset in dataInfo.datasets.items():
            # do not process the training set in test mode
            if config.mode == "test" and key == "train":
                continue

            data_dict = {
                "enc_input": [],
                "nbr_inputs": [],
                "graph": [],
                "dec_input": [],
                "target": [],
                "dec_len": [],
                "article_oovs": [],
                "enc_input_extend_vocab": [],
            }
            logger.info(
                f"start construct the input of the model for {key} set, please wait..."
            )
            for instance in dataset:
                graph_inputs = instance["pad_graph_inputs_wd"]
                abstract_sentences = instance["summary_wd"]
                enc_input = instance["pad_input_wd"]
                enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                    getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
                graph = generate_dgl_graph(instance["paper_id"],
                                           instance["graph_struct"],
                                           instance["nodes_num"])
                data_dict["graph"].append(graph)
                data_dict["enc_input"].append(enc_input)
                data_dict["nbr_inputs"].append(nbr_inputs)
                data_dict["dec_input"].append(dec_input)
                data_dict["target"].append(target)
                data_dict["dec_len"].append(dec_len)
                data_dict["article_oovs"].append(article_oovs)
                data_dict["enc_input_extend_vocab"].append(
                    enc_input_extend_vocab)

            dataset.add_field("enc_input", data_dict["enc_input"])
            dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
            dataset.add_field("dec_input", data_dict["dec_input"])
            dataset.add_field("target", data_dict["target"])
            dataset.add_field("dec_len", data_dict["dec_len"])
            dataset.add_field("article_oovs", data_dict["article_oovs"])
            dataset.add_field("enc_input_extend_vocab",
                              data_dict["enc_input_extend_vocab"])

            dataset.add_field("graph", data_dict["graph"])
            dataset.set_ignore_type(
                'graph')  # without this line, there may be some errors
            dataset.set_input("graph")

            dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                              "enc_input", "enc_len_mask", "dec_input",
                              "dec_len", "article_oovs", "nodes_num",
                              "enc_input_extend_vocab")
            dataset.set_target("target", "article_oovs", "abstract_sentences")

            dataset.delete_field('graph_inputs_wd')
            dataset.delete_field('pad_graph_inputs_wd')
            dataset.delete_field('input_wd')
            dataset.delete_field('pad_input_wd')
        logger.info("------load dataset over---------")
        return dataInfo, vocabs
Example #28
    def change_tag(ins):
        words = ['[unused14]'] + ins['words'][1:]
        return words

    for target in target_list:
        all_data[target]['POS-ctb9'].apply(change_tag, new_field_name='words')

    print(all_data['train']['POS-ctb9'][0]['words'][:1])

    for task in all_data['train'].keys():
        if task.startswith('Parsing'):
            continue
        dataset = all_data['train'][task]
        for word_lst in dataset['words']:
            chars_vocab.add_word_lst(word_lst)

    pos_idx = chars_vocab.to_index('[unused14]')
    print(pos_idx)

    label_vocab['POS'] = Vocabulary().from_dataset(
        all_data['train']['POS-ctb9'], field_name='target')
    label_vocab['CWS'] = Vocabulary().from_dataset(
        all_data['train']['CWS-pku'], field_name='target')
    label_vocab['NER'] = Vocabulary().from_dataset(
        all_data['train']['NER-msra'], field_name='target')
    label_vocab['Parsing'] = torch.load('vocab/parsing_vocab')
    label_vocab['pos'] = Vocabulary().from_dataset(
        all_data['train']['Parsing-ctb9'], field_name='pos')

    for target in target_list:
Example #29
        dev_file = None
        test_file = None
        for f in v:
            # example of f: 'health.dev'
            data_type = f.split('.')[1]
            if data_type == 'train':
                train_file = os.path.join(data_path, f)
            elif data_type == 'dev':
                dev_file = os.path.join(data_path, f)
            elif data_type == 'test':
                test_file = os.path.join(data_path, f)
            else:
                raise ValueError('unknown dataset type')
        train_set = read_instances_from_file(train_file)
        train_set.add_field('task_id', [task_id] * len(train_set))
        train_set.apply(lambda x: vocab.add_word_lst(x['words']))

        dev_set = read_instances_from_file(dev_file)
        dev_set.add_field('task_id', [task_id] * len(dev_set))
        dev_set.apply(lambda x: vocab.add_word_lst(x['words']))

        test_set = read_instances_from_file(test_file)
        test_set.add_field('task_id', [task_id] * len(test_set))
        # test_set.apply(lambda x: vocab.add_word_lst(x['words']))

        task = Task(task_id, k, train_set, dev_set, test_set)
        task_lst.append(task)

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}.'.format(len(vocab)))
Example #30
from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.qa import CMRC2018Loader
from fastNLP.io import CNXNLILoader
from fastNLP.io import WeiboNERLoader
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

if __name__ == "__main__":
    # download the sentiment-analysis (classification) data
    data_bundle = WeiboSenti100kPipe().process_from_file()
    data_bundle.rename_field('chars', 'words')
    # download BERT
    embed = BertEmbedding(data_bundle.get_vocab('words'),
                          model_dir_or_name='cn-wwm',
                          include_cls_sep=True)
    # question-answering data
    data_bundle = CMRC2018Loader().load()
    # text matching
    data_bundle = CNXNLILoader().load()
    # NER
    data_bundle = WeiboNERLoader().load()
    # embedding
    vocab = Vocabulary()
    vocab.add_word_lst("你 好 .".split())
    embed = StaticEmbedding(vocab, model_dir_or_name='cn-sgns-literature-word')
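
A short follow-up sketch showing how the vocabulary and embedding built at the end are typically combined; the exact embedding size depends on the downloaded 'cn-sgns-literature-word' vectors:

import torch
words = torch.LongTensor([[vocab.to_index(w) for w in "你 好 .".split()]])
print(embed(words).shape)  # (1, 3, embed.embed_size)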