Example #1
def _get_vocab(data_list):
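    # Collect every token from the lists in data_list into a single fastNLP Vocabulary.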
    vocab = Vocabulary(unknown=unk_str, padding=pad_str)
    for l in data_list:
        vocab.add_word_lst(l)
    vocab.build_vocab()
    print('vocab', len(vocab))
    return vocab
Example #2
    def test_same_vector5(self):
        # Check that word vectors are identical when min_freq is applied
        word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
        no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary().add_word_lst(word_lst)
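        # no_create_entry marks words (typically dev/test-only) that should not get their
        # own trainable vector unless they are found in the pretrained embedding file.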
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=False,
                                min_freq=2)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        min_freq_embed = StaticEmbedding(min_freq_vocab,
                                         model_dir_or_name='en-glove-6B-100d',
                                         lower=False)
        min_freq_words = torch.LongTensor(
            [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
        min_freq_words = min_freq_embed(min_freq_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], min_freq_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(
                    min_freq_embed.embed_size)
Example #3
    def test_save_and_load(self):
        fp = 'vocab_save_test.txt'
        try:
            # check that word2idx is unchanged and no_create_entry is preserved
            words = list('abcdefaddfdkjfe')
            no_create_entry = list('12342331')
            unk = '[UNK]'
            vocab = Vocabulary(unknown=unk, max_size=500)

            vocab.add_word_lst(words)
            vocab.add_word_lst(no_create_entry, no_create_entry=True)
            vocab.save(fp)

            new_vocab = Vocabulary.load(fp)

            for word, index in vocab:
                self.assertEqual(new_vocab.to_index(word), index)
            for word in no_create_entry:
                self.assertTrue(new_vocab._is_word_no_create_entry(word))
            for word in words:
                self.assertFalse(new_vocab._is_word_no_create_entry(word))
            for idx in range(len(vocab)):
                self.assertEqual(vocab.to_word(idx), new_vocab.to_word(idx))
            self.assertEqual(vocab.unknown, new_vocab.unknown)
        finally:
            import os
            if os.path.exists(fp):
                os.remove(fp)
Example #4
    def test_same_vector4(self):
        # Verify lower-casing behaviour when min_freq is set
        word_lst = ["The", "the", "the", "The", "a", "A"]
        no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor(
            [[lowered_vocab.to_index(word.lower()) for word in all_words]])
        lowered_words = lowered_embed(lowered_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], lowered_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #5
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab,
                                model_dir_or_name=None,
                                embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
        src_seq_len = torch.LongTensor([3, 2])
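        # Turn the source lengths into a boolean padding mask over the encoder outputs.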
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            for attention in [True, False]:
                with self.subTest(bind_decoder_input_output_embed=flag,
                                  attention=attention):
                    decoder = LSTMSeq2SeqDecoder(
                        embed=embed,
                        num_layers=2,
                        hidden_size=10,
                        dropout=0.3,
                        bind_decoder_input_output_embed=flag,
                        attention=attention)
                    state = decoder.init_state(encoder_output, encoder_mask)
                    output = decoder(tgt_words_idx, state)
                    self.assertEqual(tuple(output.size()), (2, 4, len(vocab)))
Example #6
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        src_seq_len = torch.LongTensor([3, 2])
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            with self.subTest(bind_decoder_input_output_embed=flag):
                decoder = TransformerSeq2SeqDecoder(
                    embed=embed,
                    pos_embed=None,
                    d_model=10,
                    num_layers=2,
                    n_head=5,
                    dim_ff=20,
                    dropout=0.1,
                    bind_decoder_input_output_embed=flag)
                state = decoder.init_state(encoder_output, encoder_mask)
                output = decoder(tokens=torch.randint(0,
                                                      len(vocab),
                                                      size=(2, 4)),
                                 state=state)
                self.assertEqual(output.size(), (2, 4, len(vocab)))
Example #7
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1,
                           unknown='unk',
                           padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)

        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review

        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)

        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)

    return glove_data, matrix, len(glove_data[0]['reviewText'])
Example #8
 def test_search(self):
     """语义搜索.TypeError: expected dimension <= 2 array or matrix
     """
     print('{} test_search {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     # Vectorize the texts
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in texts]
     words = torch.LongTensor(texts_to_id)  # convert the texts to indices
     features_vec = embed(words)
     print(features_vec.shape)
     # build the search index!
     cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
     search_texts = ['朱日和站', '温都尔站', '国电站']
     for text in search_texts:
         texts_to_id = [[vocab.to_index(word) for word in list(text)]]
         words = torch.LongTensor(texts_to_id)  # convert the text to indices
         features_vec = embed(words)
         search_features_vec = features_vec.detach().numpy()
         search_result = cp.search(search_features_vec,
                                   k=2,
                                   k_clusters=2,
                                   return_distance=True)
         print('text:{}'.format(text))
         print('search_result:{}'.format(search_result))
     """
Example #9
    def test_same_vector3(self):
        # Verify the lower option
        word_lst = ["The", "the"]
        no_create_word_lst = ['of', 'Of', 'With', 'with']
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor(
            [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor([[
            lowered_vocab.to_index(word)
            for word in lowered_word_lst + lowered_no_create_word_lst
        ]])
        lowered_words = lowered_embed(lowered_words)

        all_words = word_lst + no_create_word_lst

        for idx, (word_i, word_j) in enumerate(zip(words[0],
                                                   lowered_words[0])):
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #10
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))

    return vocabulary
Example #11
 def test_Index2WordProcessor(self):
     vocab = Vocabulary()
     vocab.add_word_lst(["a", "b", "c", "d", "e"])
     proc = Index2WordProcessor(vocab, "tag_id", "tag")
     data_set = DataSet(
         [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
     data_set = proc(data_set)
     self.assertTrue("tag" in data_set)
Example #12
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet = "0123456789,"
    char_list = [c for c in alphabet]
    vocabulary = Vocabulary(padding='<pad>', unknown='<unk>')
    vocabulary.add_word_lst(char_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx, 'unk', vocabulary.unknown_idx)
    return vocabulary
Example #13
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    return embed, encoder_output, encoder_mask
Example #14
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    vocabulary = Vocabulary(min_freq=min_freq, padding='<pad>', unknown='<unk>')
    for filename in data:
        for value in data[filename]:
            for word_list in data[filename][value]['data']:
                vocabulary.add_word_lst(word_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx, 'unk', vocabulary.unknown_idx)
    return vocabulary
Example #15
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = torch.LongTensor([[3, 1, 2], [1, 2, 0]])
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    tgt_seq_len = torch.LongTensor([4, 2])

    return embed, src_words_idx, tgt_words_idx, src_seq_len, tgt_seq_len
Example #16
def load_dataset(
    data_dir='/remote-home/ygxu/workspace/Product_all',
    data_path='mr.task.train',
    # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
    bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):

    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')

    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')

    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq',
             is_target=True)

    def transfer_bert_to_fastnlp(ins):
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.rstrip('\n'))

    vocab_bert = Vocabulary(unknown=None, padding=None)
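    # With no reserved unknown/padding entries, indices follow the vocab.txt line order,
    # keeping them aligned with BertTokenizer's token ids.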
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x:
             [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words',
             is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks',
             is_input=True)

    return ds
Example #17
def read_vocab(file_name):
    # Read the vocab file
    with open(file_name) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate a Vocabulary
    vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    # Add the word list to the Vocabulary
    vocab.add_word_lst(vocabs)
    # Build the vocabulary
    vocab.build_vocab()
    return vocab
Example #18
 def test_rebuild(self):
     # Test that words added after build do not change the order of existing words
     vocab = Vocabulary()
     text = [str(idx) for idx in range(10)]
     vocab.update(text)
     for i in text:
         self.assertEqual(int(i) + 2, vocab.to_index(i))
     indexes = []
     for word, index in vocab:
         indexes.append((word, index))
     vocab.add_word_lst([str(idx) for idx in range(10, 13)])
     for idx, pair in enumerate(indexes):
         self.assertEqual(pair[1], vocab.to_index(pair[0]))
     for i in range(13):
         self.assertEqual(int(i) + 2, vocab.to_index(str(i)))
Example #19
 def test_fit(self):
     """文本编码.
     """
     print('{} test_fit {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in ['朱日和', '东台变']]
     print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
     words = torch.LongTensor(texts_to_id)  # convert the texts to indices
     print(embed(words).size())  # torch.Size([2, 3, 100])
Example #20
def get_vocab(trainset, testset):
    # Build the vocabularies and word2idx mappings
    # tok
    tok_vocab = Vocabulary()
    tok_vocab.from_dataset(trainset,
                           field_name="tok",
                           no_create_entry_dataset=testset)
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="tok",
                            new_field_name="chars")
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="asp",
                            new_field_name="aspect")
    # deprel
    dep_vocab = Vocabulary()
    dep_vocab.from_dataset(trainset, field_name="deprel")
    dep_vocab.index_dataset(trainset,
                            testset,
                            field_name="deprel",
                            new_field_name="depidx")
    # pol(target)
    pol_vocab = Vocabulary(padding=None, unknown=None)
    pol_vocab.from_dataset(trainset, field_name="pol")
    pol_vocab.index_dataset(trainset,
                            testset,
                            field_name="pol",
                            new_field_name="target")
    # pos
    pos_vocab = Vocabulary()
    pos_vocab.from_dataset(trainset, field_name="pos")
    pos_vocab.index_dataset(trainset,
                            testset,
                            field_name="pos",
                            new_field_name="posidx")
    # post
    max_len = max(max(trainset["seq_len"]), max(testset["seq_len"]))
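    # Relative position offsets span -max_len .. max_len-1, so add every possible offset.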
    post_vocab = Vocabulary()
    post_vocab.add_word_lst(list(range(-max_len, max_len)))
    post_vocab.index_dataset(trainset,
                             testset,
                             field_name="post",
                             new_field_name="postidx")
    return tok_vocab, pos_vocab, post_vocab, trainset, testset
Example #21
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append((Instance(content=ans,
                                 target=int(train_data.target[i]))))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words',
                  is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()
    # index the sentences via Vocabulary.to_index(word)
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)
Example #22
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = [[3, 1, 2], [1, 2]]
    # tgt_words_idx = [[1, 2, 3, 4], [2, 3]]
    src_seq_len = [3, 2]
    # tgt_seq_len = [4, 2]

    ds = DataSet({
        'src_tokens': src_words_idx,
        'src_seq_len': src_seq_len,
        'tgt_tokens': src_words_idx,
        'tgt_seq_len': src_seq_len
    })

    ds.set_input('src_tokens', 'tgt_tokens', 'src_seq_len')
    ds.set_target('tgt_seq_len', 'tgt_tokens')

    return embed, ds
Example #23
def get_data(filepath):
    data = np.load(filepath, allow_pickle=True)
    data, _, ix2word = (data['data'], data['word2ix'].item(),
                        data['ix2word'].item())
    wordlist = []
    for d in data:
        for ix in d:
            wordlist.append(ix2word[ix])
    vocab = Vocabulary(min_freq=10, padding="</s>")
    vocab.add_word_lst(wordlist)
    vocab.build_vocab()
    # vocab = Vocabulary(min_freq=10, padding="</s>").add_word_lst(wordlist).build_vocab()
    vocab_size = len(vocab.word2idx)
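    # Remap any original index that falls outside the rebuilt vocabulary to <unk>.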
    for d in data:
        for i in range(len(d)):
            # d[i] = vocab[vocab.to_word(d[i])]
            if d[i] >= vocab_size:
                d[i] = vocab["<unk>"]

    print(vocab_size)

    return data, vocab
Example #24
    def test1(self):
        # Test that the confusion matrix prints correctly
        from fastNLP import Vocabulary
        from fastNLP.core.utils import ConfusionMatrix
        import numpy as np
        vocab = Vocabulary(unknown=None, padding=None)
        vocab.add_word_lst(list('abcdef'))
        confusion_matrix = ConfusionMatrix(vocab)
        for _ in range(3):
            length = np.random.randint(1, 5)
            pred = np.random.randint(0, 3, size=(length,))
            target = np.random.randint(0, 3, size=(length,))
            confusion_matrix.add_pred_target(pred, target)
        print(confusion_matrix)

        # Test print_ratio
        confusion_matrix = ConfusionMatrix(vocab, print_ratio=True)
        for _ in range(3):
            length = np.random.randint(1, 5)
            pred = np.random.randint(0, 3, size=(length,))
            target = np.random.randint(0, 3, size=(length,))
            confusion_matrix.add_pred_target(pred, target)
        print(confusion_matrix)
Example #25
    def test_no_entry(self):
        # Build the vocabulary first, then toggle no_create_entry and check it is tracked correctly
        text = [
            "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
            "well", "in", "works", "well", "in", "most", "cases", "scales",
            "well"
        ]
        vocab = Vocabulary()
        vocab.add_word_lst(text)

        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))
        vocab.add_word('FastNLP', no_create_entry=True)
        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))

        vocab.add_word('fastnlp', no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('fastnlp'))
        vocab.add_word('fastnlp', no_create_entry=False)
        self.assertFalse(vocab._is_word_no_create_entry('fastnlp'))

        vocab.add_word_lst(['1'] * 10, no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('1'))
        vocab.add_word('1')
        self.assertFalse(vocab._is_word_no_create_entry('1'))
Example #26
def equip_chinese_ner_with_skip(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                word_min_freq=1, only_train_min_freq=0):
    from utils_ import Trie, get_skip_path
    from functools import partial
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    # for k,v in datasets.items():
    #     v.apply_field(partial(get_skip_path,w_trie=w_trie),'chars','skips')

    def skips2skips_l2r(chars, w_trie):
        '''For each character position, collect the lexicon words that end there.

        :param chars: list of characters
        :param w_trie: Trie built over the external word list
        :return: skips_l2r, list[list[[start_index, word]]]
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[e].append([s, w])

        return result

    def skips2skips_r2l(chars, w_trie):
        '''For each character position, collect the lexicon words that start there.

        :param chars: list of characters
        :param w_trie: Trie built over the external word list
        :return: skips_r2l, list[list[[end_index, word]]]
        '''
        # print(lexicons)
        # print('******')

        lexicons = get_skip_path(chars, w_trie=w_trie)

        # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0

        result = [[] for _ in range(len(chars))]

        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]

            result[s].append([e, w])

        return result

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_l2r, w_trie=w_trie), 'chars', 'skips_l2r')

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_r2l, w_trie=w_trie), 'chars', 'skips_r2l')

    # print(v['skips_l2r'][0])
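    # Build a word-level vocabulary over the external lexicon so skip words can be indexed below.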
    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab
    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_l2r_word', new_field_name='skips_l2r_word')

        v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')

        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    return datasets, vocabs, embeddings
Example #27
    def process(self, paths, config, load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """

        vocab_size = config.vocab_size

        def _merge_abstracts(abstracts):
            merged = []
            for abstract in abstracts:
                merged.extend(abstract[:self.max_concat_len] + [SEP])
            if len(abstracts) == 0:
                assert merged == []
            return merged[:-1]

        def _pad_graph_inputs(graph_inputs):
            pad_text_wd = []
            max_len = config.max_graph_enc_steps

            for graph_input in graph_inputs:
                if len(graph_input) < max_len:
                    pad_num = max_len - len(graph_input)
                    graph_input.extend([PAD_TOKEN] * pad_num)
                else:
                    graph_input = graph_input[:max_len]
                pad_text_wd.append(graph_input)

            if len(pad_text_wd) == 0:
                pad_text_wd.append([PAD_TOKEN] * max_len)

            return pad_text_wd

        def _get_nbr_input_len(input_wd):
            enc_len = [
                min(len(text), config.max_graph_enc_steps) for text in input_wd
            ]
            if len(enc_len) == 0:
                enc_len = [0]
            return enc_len

        def _pad_article(text_wd):
            token_num = len(text_wd)
            max_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_len += self.max_concat_len * self.max_concat_num
            if token_num < max_len:
                padding = [PAD_TOKEN] * (max_len - token_num)
                article = text_wd + padding
            else:
                article = text_wd[:max_len]
            return article

        def _split_list(input_list):
            return [text.split() for text in input_list]

        def sent_tokenize(abstract):
            abs_list = abstract.split(".")
            return [(abst + ".") for abst in abs_list[:-1]]

        def _article_token_mask(text_wd):
            max_enc_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_enc_len += self.max_concat_len * self.max_concat_num
            token_num = len(text_wd)
            if token_num < max_enc_len:
                mask = [1] * token_num + [0] * (max_enc_len - token_num)
            else:
                mask = [1] * max_enc_len
            return mask

        def generate_article_input(text, abstracts):
            if config.neighbor_process == "sep":
                text_wd = text.split()[:config.max_enc_steps]
                text_wd.append(SEP)
                abstracts_wd = _merge_abstracts(abstracts)
                return text_wd + abstracts_wd
            else:
                return text.split()

        def generate_graph_inputs(graph_struct):

            graph_inputs_ = [
                graph_strut_dict[pid][config.graph_input_type]
                for pid in graph_struct
            ]
            return _split_list(graph_inputs_[1:])

        def generate_graph_structs(paper_id):
            sub_graph_dict = {}
            sub_graph_set = []

            n_hop = config.n_hop
            max_neighbor_num = config.max_neighbor_num
            k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
            for sub_g in k_nbrs:
                sub_graph_set += sub_g

            for node in sub_graph_set:
                sub_graph_dict[node] = []

            for sub_g in k_nbrs:
                for centre_node in sub_g:
                    nbrs = graph_strut_dict[centre_node]['references']
                    c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                    sub_graph_dict[centre_node].extend(c_nbrs)
                    for c_nbr in c_nbrs:
                        sub_graph_dict[c_nbr].append(centre_node)
            # dicts preserve insertion order (Python 3.6+), so the first key in sub_graph_dict is the source paper
            return sub_graph_dict

        def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
            sub_graph = [[] for _ in range(n_hop + 1)]
            level = 0
            visited = set()
            q = deque()
            q.append([paper_id, level])
            curr_node_num = 0
            while len(q) != 0:
                paper_first = q.popleft()
                paper_id_first, level_first = paper_first
                if level_first > n_hop:
                    return sub_graph
                sub_graph[level_first].append(paper_id_first)
                curr_node_num += 1
                if curr_node_num > max_neighbor:
                    return sub_graph
                visited.add(paper_id_first)
                for pid in graph_strut_dict[paper_id_first]["references"]:
                    if pid not in visited and pid in graph_strut_dict:
                        q.append([pid, level_first + 1])
                        visited.add(pid)

            return sub_graph

        def generate_dgl_graph(paper_id, graph_struct, nodes_num):
            g = dgl.DGLGraph()
            assert len(graph_struct) == nodes_num

            g.add_nodes(len(graph_struct))
            pid2idx = {}
            for index, key_node in enumerate(graph_struct):
                pid2idx[key_node] = index
            assert pid2idx[paper_id] == 0

            for index, key_node in enumerate(graph_struct):
                neighbor = [pid2idx[node] for node in graph_struct[key_node]]
                # add self loop
                neighbor.append(index)
                key_nodes = [index] * len(neighbor)
                g.add_edges(key_nodes, neighbor)
            return g

        train_ds = None
        dataInfo = self.load(paths)

        # pop nodes in train graph in inductive setting
        if config.mode == "test" and self.setting == "inductive":
            dataInfo.datasets.pop("train")

        graph_strut_dict = {}
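        # Index every instance by its paper_id so neighbours can be looked up when building graphs.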
        for key, ds in dataInfo.datasets.items():
            for ins in ds:
                graph_strut_dict[ins["paper_id"]] = ins

        logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

        for key, ds in dataInfo.datasets.items():
            # process summary
            ds.apply(lambda x: x['abstract'].split(),
                     new_field_name='summary_wd')
            ds.apply(lambda x: sent_tokenize(x['abstract']),
                     new_field_name='abstract_sentences')
            # generate graph

            ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                     new_field_name="graph_struct")
            ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                     new_field_name='graph_inputs_wd')

            ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                     new_field_name="nodes_num")
            # pad input
            ds.apply(lambda x: generate_article_input(x['introduction'], x[
                "graph_inputs_wd"]),
                     new_field_name='input_wd')
            ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                     new_field_name="enc_len_mask")
            ds.apply(lambda x: sum(x["enc_len_mask"]),
                     new_field_name="enc_len")
            ds.apply(lambda x: _pad_article(x["input_wd"]),
                     new_field_name="pad_input_wd")

            ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                     new_field_name="nbr_inputs_len")

            ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                     new_field_name="pad_graph_inputs_wd")
            if key == "train":
                train_ds = ds

        vocab_dict = {}
        if not load_vocab_file:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Lack train file to build vocabulary!")

            vocabs = Vocabulary(max_size=config.vocab_size - 2,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.from_dataset(train_ds,
                                field_name=["input_wd", "summary_wd"])
            vocabs.add_word(START_DECODING)
            vocabs.add_word(STOP_DECODING)
            vocab_dict["vocab"] = vocabs
            # save vocab
            with open(os.path.join(config.train_path, "vocab"),
                      "w",
                      encoding="utf8") as f:
                for w, idx in vocabs:
                    f.write(str(w) + "\t" + str(idx) + "\n")
            logger.info(
                "build new vocab ends.. please reRun the code with load_vocab = True"
            )
            exit(0)
        else:

            logger.info("[INFO] Load existing vocab from %s!" %
                        config.vocab_path)
            word_list = []
            cnt = 3  # pad and unk
            if config.neighbor_process == "sep":
                cnt += 1

            with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.add_word_lst(word_list)
            vocabs.add(START_DECODING)
            vocabs.add(STOP_DECODING)
            if config.neighbor_process == "sep":
                vocabs.add(SEP)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size
        dataInfo.set_vocab(vocabs, "vocab")

        for key, dataset in dataInfo.datasets.items():
            # do not process the training set in test mode
            if config.mode == "test" and key == "train":
                continue

            data_dict = {
                "enc_input": [],
                "nbr_inputs": [],
                "graph": [],
                "dec_input": [],
                "target": [],
                "dec_len": [],
                "article_oovs": [],
                "enc_input_extend_vocab": [],
            }
            logger.info(
                f"start construct the input of the model for {key} set, please wait..."
            )
            for instance in dataset:
                graph_inputs = instance["pad_graph_inputs_wd"]
                abstract_sentences = instance["summary_wd"]
                enc_input = instance["pad_input_wd"]
                enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                    getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
                graph = generate_dgl_graph(instance["paper_id"],
                                           instance["graph_struct"],
                                           instance["nodes_num"])
                data_dict["graph"].append(graph)
                data_dict["enc_input"].append(enc_input)
                data_dict["nbr_inputs"].append(nbr_inputs)
                data_dict["dec_input"].append(dec_input)
                data_dict["target"].append(target)
                data_dict["dec_len"].append(dec_len)
                data_dict["article_oovs"].append(article_oovs)
                data_dict["enc_input_extend_vocab"].append(
                    enc_input_extend_vocab)

            dataset.add_field("enc_input", data_dict["enc_input"])
            dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
            dataset.add_field("dec_input", data_dict["dec_input"])
            dataset.add_field("target", data_dict["target"])
            dataset.add_field("dec_len", data_dict["dec_len"])
            dataset.add_field("article_oovs", data_dict["article_oovs"])
            dataset.add_field("enc_input_extend_vocab",
                              data_dict["enc_input_extend_vocab"])

            dataset.add_field("graph", data_dict["graph"])
            dataset.set_ignore_type(
                'graph')  # without this line, there may be some errors
            dataset.set_input("graph")

            dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                              "enc_input", "enc_len_mask", "dec_input",
                              "dec_len", "article_oovs", "nodes_num",
                              "enc_input_extend_vocab")
            dataset.set_target("target", "article_oovs", "abstract_sentences")

            dataset.delete_field('graph_inputs_wd')
            dataset.delete_field('pad_graph_inputs_wd')
            dataset.delete_field('input_wd')
            dataset.delete_field('pad_input_wd')
        logger.info("------load dataset over---------")
        return dataInfo, vocabs
Example #28
    def change_tag(ins):
        words = ['[unused14]'] + ins['words'][1:]
        return words

    for target in target_list:
        all_data[target]['POS-ctb9'].apply(change_tag, new_field_name='words')

    print(all_data['train']['POS-ctb9'][0]['words'][:1])

    for task in all_data['train'].keys():
        if task.startswith('Parsing'):
            continue
        dataset = all_data['train'][task]
        for word_lst in dataset['words']:
            chars_vocab.add_word_lst(word_lst)

    pos_idx = chars_vocab.to_index('[unused14]')
    print(pos_idx)

    label_vocab['POS'] = Vocabulary().from_dataset(
        all_data['train']['POS-ctb9'], field_name='target')
    label_vocab['CWS'] = Vocabulary().from_dataset(
        all_data['train']['CWS-pku'], field_name='target')
    label_vocab['NER'] = Vocabulary().from_dataset(
        all_data['train']['NER-msra'], field_name='target')
    label_vocab['Parsing'] = torch.load('vocab/parsing_vocab')
    label_vocab['pos'] = Vocabulary().from_dataset(
        all_data['train']['Parsing-ctb9'], field_name='pos')

    for target in target_list:
Example #29
        dev_file = None
        test_file = None
        for f in v:
            # example of f: 'health.dev'
            data_type = f.split('.')[1]
            if data_type == 'train':
                train_file = os.path.join(data_path, f)
            elif data_type == 'dev':
                dev_file = os.path.join(data_path, f)
            elif data_type == 'test':
                test_file = os.path.join(data_path, f)
            else:
                raise ValueError('unknown dataset type')
        train_set = read_instances_from_file(train_file)
        train_set.add_field('task_id', [task_id] * len(train_set))
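        # Words from the train and dev sets go into the vocabulary; the test set's words are left out.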
        train_set.apply(lambda x: vocab.add_word_lst(x['words']))

        dev_set = read_instances_from_file(dev_file)
        dev_set.add_field('task_id', [task_id] * len(dev_set))
        dev_set.apply(lambda x: vocab.add_word_lst(x['words']))

        test_set = read_instances_from_file(test_file)
        test_set.add_field('task_id', [task_id] * len(test_set))
        # test_set.apply(lambda x: vocab.add_word_lst(x['words']))

        task = Task(task_id, k, train_set, dev_set, test_set)
        task_lst.append(task)

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}.'.format(len(vocab)))
Example #30
from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.qa import CMRC2018Loader
from fastNLP.io import CNXNLILoader
from fastNLP.io import WeiboNERLoader
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

if __name__ == "__main__":
    # Download the sentiment classification data
    data_bundle = WeiboSenti100kPipe().process_from_file()
    data_bundle.rename_field('chars', 'words')
    # Download BERT
    embed = BertEmbedding(data_bundle.get_vocab('words'),
                          model_dir_or_name='cn-wwm',
                          include_cls_sep=True)
    # Question answering data
    data_bundle = CMRC2018Loader().load()
    # Text matching
    data_bundle = CNXNLILoader().load()
    # NER
    data_bundle = WeiboNERLoader().load()
    # embedding
    vocab = Vocabulary()
    vocab.add_word_lst("你 好 .".split())
    embed = StaticEmbedding(vocab, model_dir_or_name='cn-sgns-literature-word')