def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    # print(title)
                    # print(content)
                    # content = line.replace(' ', '').replace(',','').replace('。','')
                    content = content.replace(' ', '')  # keep punctuation
                    # optionally keep only five-character poems
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    if ':' in content or '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                        continue
                    # truncate to sentence_len
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError as e:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
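        # target: the raw sentence shifted left by one character, with the last character repeated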
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)

    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    
    # for iter in dataset:
    #     print(iter['raw_sentence'])
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')

    # build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # index sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')

    print("vocabulary_size:", len(vocab))

    return train_data, dev_data, vocab
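A minimal driver for the loader above, kept as a hedged sketch: the corpus path, truncation length, and vocabulary cap are illustrative assumptions, and the excerpt's module-level end_token plus the fastNLP imports (DataSet, Instance, Vocabulary) are presumed to be in scope.

# hypothetical call; 'poems.txt' is assumed to hold one "title:content" line per poem, and the numbers are illustrative
train_data, dev_data, vocab = process_poems_large('poems.txt', sentence_len=48, vocab_size=6000)
print(len(train_data), len(dev_data), len(vocab))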
Example #2
    def test_same_vector4(self):
        # verify lower-casing when min_freq is set
        word_lst = ["The", "the", "the", "The", "a", "A"]
        no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor(
            [[lowered_vocab.to_index(word.lower()) for word in all_words]])
        lowered_words = lowered_embed(lowered_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], lowered_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #3
    def test_same_vector3(self):
        # verify lower-casing
        word_lst = ["The", "the"]
        no_create_word_lst = ['of', 'Of', 'With', 'with']
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor(
            [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor([[
            lowered_vocab.to_index(word)
            for word in lowered_word_lst + lowered_no_create_word_lst
        ]])
        lowered_words = lowered_embed(lowered_words)

        all_words = word_lst + no_create_word_lst

        for idx, (word_i, word_j) in enumerate(zip(words[0],
                                                   lowered_words[0])):
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #4
    def test_same_vector5(self):
        # check that word vectors stay consistent after applying min_freq
        word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
        no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=False,
                                min_freq=2)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        min_freq_embed = StaticEmbedding(min_freq_vocab,
                                         model_dir_or_name='en-glove-6B-100d',
                                         lower=False)
        min_freq_words = torch.LongTensor(
            [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
        min_freq_words = min_freq_embed(min_freq_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], min_freq_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(
                    min_freq_embed.embed_size)
Example #5
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # set the input (feature) and target (label) fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
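A hedged driver for the helper above; it assumes get_text_classification_datasets and split_sent are defined elsewhere in the same script, as the function body implies.

train_data, test_data, vocab = get_fastnlp_dataset()
print(len(train_data), len(test_data), len(vocab))
print(train_data[0]['word_seq'][:10])  # first ten word indices of the first training document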
Example #6
def build_dataset(train_size, test_rate, categories):
    vocab = load('../data/vocab')
    train_set = load('../data/train_set')
    test_set = load('../data/test_set')
    if vocab is not None and train_set is not None and test_set is not None:
        return vocab, train_set, test_set

    train, test = get_20newsgroups_data(categories)
    train_set = create_dataset(train, train_size)
    test_set = create_dataset(test, int(train_size * test_rate))
    # vocabulary
    vocab = Vocabulary(min_freq=10)
    test_set.apply(lambda x: [vocab.add(word) for word in x['word_seq']])
    vocab.build_vocab()
    # word_seq to int
    train_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                    new_field_name='word_seq')
    test_set.apply(lambda x: [vocab.to_index(word) for word in x['word_seq']],
                   new_field_name='word_seq')
    # tag
    train_set.set_input('word_seq')
    train_set.set_target('target')
    test_set.set_input('word_seq')
    test_set.set_target('target')

    save('../data/vocab', vocab)
    save('../data/train_set', train_set)
    save('../data/test_set', test_set)
    return vocab, train_set, test_set
Example #7
def Get_Data_Vocab(path):
    s = ""
    with open(path, "r", encoding='UTF-8') as f:
        for line in f:
            s += line.rstrip('\r\n') + "#"

    sentences = s.split("#")

    dataset = construct_dataset(sentences)
    dataset.apply(cut_pad, new_field_name='words')  # pad/truncate so every poem has the same length
    # split into dev and train sets
    dev_data, train_data = dataset.split(0.8)
    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(padding="<pad>", min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    print(vocab.idx2word)

    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    train_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    train_data.apply(lambda x: x['words'][1:], new_field_name="target")
    dev_data.apply(lambda x: x['words'][:-1], new_field_name="input")
    dev_data.apply(lambda x: x['words'][1:], new_field_name="target")
    train_data.set_input("input")
    train_data.set_target("target")
    dev_data.set_input("input")
    dev_data.set_target("target")

    return vocab, train_data, dev_data
Example #8
 def test_search(self):
     """语义搜索.TypeError: expected dimension <= 2 array or matrix
     """
     print('{} test_search {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     # vectorize the texts
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in texts]
     words = torch.LongTensor(texts_to_id)  # convert text to indices
     features_vec = embed(words)
     print(features_vec.shape)
     # build the search index!
     cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
     search_texts = ['朱日和站', '温都尔站', '国电站']
     for text in search_texts:
         texts_to_id = [[vocab.to_index(word) for word in list(text)]]
         words = torch.LongTensor(texts_to_id)  # convert text to indices
         features_vec = embed(words)
         search_features_vec = features_vec.detach().numpy()
         search_result = cp.search(search_features_vec,
                                   k=2,
                                   k_clusters=2,
                                   return_distance=True)
         print('text:{}'.format(text))
         print('search_result:{}'.format(search_result))
     """
Example #9
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent,new_field_name='words')
        self.max_seq_len = min(self.max_seq_len,conf.max_seq_len)
        self.data_set.apply(lambda x : len(x['words']),new_field_name='seq_len')
        self.train_data,self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self,ins,remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',','。','?','!']:
                if remove_punc:
                    continue
                else:
                    words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words
    
    def split_sent(self,ins,remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words

    def pad_seq(self,ins):
        words = ins['words']
        if(len(words) < self.max_seq_len):
            words = [0]*(self.max_seq_len-len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words
        
    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x : [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = self.vocab.__len__()

        self.train_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.train_data.apply(self.pad_seq,new_field_name='pad_words')
        
        self.test_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.test_data.apply(self.pad_seq,new_field_name='pad_words')
Example #10
def get_dataset(data_path):
    print('Getting dataset...')

    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])

    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    # exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])

    return train_data, dev_data, vocabulary
Example #11
def readdata():
    global target_len
    min_count = 10
    #categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data =  DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw = data[i], label = int(target[i]), words = temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data =  DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw = data[i], label = int(target[i]), words = temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)

    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data,dev_data,test_data,vocab
Example #12
    def test_fastnlp_1min_tutorial(self):
        # tutorials/fastnlp_1min_tutorial.ipynb
        data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        ds = DataSet.read_csv(data_path,
                              headers=('raw_sentence', 'label'),
                              sep='\t')
        print(ds[1])

        # lowercase the raw sentences
        ds.apply(lambda x: x['raw_sentence'].lower(),
                 new_field_name='raw_sentence')
        # convert label to int
        ds.apply(lambda x: int(x['label']),
                 new_field_name='target',
                 is_target=True)

        def split_sent(ins):
            return ins['raw_sentence'].split()

        ds.apply(split_sent, new_field_name='words', is_input=True)

        # split into train/dev sets
        train_data, dev_data = ds.split(0.3)
        print("Train size: ", len(train_data))
        print("Test size: ", len(dev_data))

        from fastNLP import Vocabulary
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

        # index sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words',
            is_input=True)
        dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                       new_field_name='words',
                       is_input=True)

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50),
                        num_classes=5,
                        padding=2,
                        dropout=0.1)

        from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

        trainer = Trainer(model=model,
                          train_data=train_data,
                          dev_data=dev_data,
                          loss=CrossEntropyLoss(),
                          optimizer=Adam(),
                          metrics=AccuracyMetric(target='target'))
        trainer.train()
        print('Train finished!')
Example #13
def load_conll_with_glove(
        data_dir,
        data_path='train.pos',
        glove_path="",
        # glove_path='/remote-home/ygxu/dataset/glove.empty.txt',
        load_glove=True,
        vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    from dataset import MyConllLoader
    ds = MyConllLoader().load(path)
    print(ds)
    ds.rename_field('word_seq', 'sentence')
    ds.rename_field('label_seq', 'label')
    #ds = DataSet.read_pos(path, headers=('sentence', 'label'), sep='\t')

    #ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    #ds.apply(lambda x: x['sentence'].strip().split(), new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='word_seq_origin_len',
             is_input=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000,
                           min_freq=2,
                           unknown='<unk>',
                           padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
        vocab_label = Vocabulary(max_size=200, unknown=None, padding='<pad>')
        ds.apply(lambda x: [vocab_label.add(label) for label in x['label']])
        vocab_label.build_vocab()
    else:
        vocab, vocab_label = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='word_seq',
             is_input=True)
    ds.apply(lambda x: [vocab_label.to_index(w) for w in x['label']],
             new_field_name='truth',
             is_input=True,
             is_target=True)

    if not load_glove:
        print(f"successful load dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove',
                                                vocab)

    print(f"successful load dataset and embedding from {path}")

    return ds, embedding, (vocab, vocab_label)
Example #14
def get_vocabulary(train_data, test_data):
    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['poem']])
    vocab.build_vocab()
    # index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['poem']],
                    new_field_name='words')

    return vocab, train_data, test_data
Example #15
    def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):

        if dataset == "yelp":
            dataset = DataSet()

            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset.append(Instance(text=text, label=label))

            dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
            dataset.apply(lambda x: x['words'] + ['<pad>'] *
                          (17 - len(x['words'])),
                          new_field_name='words')
            dataset.apply(lambda x: int(x['label']),
                          new_field_name='label_seq',
                          is_target=True)

            _train_data, _test_data = dataset.split(0.3)

            _vocab = Vocabulary(min_freq=2)
            _train_data.apply(
                lambda x: [_vocab.add(word) for word in x['words']])
            _vocab.build_vocab()

            _train_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)
            _test_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
Example #16
 def test_rebuild(self):
     # test that words added after build leave the existing word indices unchanged
     vocab = Vocabulary()
     text = [str(idx) for idx in range(10)]
     vocab.update(text)
     for i in text:
         self.assertEqual(int(i) + 2, vocab.to_index(i))
     indexes = []
     for word, index in vocab:
         indexes.append((word, index))
     vocab.add_word_lst([str(idx) for idx in range(10, 13)])
     for idx, pair in enumerate(indexes):
         self.assertEqual(pair[1], vocab.to_index(pair[0]))
     for i in range(13):
         self.assertEqual(int(i) + 2, vocab.to_index(str(i)))
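A tiny standalone sketch (assuming fastNLP's default reserved tokens) of why the test above expects an offset of 2: a default Vocabulary reserves index 0 for '<pad>' and index 1 for '<unk>', so the first user-added word lands at index 2.

from fastNLP import Vocabulary

vocab = Vocabulary()            # defaults: padding='<pad>' at index 0, unknown='<unk>' at index 1
vocab.add_word_lst(['a', 'b'])
vocab.build_vocab()
print(vocab.to_index('<pad>'))  # 0
print(vocab.to_index('<unk>'))  # 1
print(vocab.to_index('a'))      # 2 -- the first user word comes after the two reserved slots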
Example #17
def process_poems(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    # content = line.replace(' ', '').replace(',','').replace('。','')
                    content = line.replace(' ', '')  # keep punctuation
                    if len(content) < 10 or len(content) > sentence_len:
                        continue
                    # print(content)
                    content = content + end_token
                    sentences.append(content)
            except ValueError as e:
                pass

    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
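        # target: the raw sentence shifted left by one character, with the last character repeated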
        instance['target'] = sentence[1:] + sentence[-1]
        dataset.append(instance)

    dataset.set_input("raw_sentence")
    dataset.set_target("target")

    # for iter in dataset:
    #     print(iter)
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')

    # build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()
    print("vocabulary_size:", len(vocab))

    # index sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')

    return train_data, dev_data, vocab
Example #18
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1,
                           unknown='unk',
                           padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)

        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review

        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)

        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)

    return glove_data, matrix, len(glove_data[0]['reviewText'])
Example #19
def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        out = fp.readlines()
        data = list(out)

    poem = []
    cnt = 0
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])

    poem_normalized = []
    for i in range(len(poem)):
        if len(poem[i]) < 80:
            poem[i] = ' ' * (80 - len(poem[i])) + poem[i]
            poem_normalized.append(poem[i])
        else:
            poem_normalized.append(poem[i][:80])

    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)

    vocab.build_vocab()
    dataset = []
    for temp in poem_normalized:
        dataset.append([vocab.to_index(x) for x in temp])
    return vocab, np.array(dataset)
Example #20
def pre_process(file_name):

    poem = []

    with open(file_name, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f.readlines()):
            if index % 2 == 1:
                raw_line = line.strip()

                raw_line = re.sub(',', '', raw_line)
                raw_line = re.sub('。', '', raw_line)

                length = len(raw_line)
                if length < 100:
                    raw_line = raw_line + '~' * (100 - length)

                poem.append(raw_line[:100])

    word_dict = Vocabulary()
    for line in poem:
        for character in line:
            word_dict.add(character)

    word_dict.build_vocab()

    data = []
    for pi in poem:
        p = []
        for ch in pi:
            p.append(word_dict.to_index(ch))
        data.append(p)
    data = np.array(data)

    return word_dict, data
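A hedged decoding sketch for the pair returned above; the file path is an assumption, and it maps an index row back to characters through the vocabulary's idx2word table (printed in an earlier example), stripping the '~' padding marker.

word_dict, data = pre_process('poems.txt')             # assumed input file, poems on every second line
first_row = data[0]
chars = [word_dict.idx2word[idx] for idx in first_row]
print(''.join(chars).rstrip('~'))                      # the first poem without its padding characters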
Example #21
 def test_same_vector(self):
     vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
     embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
     words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE"]]])
     words = embed(words)
     embed_0 = words[0, 0]
     for i in range(1, words.size(1)):
         assert torch.sum(embed_0==words[0, i]).eq(len(embed_0))
Example #22
    def test_index(self):
        vocab = Vocabulary()
        vocab.update(text)
        res = [vocab[w] for w in set(text)]
        self.assertEqual(len(res), len(set(res)))

        res = [vocab.to_index(w) for w in set(text)]
        self.assertEqual(len(res), len(set(res)))
Example #23
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))

    return vocabulary
Example #24
    def test_roberta_ebembedding_2(self):
        # test that only_use_pretrain_vocab and truncate_embed behave correctly
        Embedding = RobertaEmbedding
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a texta and".split())
        embed1 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=True,
                           truncate_embed=True,
                           min_freq=1)
        # embed_bpe_vocab_size = len(vocab)-1 + 2  # excludes NotInBERT, adds ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

        embed2 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=True,
                           truncate_embed=False,
                           min_freq=1)
        # embed_bpe_vocab_size = num_word  # excludes NotInBERT
        # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

        embed3 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=False,
                           truncate_embed=True,
                           min_freq=1)
        # embed_bpe_vocab_size = len(vocab)+2  # adds ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

        embed4 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=False,
                           truncate_embed=False,
                           min_freq=1)
        # embed_bpe_vocab_size = num_word+1  # adds ##a
        # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

        # check that the tensors below are equal in every configuration
        embed1.eval()
        embed2.eval()
        embed3.eval()
        embed4.eval()
        tensor = torch.LongTensor(
            [[vocab.to_index(w) for w in 'this is a texta and'.split()]])
        t1 = embed1(tensor)
        t2 = embed2(tensor)
        t3 = embed3(tensor)
        t4 = embed4(tensor)

        self.assertEqual((t1 - t2).sum(), 0)
        self.assertEqual((t1 - t3).sum(), 0)
        self.assertEqual((t1 - t4).sum(), 0)
Example #25
def get_vocabulary(dataset):
    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    # vocabulary.add_word('<eos>')
    # vocabulary.add_word('<start>')

    dataset.apply(lambda x: [vocabulary.add(word) for word in x['input']])
    vocabulary.build_vocab()

    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))
    return vocabulary
Example #26
def load_dataset(
    data_dir='/remote-home/ygxu/workspace/Product_all',
    data_path='mr.task.train',
    # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
    bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):

    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')

    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')

    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq',
             is_target=True)

    def transfer_bert_to_fastnlp(ins):
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])

    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x:
             [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words',
             is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks',
             is_input=True)

    return ds
Example #27
 def test_same_vector2(self):
     vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"])
     embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                             lower=True)
     words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]])
     words = embed(words)
     embed_0 = words[0, 0]
     for i in range(1, 3):
         assert torch.sum(embed_0==words[0, i]).eq(len(embed_0))
     embed_0 = words[0, 3]
     for i in range(3, 5):
         assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
Example #28
    def test_from_dataset(self):
        start_char = 65
        num_samples = 10

        # 0 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=chr(start_char + i))
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 1 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[chr(start_char + i)] * 6)
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 2 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[[chr(start_char + i) for _ in range(6)]
                                 for _ in range(6)])
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')
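The same from_dataset / index_dataset pattern as a standalone sketch outside the unittest harness; the field name and toy data are illustrative.

from fastNLP import DataSet, Instance, Vocabulary

ds = DataSet()
for ch in 'ABCD':
    ds.append(Instance(char=[ch] * 3))        # a 1-dim character field, as in the second case above
vocab = Vocabulary()
vocab.from_dataset(ds, field_name='char')     # build the vocabulary from that field
vocab.index_dataset(ds, field_name='char')    # replace characters with their indices in place
print(ds[0]['char'])                          # e.g. [2, 2, 2], given the two reserved default entries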
Example #29
 def test_fit(self):
     """文本编码.
     """
     print('{} test_fit {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in ['朱日和', '东台变']]
     print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
     words = torch.LongTensor(texts_to_id)  # convert text to indices
     print(embed(words).size())  # torch.Size([2, 3, 100])
Example #30
def load_dataset_with_glove(data_dir,
                            data_path='mr.task.train',
                            glove_path="",
                            load_glove=True,
                            vocabs=None):
    path = os.path.join(data_dir, data_path)
    print(f"start load dataset from {path}.")

    ds = DataSet.read_csv(path, headers=('label', 'sentence'), sep='\t')

    ds.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    ds.apply(lambda x: x['sentence'].strip().split(),
             new_field_name='sentence')
    ds.apply(lambda x: len(x['sentence']) * [1.],
             new_field_name='mask',
             is_input=True)
    ds.apply(lambda x: int(x['label']), new_field_name='label', is_target=True)

    if vocabs is None:
        vocab = Vocabulary(max_size=30000,
                           min_freq=2,
                           unknown='<unk>',
                           padding='<pad>')
        ds.apply(lambda x: [vocab.add(word) for word in x['sentence']])
        vocab.build_vocab()
    else:
        vocab = vocabs

    ds.apply(lambda x: [vocab.to_index(w) for w in x['sentence']],
             new_field_name='data',
             is_input=True)

    if not load_glove:
        print(f"successful load dataset from {path}")
        return ds

    embedding, _ = EmbedLoader().load_embedding(300, glove_path, 'glove',
                                                vocab)

    print(f"successful load dataset and embedding from {path}")

    return ds, embedding, vocab