Example #1
    def test_init_assert(self):
        with self.assertRaises(AssertionError):
            _ = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 100})
        with self.assertRaises(AssertionError):
            _ = DataSet([[1, 2, 3, 4]] * 10)
        with self.assertRaises(ValueError):
            _ = DataSet(0.00001)
Example #2
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i],
                     target=int(raw_train.target[i])))

    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                    new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i],
                     target=int(raw_test.target[i])))

    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                   new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, vocab
Example #3
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # set the input (feature) and target (label) fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
Example #4
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)

    dic_train = {
        "input" : dataset_train.data,
        "target" : dataset_train.target
    }
    dic_test = {
        "input" : dataset_test.data,
        "target" : dataset_test.target
    }

    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')


    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)

    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])

    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)

    vocab.index_dataset(train_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT,new_field_name=Const.INPUT)

    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
Example #5
    def test_get_item_error(self):
        with self.assertRaises(RuntimeError):
            ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
            _ = ds[40:]

        with self.assertRaises(KeyError):
            ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
            _ = ds["kom"]
Example #6
def readdata():
    global target_len
    min_count = 10
    #categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)

    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
Example #7
    def test_copy_padder(self):
        from fastNLP.core.field import AutoPadder
        ds = DataSet()
        ds.add_field('idx', [1, 2, 3])
        ds['idx'].set_padder(None)  # workaround of problem 1
        ds.apply_field(lambda x: x, 'idx', 'idx')
        self.assertEqual(ds['idx'].padder,
                         None)  # should be None, but AutoPadder

        ds = DataSet()
        ds.add_field('idx', [1, 2, 3])
        ds.apply_field(lambda x: x, 'idx', 'idx')
        self.assertTrue(
            isinstance(ds.get_field('idx').padder,
                       AutoPadder))  # should be None, but AutoPadder
Example #8
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(
                        Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]],
                        field_name='ori_words',
                        new_field_name='task')
    dataset.apply_field(lambda x: len(x),
                        field_name='ori_tags',
                        new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x),
                        field_name='ori_words',
                        new_field_name="bi1")
    return dataset
Example #9
def is_phrase_match_BERT(phrase1, phrase2):
    """
    Determine if two phrases match
    :param phrase1: phrase1
    :param phrase2: phrase2
    """
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen
    # 0 for not match, 1 for related, 2 for match
    testset = DataSet({"raw_words": [f"{phrase1}::{phrase2}"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)
    with torch.no_grad():
        bert_model.eval()
        test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
        outputs = []
        for batch_x, batch_y in test_batch:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            outputs.append(bert_model.forward(batch_x["word_pieces"], batch_x["word_nums"], batch_x["seq_len"])['pred'])
        outputs = torch.cat(outputs)
        outputs = torch.nn.functional.softmax(outputs, dim=1)
        return ["Not Match", "Related", "Match"][outputs.argmax().item()]
Example #10
    def test_delete_field(self):
        dd = DataSet()
        dd.add_field("x", [[1, 2, 3]] * 10)
        dd.add_field("y", [[1, 2, 3, 4]] * 10)
        dd.delete_field("x")
        self.assertFalse("x" in dd.field_arrays)
        self.assertTrue("y" in dd.field_arrays)
Example #11
    def test_append(self):
        dd = DataSet()
        for _ in range(3):
            dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
        self.assertEqual(len(dd), 3)
        self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
        self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
Example #12
def load(path):

    data = DataSet()
    _data = []

    with open(path, "r", encoding="utf-8") as fil:
        fil.readline()

        for line in fil:
            try:
                tradi, verna = line.strip().split("\t")
            except ValueError:
                continue

            tradi = chinese_tokenizer(tradi)
            verna = chinese_tokenizer(verna)

            vocab.add_word_lst(tradi)
            vocab.add_word_lst(verna)

            _data.append(Instance(traditional=tradi, vernacular=verna))

    random.shuffle(_data)
    for x in _data:
        data.append(x)

    data.set_input("vernacular")
    data.set_target("traditional")
    return data
Example #13
def process_data(data_path, data_name, test=False, bert=False, input_name='text', target_name='target'):
    print('Processing', data_name)

    schemas = {}
    with open(os.path.join(data_path, "all_50_schemas"), 'rb') as f:
        for i, line in enumerate(f):
            spo = json.loads(line)
            schemas[spo['subject_type'] + spo['predicate'] + spo['object_type']] = i

    # input
    text = []
    # target
    target = []
    with open(os.path.join(data_path, data_name), 'rb') as f:
        for line in f:
            dic = json.loads(line)
            if bert:
                text.append(dic['text'])
            else:
                text.append(list(dic['text']))
            if not test:
                target.append(process_class(schemas, dic['spo_list']))

    if not test:
        data_dict = {
            input_name: text,
            target_name: target
        }
    else:
        data_dict = {input_name: text}
    dataset = DataSet(data=data_dict)
    print('Len', len(dataset))
    print('Sample', dataset[0])
    #exit()
    return dataset
Example #14
def make_dataset(data):
    dataset = DataSet()
    tot = 0
    for x in data:

        seq = "[CLS] " + x["raw_text"]
        seq = tokenizer.encode(seq)
        """
        seq=["[CLS]"]+word_tokenize(x["raw_text"])
        seq=tokenizer.convert_tokens_to_ids(seq)
        """
        if len(seq) > 512:
            seq = seq[:512]
            tot += 1
            # print(x["raw_text"])
            # print()

        label = int(x["label"])
        ins = Instance(origin=x["raw_text"],
                       seq=seq,
                       label=label,
                       seq_len=len(seq))
        dataset.append(ins)

    dataset.set_input("seq", "seq_len")
    dataset.set_target("label")
    print(dataset[5])
    print("number:", len(dataset), tot)
    print()
    return dataset
Example #15
    def test_roberta_embed_eq_roberta_piece_encoder(self):
        # mainly check that the embedding output is consistent with the word piece encoder output
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        ds = DataSet({
            'words': ["this is a texta a sentence".split(), 'this is'.split()]
        })
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 pool_method='first',
                                 include_cls_sep=True,
                                 pooled_cls=False)
        embed.eval()
        words_res = embed(words)

        # check that the word pieces line up as expected
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #16
    def load(self, folder):
        fns = {
            'dev':'{}_dev.csv'.format(self.lg1_lg2),
            'test':'{}_test500.csv'.format(self.lg1_lg2),
            'train': '{}_train500_10.csv'.format(self.lg1_lg2)
        }
        target_lg = self.lg1_lg2.split('_')[0]
        data_bundle = DataBundle()
        for name, fn in fns.items():
            path = os.path.join(folder, fn)
            ds = DataSet()
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        parts = line.split('\t')
                        if self.lower:
                            ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                        else:
                            ins = Instance(word=parts[1], definition=parts[-1])
                        ds.append(ins)
            data_bundle.set_dataset(ds, name=name)
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())

        setattr(data_bundle, 'target_words', target_words)
        return data_bundle
Example #17
def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                  'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns',
                  'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

    dataset = DataSet()

    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i], target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i], target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)

    return vocab, train_data, dev_data, test_data
Example #18
    def load(self, path: str, bigram: bool = False) -> DataSet:
        """
        :param path: str
        :param bigram: whether to add bigram features
        :return:
        """
        dataset = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:  # skip empty lines
                    continue
                parts = line.split()
                word_lens = map(len, parts)
                chars = list(''.join(parts))
                tags = self._word_len_to_target(word_lens)
                assert len(chars) == len(tags['target'])
                dataset.append(
                    Instance(raw_chars=chars, **tags, seq_len=len(chars)))
        if len(dataset) == 0:
            raise RuntimeError(f"{path} has no valid data.")
        if bigram:
            dataset.apply_field(self._gen_bigram,
                                field_name='raw_chars',
                                new_field_name='bigrams')
        return dataset
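A hedged usage sketch for the loader method above; "loader" stands in for an instance of whatever loader class defines _word_len_to_target and _gen_bigram, and the file path is illustrative:

# Hypothetical: "loader" is an instance of the class this method belongs to.
ds = loader.load('train.txt', bigram=True)
print(len(ds))
print(ds[0]['raw_chars'], ds[0]['target'], ds[0]['seq_len'])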
Example #19
def get_joke_data(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if joke['support'] > 0:
                    if len(joke['content']) == 0:
                        continue
                    else:
                        instance = Instance(raw_joke=joke['content'])
                        data_set.append(instance)
                        sample_num += 1
                        sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't exist.")
    print("Got {} samples from file.".format(sample_num))
    import random
    for i in range(5):
        id = random.randint(0, sample_num - 1)
        print("sample {}: {}".format(id, data_set[id]['raw_joke']))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(sample_len, bins=50, range=(0, 1000))
    plt.savefig("./examples.jpg")
    count = 0
    for i in sample_len:
        if i < 255:
            count += 1
    print(count, '/', len(sample_len))
    return data_set
Example #20
def data_analysis(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    scores = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if len(joke['content']) > 0:
                    scores.append(joke['support'])
                    sample_num += 1
                    sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't  exit.")
    print("Got {} samples from file.".format(sample_num))
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(scores, bins=50, range=(0, 1500))
    plt.savefig("./sample_scores.jpg")
    count = 0
    for i in scores:
        if i >= 3:
            count += 1
    print(count, '/', len(sample_len))
    return
Example #21
    def test_bert_embed_eq_bert_piece_encoder(self):
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert')
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            pool_method='first',
            include_cls_sep=True,
            pooled_cls=False,
            min_freq=1)
        embed.eval()
        words_res = embed(words)

        # check that the word pieces line up as expected
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #22
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(),
                        field_name='sentence',
                        new_field_name='words')
    dataset.apply_field(lambda x: len(x),
                        field_name='words',
                        new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)

    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
Example #23
def generate_fake_dataset(num_samples=1000):
    """
    The generated DataSet contains the following fields: {'0': [], '1': [], '2': [], '3': []}
    :param num_samples: number of samples
    :return:
    """

    max_len = 50
    min_len = 10
    num_features = 4

    data_dict = {}
    for i in range(num_features):
        data = []
        lengths = np.random.randint(min_len, max_len, size=(num_samples))
        for length in lengths:
            data.append(np.random.randint(1, 100, size=length))
        data_dict[str(i)] = data

    dataset = DataSet(data_dict)

    for i in range(num_features):
        if np.random.randint(2) == 0:
            dataset.set_input(str(i))
        else:
            dataset.set_target(str(i))
    return dataset
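A short sketch of how the randomly generated DataSet might be batched, reusing the DataSetIter pattern from Example #9; names and sizes here are illustrative:

# Sketch only: build a small fake DataSet and iterate it in batches.
from fastNLP import DataSetIter

ds = generate_fake_dataset(num_samples=8)
print(ds)  # shows fields '0'..'3' and their input/target flags
for batch_x, batch_y in DataSetIter(dataset=ds, batch_size=4):
    print(list(batch_x.keys()), list(batch_y.keys()))
    break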
Example #24
    def test_apply_more(self):

        T = DataSet({"a": [1, 2, 3], "b": [2, 4, 5]})
        func_1 = lambda x: {"c": x["a"] * 2, "d": x["a"]**2}
        func_2 = lambda x: {"c": x * 3, "d": x**3}

        def func_err_1(x):
            if x["a"] == 1:
                return {"e": x["a"] * 2, "f": x["a"]**2}
            else:
                return {"e": x["a"] * 2}

        def func_err_2(x):
            if x == 1:
                return {"e": x * 2, "f": x**2}
            else:
                return {"e": x * 2}

        T.apply_more(func_1)
        self.assertEqual(list(T["c"]), [2, 4, 6])
        self.assertEqual(list(T["d"]), [1, 4, 9])

        res = T.apply_field_more(func_2, "a", modify_fields=False)
        self.assertEqual(list(T["c"]), [2, 4, 6])
        self.assertEqual(list(T["d"]), [1, 4, 9])
        self.assertEqual(list(res["c"]), [3, 6, 9])
        self.assertEqual(list(res["d"]), [1, 8, 27])

        with self.assertRaises(ApplyResultException) as e:
            T.apply_more(func_err_1)
            print(e)

        with self.assertRaises(ApplyResultException) as e:
            T.apply_field_more(func_err_2, "a")
            print(e)
Example #25
def read_instances_from_file(file, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    dataset = DataSet()
    trimmed_sent = 0

    with open(file) as f:
        lines = f.readlines()
        for l in lines:
            l = l.strip().split('\t')
            if len(l) < 2:
                continue
            label = int(l[0])
            sent = l[1]
            if not keep_case:
                sent = sent.lower()
            word_lst = sent.split()
            if len(word_lst) > max_len:
                word_lst = word_lst[:max_len]
                trimmed_sent += 1
            if word_lst:
                dataset.append(Instance(words=word_lst, label=label))

    logger.info('Get {} instances from file {}'.format(len(dataset), file))
    if trimmed_sent:
        logger.info('{} sentences are trimmed. Max sentence length: {}.'
                    .format(trimmed_sent, max_len))

    return dataset
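A hedged follow-up sketch for read_instances_from_file, mirroring the Vocabulary steps used elsewhere on this page; the file name is a hypothetical tab-separated "label<TAB>sentence" file:

# Hypothetical path; the 'words' and 'label' fields come from the Instance above.
train_ds = read_instances_from_file('train.tsv', max_len=400)
vocab = Vocabulary(min_freq=2).from_dataset(train_ds, field_name='words')
vocab.index_dataset(train_ds, field_name='words', new_field_name='words')
train_ds.set_input('words')
train_ds.set_target('label')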
Example #26
    def test_drop(self):
        ds = DataSet({
            "x": [[1, 2, 3, 4]] * 40,
            "y": [[5, 6], [7, 8, 9, 0]] * 20
        })
        ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
        self.assertEqual(len(ds), 20)
Example #27
    def test_add_field_v2(self):
        ds = DataSet({"x": [3, 4]})
        ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']],
                     is_input=True,
                     is_target=True)
        # ds.apply(lambda x:[x['x']]*3, is_input=True, is_target=True, new_field_name='y')
        print(ds)
Example #28
    def test_save_load(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        ds.save("./my_ds.pkl")
        self.assertTrue(os.path.exists("./my_ds.pkl"))

        ds_1 = DataSet.load("./my_ds.pkl")
        os.remove("my_ds.pkl")
Example #29
    def test_eq_transformers(self):
        weight_path = ''
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        import transformers
        input1 = ' '.join(ds[0]['words'])
        input2 = ' '.join(ds[1]['words'])
        tokenizer = transformers.RobertaTokenizer.from_pretrained(weight_path)
        idx_list1 = tokenizer.encode(input1)
        idx_list2 = tokenizer.encode(input2)
        self.assertEqual(idx_list1, ds[0]['word_pieces'])
        self.assertEqual(idx_list2, ds[1]['word_pieces'])

        pad_value = tokenizer.encode('<pad>')[0]
        tensor = torch.nn.utils.rnn.pad_sequence(
            [torch.LongTensor(idx_list1),
             torch.LongTensor(idx_list2)],
            batch_first=True,
            padding_value=pad_value)
        roberta = transformers.RobertaModel.from_pretrained(
            weight_path, output_hidden_states=True)
        roberta.eval()
        output, pooled_output, hidden_states = roberta(
            tensor, attention_mask=tensor.ne(pad_value))

        self.assertEqual((output - word_pieces_res).sum(), 0)
Example #30
def get_data_bmeso(dataset):
    path = bmeso_data_path + dataset + '.char.bmes'
    data = {'raw_chars': [], 'target': [], 'seq_len': [], 'corpus': [], 'chars': []}
    with open(path, encoding='UTF-8') as file:
        raw_sentence = []
        tags = []
        for line in file:
            if line == '\n' and len(raw_sentence) > 0:
                data['raw_chars'].append(''.join(raw_sentence))
                data['target'].append(tags)
                data['seq_len'].append(len(tags))
                data['corpus'].append('NER-Onto')
                data['chars'].append(raw_sentence)
                raw_sentence = []
                tags = []
            else:
                word, tag = line.strip().split()
                word = process_word(word)
                raw_sentence.append(word)
                if tag.endswith('-PER'):
                    tag = tag[0] + '-NR'
                elif tag.endswith('-LOC'):
                    tag = tag[0] + '-NS'
                elif tag.endswith('-GPE'):
                    tag = tag[0] + '-NS'
                elif tag.endswith('-ORG'):
                    tag = tag[0] + '-NT'
                tags.append(tag)
    data = DataSet(data)
    return data