Example #1
 def load(self, paths: Union[str, Dict[str, str]] = None, ratio_tr_d_te: tuple = ratio_tr_d_te) -> DataBundle:
     '''
     :param paths: if a str, all samples are loaded as the training set and then split
                   into train/val/test by the given ratio (8:1:1 by default);
                   if a Dict[str, str], the train/val/test sets are picked via the keys
                   'train', 'val' and 'test' ('train' is required); if 'val' or 'test'
                   is missing, a portion of 'train' is split off to serve as the
                   validation/test set.
     :param ratio_tr_d_te: train/val/test split ratio (default taken from the enclosing module's ratio_tr_d_te).
     :return: DataBundle
     '''
     paths = self.check_loader_paths(paths)  # paths is now a dict like {'train': XXX, ...}
     datasets = {name: self._load(path=path) for name, path in paths.items()}
     # shuffle every dataset
     for name, ds in datasets.items():
         shuffled_ds = DataSet()
         indices = list(range(len(ds)))
         random.shuffle(indices)
         for i in indices:
             shuffled_ds.append(ds[i])
         datasets[name] = shuffled_ds
     # shuffling done
     if len(datasets) == 1:
         print('Only a train dataset was loaded; splitting it into train/val/test (8:1:1 by default).')
         ds = datasets['train']
         train_count = int(len(ds) * (ratio_tr_d_te[0] / sum(ratio_tr_d_te)))
         test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:train_count], 'val': ds[train_count:-test_count], 'test': ds[-test_count:]})
     elif len(datasets) == 3:
         print('Found train, test and val; no split from train is needed.')
         return DataBundle(datasets=datasets)
     elif 'val' not in datasets:
         print('Found train and test; splitting val off from train.')
         ds = datasets['train']
         val_count = int(len(ds) * (ratio_tr_d_te[1] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:-val_count], 'val': ds[-val_count:], 'test': datasets['test']})
     elif 'test' not in datasets:
         print('Found train and val; splitting test off from train.')
         ds = datasets['train']
         test_count = int(len(ds) * (ratio_tr_d_te[2] / sum(ratio_tr_d_te)))
         return DataBundle(datasets={'train': ds[:-test_count], 'val': datasets['val'], 'test': ds[-test_count:]})
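
The proportional split above can be shown in isolation. Below is a minimal, self-contained sketch of the same arithmetic; the function name split_by_ratio and the toy data are assumptions for illustration, not part of the original loader.

import random

def split_by_ratio(samples, ratio=(8, 1, 1), seed=None):
    # shuffle a copy of the samples, then cut it into train/val/test using
    # the same proportional arithmetic as the loader above
    rng = random.Random(seed)
    shuffled = list(samples)
    rng.shuffle(shuffled)
    total = sum(ratio)
    train_count = int(len(shuffled) * ratio[0] / total)
    test_count = int(len(shuffled) * ratio[2] / total)
    return {
        'train': shuffled[:train_count],
        'val': shuffled[train_count:len(shuffled) - test_count],
        'test': shuffled[len(shuffled) - test_count:],
    }

parts = split_by_ratio(range(100), seed=0)
print({name: len(part) for name, part in parts.items()})  # {'train': 80, 'val': 10, 'test': 10}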
Example #2
File: loader.py  Project: yhcc/BertForRD
 def load(self, folder):
     data_bundle = DataBundle()
     # each JSON file becomes one DataSet, keyed by the file name without its extension
     for name in ['desc.json', 'dev.json', 'seen.json', 'train.json', 'unseen.json']:
         path = os.path.join(folder, name)
         dataset = DataSet()
         with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
             for d in data:
                 word = d['word'].lower()
                 definition = d['definitions'].lower()
                 ins = Instance(word=word, definition=definition)
                 dataset.append(ins)
             data_bundle.set_dataset(dataset, name=name.split('.')[0])
     # the target-word list is attached to the bundle as an extra attribute
     words = []
     with open(os.path.join(folder, 'target_words.txt'), 'r', encoding='utf-8') as f:
         for line in f:
             line = line.strip()
             if line:
                 words.append(line)
     setattr(data_bundle, 'target_words', words)
     return data_bundle
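
One way the returned bundle might be consumed is sketched below; the loader class name BertForRDLoader and the folder path are placeholders, and get_dataset is assumed to be the usual fastNLP DataBundle accessor.

loader = BertForRDLoader()              # hypothetical name for the class defining load() above
bundle = loader.load('data/')           # folder containing desc.json, dev.json, ..., target_words.txt
train_ds = bundle.get_dataset('train')  # DataSet built from train.json
print(len(train_ds), len(bundle.target_words))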
Example #3
    def process(self, paths):
        """
        :param paths: a single file path or a dict mapping split names to paths,
                      as accepted by check_dataloader_paths
        :return:
            DataBundle whose DataSets contain the following fields:
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                pun_masks:
                gold_word_pairs:
                gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            print(name, path)
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count)==0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            # bind the label vocabulary via a local partial instead of rebinding the module-level function
            label_word_pairs_fn = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(label_word_pairs_fn, new_field_name='gold_label_word_pairs', ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                        [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars',
                                  'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels',
                                   'char_heads',
                                   'pun_masks', 'gold_label_word_pairs')

        return data
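
A hedged usage sketch for the pipeline above; the pipe class name and the file paths are placeholders, not taken from the original project.

pipe = CharParserPipe()   # hypothetical class defining the process()/load() shown above
data = pipe.process({'train': 'train.conllx', 'dev': 'dev.conllx', 'test': 'test.conllx'})
print(data.vocabs['char_labels'])                    # label vocabulary shared across all splits
print(data.datasets['train'][0]['gold_word_pairs'])  # fields listed in the docstring are now populated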