示例#1
0
    def process(self, paths, **kwargs):
        data_info = DataBundle()
        for name in ['train', 'test', 'dev']:
            data_info.datasets[name] = self.load(paths[name])

        config = Config()
        vocab = Vocabulary().from_dataset(*data_info.datasets.values(),
                                          field_name='sentences')
        vocab.build_vocab()
        word2id = vocab.word2idx

        char_dict = preprocess.get_char_dict(config.char_path)
        data_info.vocabs = vocab

        genres = {
            g: i
            for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])
        }

        for name, ds in data_info.datasets.items():
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[0],
                new_field_name='doc_np')
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[1],
                new_field_name='char_index')
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[2],
                new_field_name='seq_len')
            ds.apply(lambda x: preprocess.speaker2numpy(
                x["speakers"], config.max_sentences, is_train=name == 'train'),
                     new_field_name='speaker_ids_np')
            ds.apply(lambda x: genres[x["doc_key"][:2]],
                     new_field_name='genre')

            ds.set_ignore_type('clusters')
            ds.set_padder('clusters', None)
            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre",
                         "char_index", "seq_len")
            ds.set_target("clusters")

        # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
        # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)

        return data_info
示例#2
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):
      
        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
示例#3
0
    def process(self, paths):
        
        def get_seq_len(instance):
            return len(instance['article'])

        print('Start loading datasets !!!')
        start = time()

        # load datasets
        datasets = {}
        for name in paths:
            datasets[name] = self._load(paths[name])
            
            datasets[name].apply(get_seq_len, new_field_name='seq_len')

            # set input and target
            datasets[name].set_input('article', 'segment_id', 'cls_id')
            datasets[name].set_target(Const.TARGET)
        
            # set padding value
            datasets[name].set_pad_val('article', 0)
            datasets[name].set_pad_val('segment_id', 0)
            datasets[name].set_pad_val('cls_id', -1)
            datasets[name].set_pad_val(Const.TARGET, 0)

        print('Finished in {}'.format(timedelta(seconds=time()-start)))

        return DataBundle(datasets=datasets)
示例#4
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
        
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
示例#5
0
    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataBundle(datasets=self.load(paths))
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        src_vocab.index_dataset(
            *info.datasets.values(),
            field_name=input_name, new_field_name=input_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
        info.vocabs = {
            input_name: src_vocab,
            target_name: tgt_vocab
        }


        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb


        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)
        return info
示例#6
0
    def process(self, paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False,
                split_dev_op=True
                ):
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs={}
        #就分隔为char形式
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar, field_name="words",new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(),field_name=input_name, new_field_name=input_name)
            info.vocabs[input_name]=src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)

        info.vocabs[target_name]=tgt_vocab

        if split_dev_op:
            info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False)

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
示例#7
0
    def process(self, paths):
        
        def truncate_articles(instance, max_nsents=self.max_nsents, max_ntokens=self.max_ntokens):
            article = [' '.join(sent.lower().split()[:max_ntokens]) for sent in instance['article']]
            return article[:max_nsents]

        def truncate_labels(instance):
            label = list(filter(lambda x: x < len(instance['article']), instance['label']))
            return label
        
        def bert_tokenize(instance, tokenizer, max_len, pad_value):
            article = instance['article']
            article = ' [SEP] [CLS] '.join(article)
            word_pieces = tokenizer.tokenize(article)[:(max_len - 2)]
            word_pieces = ['[CLS]'] + word_pieces + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
            while len(token_ids) < max_len:
                token_ids.append(pad_value)
            assert len(token_ids) == max_len
            return token_ids

        def get_seg_id(instance, max_len, sep_id):
            _segs = [-1] + [i for i, idx in enumerate(instance['article']) if idx == sep_id]
            segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
            segment_id = []
            for i, length in enumerate(segs):
                if i % 2 == 0:
                    segment_id += length * [0]
                else:
                    segment_id += length * [1]
            while len(segment_id) < max_len:
                segment_id.append(0)
            return segment_id
        
        def get_cls_id(instance, cls_id):
            classification_id = [i for i, idx in enumerate(instance['article']) if idx == cls_id]
            return classification_id
        
        def get_labels(instance):
            labels = [0] * len(instance['cls_id'])
            label_idx = list(filter(lambda x: x < len(instance['cls_id']), instance['label']))
            for idx in label_idx:
                labels[idx] = 1
            return labels

        datasets = {}
        for name in paths:
            datasets[name] = self._load(paths[name])
            
            # remove empty samples
            datasets[name].drop(lambda ins: len(ins['article']) == 0 or len(ins['label']) == 0)
            
            # truncate articles
            datasets[name].apply(lambda ins: truncate_articles(ins, self.max_nsents, self.max_ntokens), new_field_name='article')
            
            # truncate labels
            datasets[name].apply(truncate_labels, new_field_name='label')
            
            # tokenize and convert tokens to id
            datasets[name].apply(lambda ins: bert_tokenize(ins, self.tokenizer, self.max_len, self.pad_id), new_field_name='article')
            
            # get segment id
            datasets[name].apply(lambda ins: get_seg_id(ins, self.max_len, self.sep_id), new_field_name='segment_id')
            
            # get classification id
            datasets[name].apply(lambda ins: get_cls_id(ins, self.cls_id), new_field_name='cls_id')

            # get label
            datasets[name].apply(get_labels, new_field_name='label')
            
            # rename filed
            datasets[name].rename_field('article', Const.INPUTS(0))
            datasets[name].rename_field('segment_id', Const.INPUTS(1))
            datasets[name].rename_field('cls_id', Const.INPUTS(2))
            datasets[name].rename_field('lbael', Const.TARGET)

            # set input and target
            datasets[name].set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2))
            datasets[name].set_target(Const.TARGET)
            
            # set paddding value
            datasets[name].set_pad_val('article', 0)

        return DataBundle(datasets=datasets)
示例#8
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否将所有字母转为小写。
        :return:
        """
        # 读取数据
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # 对construct vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # 对target建vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
示例#9
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = True) -> DataBundle:
        """
        读取并处理数据。返回的DataInfo包含以下的内容
            vocabs:
                word: Vocabulary
                target: Vocabulary
            datasets:
                train: DataSet
                    words: List[int], 被设置为input
                    target: int. label,被同时设置为input和target
                    seq_len: int. 句子的长度,被同时设置为input和target
                    raw_words: List[str]
                xxx(根据传入的paths可能有所变化)

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否使用小写
        :return:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # 对construct vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(*data.datasets.values(),
                                    field_name='raw_words')
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # 对target建vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
示例#10
0
    def process(self, paths):
        """
        
        :param paths: 
        :return:
            Dataset包含以下的field
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                gold_word_pairs:
                seg_targets:
                seg_masks:
                char_labels:
                char_heads:
                pun_masks:
                gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count)==0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            global add_label_word_pairs
            add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs, new_field_name='gold_label_word_pairs', ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                        [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars',
                                  'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels',
                                   'char_heads',
                                   'pun_masks', 'gold_label_word_pairs')

        return data
示例#11
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                char_vocab_opt: VocabularyOption = None,
                char_embed_opt: EmbeddingOption = None,
                bigram_vocab_opt: VocabularyOption = None,
                bigram_embed_opt: EmbeddingOption = None,
                L: int = 4):
        """
        支持的数据格式为一行一个sample,并且用空格隔开不同的词语。例如

        Option::

            共同  创造  美好  的  新  世纪  ——  二○○一年  新年  贺词
            (  二○○○年  十二月  三十一日  )  (  附  图片  1  张  )
            女士  们  ,  先生  们  ,  同志  们  ,  朋友  们  :

        paths支持两种格式,第一种是str,第二种是Dict[str, str].

        Option::

            # 1. str类型
            # 1.1 传入具体的文件路径
            data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # 将读取data.txt的内容
            # 包含以下的内容data.vocabs['chars']:Vocabulary对象,
            #             data.vocabs['target']: Vocabulary对象,根据encoding_type可能会没有该值
            #             data.embeddings['chars']: Embedding对象. 只有提供了预训练的词向量的路径才有该项
            #             data.datasets['train']: DataSet对象
            #                   包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            # 1.2 传入一个目录, 里面必须包含train.txt文件
            data = SigHanLoader('bmes').process('path/to/cws/') #将尝试在该目录下读取 train.txt, test.txt以及dev.txt
            # 包含以下的内容data.vocabs['chars']: Vocabulary对象
            #             data.vocabs['target']:Vocabulary对象
            #             data.embeddings['chars']: 仅在提供了预训练embedding路径的情况下,为Embedding对象;
            #             data.datasets['train']: DataSet对象
            #                    包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            #             data.datasets['dev']: DataSet对象,如果文件夹下包含了dev.txt;内容与data.datasets['train']一样

            # 2. dict类型, key是文件的名称,value是对应的读取路径. 必须包含'train'这个key
            paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'}
            data = SigHanLoader(paths).process(paths)
            # 结果与传入目录时是一致的,但是可以传入多个数据集。data.datasets中的key将与这里传入的一致

        :param paths: 支持传入目录,文件路径,以及dict。
        :param char_vocab_opt: 用于构建chars的vocabulary参数,默认为min_freq=2
        :param char_embed_opt: 用于读取chars的Embedding的参数,默认不读取pretrained的embedding
        :param bigram_vocab_opt: 用于构建bigram的vocabulary参数,默认不使用bigram, 仅在指定该参数的情况下会带有bigrams这个field。
            为List[int], 每个instance长度与chars一样, abcde的bigram为ab bc cd de e<eos>
        :param bigram_embed_opt: 用于读取预训练bigram的参数,仅在传入bigram_vocab_opt有效
        :param L: 当target_type为shift_relay时传入的segment长度
        :return:
        """
        # 推荐大家使用这个check_data_loader_paths进行paths的验证
        paths = check_dataloader_paths(paths)
        datasets = {}
        data = DataBundle()
        bigram = bigram_vocab_opt is not None
        for name, path in paths.items():
            dataset = self.load(path, bigram=bigram)
            datasets[name] = dataset
        input_fields = []
        target_fields = []
        # 创建vocab
        char_vocab = Vocabulary(
            min_freq=2) if char_vocab_opt is None else Vocabulary(
                **char_vocab_opt)
        char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
        char_vocab.index_dataset(*datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name='chars')
        data.vocabs[Const.CHAR_INPUT] = char_vocab
        input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
        target_fields.append(Const.TARGET)
        # 创建target
        if self.target_type == 'bmes':
            target_vocab = Vocabulary(unknown=None, padding=None)
            target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 +
                                      ['S'])
            target_vocab.index_dataset(*datasets.values(), field_name='target')
            data.vocabs[Const.TARGET] = target_vocab
        if char_embed_opt is not None:
            char_embed = EmbedLoader.load_with_vocab(**char_embed_opt,
                                                     vocab=char_vocab)
            data.embeddings['chars'] = char_embed
        if bigram:
            bigram_vocab = Vocabulary(**bigram_vocab_opt)
            bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
            bigram_vocab.index_dataset(*datasets.values(),
                                       field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            if bigram_embed_opt is not None:
                bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt,
                                                           vocab=bigram_vocab)
                data.embeddings['bigrams'] = bigram_embed
            input_fields.append('bigrams')
        if self.target_type == 'shift_relay':
            func = partial(self._clip_target, L=L)
            for name, dataset in datasets.items():
                res = dataset.apply_field(func, field_name='target')
                relay_target = [res_i[0] for res_i in res]
                relay_mask = [res_i[1] for res_i in res]
                dataset.add_field('relay_target',
                                  relay_target,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
                dataset.add_field('relay_mask',
                                  relay_mask,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
        if self.target_type == 'shift_relay':
            input_fields.extend(['end_seg_mask'])
            target_fields.append('start_seg_mask')
        # 将dataset加入DataInfo
        for name, dataset in datasets.items():
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)
            data.datasets[name] = dataset

        return data
示例#12
0
    def process(
        self,
        paths: Union[str, Dict[str, str]],
        dataset_name: str = None,
        to_lower=False,
        seq_len_type: str = None,
        bert_tokenizer: str = None,
        cut_text: int = None,
        get_index=True,
        auto_pad_length: int = None,
        auto_pad_token: str = '<pad>',
        set_input: Union[list, str, bool] = True,
        set_target: Union[list, str, bool] = True,
        concat: Union[str, list, bool] = None,
    ) -> DataBundle:
        """
        :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹,
            则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和
            对应的全路径文件名。
        :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义
            这个数据集的名字,如果不定义则默认为train。
        :param bool to_lower: 是否将文本自动转为小写。默认值为False。
        :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` :
            提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和
            attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len
        :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径
        :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。
        :param bool get_index: 是否需要根据词表将文本转为index
        :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad
        :param str auto_pad_token: 自动pad的内容
        :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False
            则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input,
            于此同时其他field不会被设置为input。默认值为True。
        :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。
        :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个<sep>。
            如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果
            传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]'].
        :return:
        """
        if isinstance(set_input, str):
            set_input = [set_input]
        if isinstance(set_target, str):
            set_target = [set_target]
        if isinstance(set_input, bool):
            auto_set_input = set_input
        else:
            auto_set_input = False
        if isinstance(set_target, bool):
            auto_set_target = set_target
        else:
            auto_set_target = False
        if isinstance(paths, str):
            if os.path.isdir(paths):
                path = {
                    n: os.path.join(paths, self.paths[n])
                    for n in self.paths.keys()
                }
            else:
                path = {
                    dataset_name if dataset_name is not None else 'train':
                    paths
                }
        else:
            path = paths

        data_info = DataBundle()
        for data_name in path.keys():
            data_info.datasets[data_name] = self._load(path[data_name])

        for data_name, data_set in data_info.datasets.items():
            if auto_set_input:
                data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
            if auto_set_target:
                if Const.TARGET in data_set.get_field_names():
                    data_set.set_target(Const.TARGET)

        if to_lower:
            for data_name, data_set in data_info.datasets.items():
                data_set.apply(
                    lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                    new_field_name=Const.INPUTS(0),
                    is_input=auto_set_input)
                data_set.apply(
                    lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                    new_field_name=Const.INPUTS(1),
                    is_input=auto_set_input)

        if bert_tokenizer is not None:
            if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
                PRETRAIN_URL = _get_base_url('bert')
                model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
                model_url = PRETRAIN_URL + model_name
                model_dir = cached_path(model_url)
                # 检查是否存在
            elif os.path.isdir(bert_tokenizer):
                model_dir = bert_tokenizer
            else:
                raise ValueError(
                    f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

            words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
            with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
                lines = f.readlines()
            lines = [line.strip() for line in lines]
            words_vocab.add_word_lst(lines)
            words_vocab.build_vocab()

            tokenizer = BertTokenizer.from_pretrained(model_dir)

            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x: tokenizer.tokenize(' '.join(x[fields])),
                            new_field_name=fields,
                            is_input=auto_set_input)

        if isinstance(concat, bool):
            concat = 'default' if concat else None
        if concat is not None:
            if isinstance(concat, str):
                CONCAT_MAP = {
                    'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                    'default': ['', '<sep>', '', '']
                }
                if concat.lower() in CONCAT_MAP:
                    concat = CONCAT_MAP[concat]
                else:
                    concat = 4 * [concat]
            assert len(concat) == 4, \
                f'Please choose a list with 4 symbols which at the beginning of first sentence ' \
                f'the end of first sentence, the begin of second sentence, and the end of second' \
                f'sentence. Your input is {concat}'

            for data_name, data_set in data_info.datasets.items():
                data_set.apply(
                    lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[
                        1]] + [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                    new_field_name=Const.INPUT)
                data_set.apply(
                    lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                    new_field_name=Const.INPUT,
                    is_input=auto_set_input)

        if seq_len_type is not None:
            if seq_len_type == 'seq_len':  #
                for data_name, data_set in data_info.datasets.items():
                    for fields in data_set.get_field_names():
                        if Const.INPUT in fields:
                            data_set.apply(lambda x: len(x[fields]),
                                           new_field_name=fields.replace(
                                               Const.INPUT, Const.INPUT_LEN),
                                           is_input=auto_set_input)
            elif seq_len_type == 'mask':
                for data_name, data_set in data_info.datasets.items():
                    for fields in data_set.get_field_names():
                        if Const.INPUT in fields:
                            data_set.apply(lambda x: [1] * len(x[fields]),
                                           new_field_name=fields.replace(
                                               Const.INPUT, Const.INPUT_LEN),
                                           is_input=auto_set_input)
            elif seq_len_type == 'bert':
                for data_name, data_set in data_info.datasets.items():
                    if Const.INPUT not in data_set.get_field_names():
                        raise KeyError(
                            f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                            f'got {data_set.get_field_names()}')
                    data_set.apply(lambda x: [0] *
                                   (len(x[Const.INPUTS(0)]) + 2) + [1] *
                                   (len(x[Const.INPUTS(1)]) + 1),
                                   new_field_name=Const.INPUT_LENS(0),
                                   is_input=auto_set_input)
                    data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                                   new_field_name=Const.INPUT_LENS(1),
                                   is_input=auto_set_input)

        if auto_pad_length is not None:
            cut_text = min(
                auto_pad_length,
                cut_text if cut_text is not None else auto_pad_length)

        if cut_text is not None:
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if (Const.INPUT
                            in fields) or ((Const.INPUT_LEN in fields) and
                                           (seq_len_type != 'seq_len')):
                        data_set.apply(lambda x: x[fields][:cut_text],
                                       new_field_name=fields,
                                       is_input=auto_set_input)

        data_set_list = [d for n, d in data_info.datasets.items()]
        assert len(data_set_list) > 0, f'There are NO data sets in data info!'

        if bert_tokenizer is None:
            words_vocab = Vocabulary(padding=auto_pad_token)
            words_vocab = words_vocab.from_dataset(
                *[d for n, d in data_info.datasets.items() if 'train' in n],
                field_name=[
                    n for n in data_set_list[0].get_field_names()
                    if (Const.INPUT in n)
                ],
                no_create_entry_dataset=[
                    d for n, d in data_info.datasets.items()
                    if 'train' not in n
                ])
        target_vocab = Vocabulary(padding=None, unknown=None)
        target_vocab = target_vocab.from_dataset(
            *[d for n, d in data_info.datasets.items() if 'train' in n],
            field_name=Const.TARGET)
        data_info.vocabs = {
            Const.INPUT: words_vocab,
            Const.TARGET: target_vocab
        }

        if get_index:
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x:
                            [words_vocab.to_index(w) for w in x[fields]],
                            new_field_name=fields,
                            is_input=auto_set_input)

                if Const.TARGET in data_set.get_field_names():
                    data_set.apply(
                        lambda x: target_vocab.to_index(x[Const.TARGET]),
                        new_field_name=Const.TARGET,
                        is_input=auto_set_input,
                        is_target=auto_set_target)

        if auto_pad_length is not None:
            if seq_len_type == 'seq_len':
                raise RuntimeError(
                    f'the sequence will be padded with the length {auto_pad_length}, '
                    f'so the seq_len_type cannot be `{seq_len_type}`!')
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x: x[fields] +
                            [words_vocab.to_index(words_vocab.padding)] *
                            (auto_pad_length - len(x[fields])),
                            new_field_name=fields,
                            is_input=auto_set_input)
                    elif (Const.INPUT_LEN
                          in fields) and (seq_len_type != 'seq_len'):
                        data_set.apply(lambda x: x[fields] + [0] *
                                       (auto_pad_length - len(x[fields])),
                                       new_field_name=fields,
                                       is_input=auto_set_input)

        for data_name, data_set in data_info.datasets.items():
            if isinstance(set_input, list):
                data_set.set_input(*[
                    inputs for inputs in set_input
                    if inputs in data_set.get_field_names()
                ])
            if isinstance(set_target, list):
                data_set.set_target(*[
                    target for target in set_target
                    if target in data_set.get_field_names()
                ])

        return data_info
示例#13
0
    def process(self, paths, bigrams=False, trigrams=False):
        """

        :param paths:
        :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d<eos>]
        :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
        :return: DataBundle
            包含以下的fields
                raw_chars: List[str]
                chars: List[int]
                seq_len: int, 字的长度
                bigrams: List[int], optional
                trigrams: List[int], optional
                target: List[int]
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            if bigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2
                    for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])
                ],
                                    field_name='raw_chars',
                                    new_field_name='bigrams')

            if trigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2 + c3
                    for c1, c2, c3 in zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                          raw_chars[2:] + ['<eos>'] * 2)
                ],
                                    field_name='raw_chars',
                                    new_field_name='trigrams')
            data.datasets[name] = dataset

        char_vocab = Vocabulary().from_dataset(
            data.datasets['train'],
            field_name='raw_chars',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        char_vocab.index_dataset(*data.datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name=Const.CHAR_INPUT)
        data.vocabs[Const.CHAR_INPUT] = char_vocab

        target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(
            data.datasets['train'], field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        if bigrams:
            bigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='bigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data.datasets.values(),
                                       field_name='bigrams',
                                       new_field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            input_fields.append('bigrams')

        if trigrams:
            trigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='trigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            trigram_vocab.index_dataset(*data.datasets.values(),
                                        field_name='trigrams',
                                        new_field_name='trigrams')
            data.vocabs['trigrams'] = trigram_vocab
            input_fields.append('trigrams')

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
示例#14
0
    def process(self,
                paths,
                vocab_size,
                vocab_path,
                sent_max_len,
                doc_max_timesteps,
                domain=False,
                tag=False,
                load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param vocab_size: int  max_size for vocab
        :param vocab_path: str  vocab path
        :param sent_max_len: int    max token number of the sentence
        :param doc_max_timesteps: int   max sentence number of the document
        :param domain: bool  build vocab for publication, use 'X' for unknown
        :param tag: bool  build vocab for tag, use 'X' for unknown
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """
        def _pad_sent(text_wd):
            pad_text_wd = []
            for sent_wd in text_wd:
                if len(sent_wd) < sent_max_len:
                    pad_num = sent_max_len - len(sent_wd)
                    sent_wd.extend([WORD_PAD] * pad_num)
                else:
                    sent_wd = sent_wd[:sent_max_len]
                pad_text_wd.append(sent_wd)
            return pad_text_wd

        def _token_mask(text_wd):
            token_mask_list = []
            for sent_wd in text_wd:
                token_num = len(sent_wd)
                if token_num < sent_max_len:
                    mask = [1] * token_num + [0] * (sent_max_len - token_num)
                else:
                    mask = [1] * sent_max_len
                token_mask_list.append(mask)
            return token_mask_list

        def _pad_label(label):
            text_len = len(label)
            if text_len < doc_max_timesteps:
                pad_label = label + [0] * (doc_max_timesteps - text_len)
            else:
                pad_label = label[:doc_max_timesteps]
            return pad_label

        def _pad_doc(text_wd):
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                padding = [WORD_PAD] * sent_max_len
                pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
            else:
                pad_text = text_wd[:doc_max_timesteps]
            return pad_text

        def _sent_mask(text_wd):
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                sent_mask = [1] * text_len + [0] * (doc_max_timesteps -
                                                    text_len)
            else:
                sent_mask = [1] * doc_max_timesteps
            return sent_mask

        datasets = {}
        train_ds = None
        for key, value in paths.items():
            ds = self.load(value)
            # pad sent
            ds.apply(lambda x: _pad_sent(x["text_wd"]),
                     new_field_name="pad_text_wd")
            ds.apply(lambda x: _token_mask(x["text_wd"]),
                     new_field_name="pad_token_mask")
            # pad document
            ds.apply(lambda x: _pad_doc(x["pad_text_wd"]),
                     new_field_name="pad_text")
            ds.apply(lambda x: _sent_mask(x["pad_text_wd"]),
                     new_field_name="seq_len")
            ds.apply(lambda x: _pad_label(x["flatten_label"]),
                     new_field_name="pad_label")

            # rename field
            ds.rename_field("pad_text", Const.INPUT)
            ds.rename_field("seq_len", Const.INPUT_LEN)
            ds.rename_field("pad_label", Const.TARGET)

            # set input and target
            ds.set_input(Const.INPUT, Const.INPUT_LEN)
            ds.set_target(Const.TARGET, Const.INPUT_LEN)

            datasets[key] = ds
            if "train" in key:
                train_ds = datasets[key]

        vocab_dict = {}
        if load_vocab_file == False:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds == None:
                raise ValueError("Lack train file to build vocabulary!")

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=WORD_PAD,
                                unknown=WORD_UNK)
            vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
            vocab_dict["vocab"] = vocabs
        else:
            logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
            word_list = []
            with open(vocab_path, 'r', encoding='utf8') as vocab_f:
                cnt = 2  # pad and unk
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break
            vocabs = Vocabulary(max_size=vocab_size,
                                padding=WORD_PAD,
                                unknown=WORD_UNK)
            vocabs.add_word_lst(word_list)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        if domain == True:
            domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
            domaindict.from_dataset(train_ds, field_name="publication")
            vocab_dict["domain"] = domaindict
        if tag == True:
            tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
            tagdict.from_dataset(train_ds, field_name="tag")
            vocab_dict["tag"] = tagdict

        for ds in datasets.values():
            vocab_dict["vocab"].index_dataset(ds,
                                              field_name=Const.INPUT,
                                              new_field_name=Const.INPUT)

        return DataBundle(vocabs=vocab_dict, datasets=datasets)