Example #1
    def process(self,
                paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False):
        paths = check_dataloader_paths(paths)
        info = DataInfo(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name] for name in train_ds] \
            if train_ds else info.datasets.values()

        def wordtochar(words):
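            # e.g. ['New', 'York'] -> ['n', 'e', 'w', '', 'y', 'o', 'r', 'k']
            # (lower-cased characters, with an empty string marking each word
            # boundary; the trailing separator is popped below)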
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(),
                                    field_name=input_name,
                                    new_field_name=input_name)
            info.vocabs[input_name] = src_vocab
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)

        info.vocabs[target_name] = tgt_vocab

        info.datasets['train'], info.datasets['dev'] = info.datasets[
            'train'].split(0.1, shuffle=False)

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
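
A minimal usage sketch for Example #1. The loader class name and file paths below are assumptions, not part of the snippet; this process signature matches an SST-style loader in fastNLP:

loader = SSTLoader()  # hypothetical instantiation
paths = {'train': '/path/to/sst/train.txt', 'test': '/path/to/sst/test.txt'}
info = loader.process(paths, char_level_op=False)
print(info.vocabs['words'])   # source vocab, built only when char_level_op is False
print(info.datasets['dev'])   # dev set carved out of 'train' by split(0.1)
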
Example #2
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(
            0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
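
Example #2 additionally threads pretrained word vectors through src_embed_opt. A hedged sketch of that path, reusing the hypothetical loader from the sketch after Example #1; the import location and the embed_filepath field are assumptions about the fastNLP version in use:

from fastNLP.io.embed_loader import EmbeddingOption  # import path may vary by version

embed_opt = EmbeddingOption(embed_filepath='/path/to/glove.txt')  # hypothetical file
info = loader.process({'train': 'train.txt', 'test': 'test.txt'},
                      src_embed_opt=embed_opt)
# info.embeddings['words'] is an embedding matrix row-aligned with info.vocabs['words']
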
Example #3
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
        
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #4
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否将所有字母转为小写。
        :return:
        """
        # load the data
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # vocab over the original-case (cap) words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
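
The no_create_entry_dataset argument used above is what keeps dev/test-only tokens from becoming trainable vocabulary entries while still letting them be indexed. A toy sketch of the effect; the datasets are made up, and the calls mirror the ones in the example:

from fastNLP import DataSet, Vocabulary

train_ds = DataSet({'words': [['this', 'movie', 'rocks']]})
dev_ds = DataSet({'words': [['unseen', 'movie']]})

vocab = Vocabulary()
vocab.from_dataset(train_ds, field_name='words',
                   no_create_entry_dataset=[dev_ds])
vocab.index_dataset(train_ds, dev_ds, field_name='words')
# 'unseen' still receives an index so dev_ds can be encoded, but it is
# flagged as a no-create-entry word, which pretrained embedding layers
# treat like an unknown word at training time.
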
Example #5
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = True) -> DataBundle:
        """
        读取并处理数据。返回的DataInfo包含以下的内容
            vocabs:
                word: Vocabulary
                target: Vocabulary
            datasets:
                train: DataSet
                    words: List[int], 被设置为input
                    target: int. label,被同时设置为input和target
                    seq_len: int. 句子的长度,被同时设置为input和target
                    raw_words: List[str]
                xxx(根据传入的paths可能有所变化)

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否使用小写
        :return:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # vocab over the original-case (cap) words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(*data.datasets.values(),
                                    field_name='raw_words')
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
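
A sketch of consuming the returned DataBundle. The loader instantiation and paths are placeholders; Const.INPUT and Const.TARGET are fastNLP's 'words' and 'target' field names:

from fastNLP.core.const import Const  # import path may vary by version

data = loader.process({'train': 'train.txt', 'dev': 'dev.txt'}, lower=True)
train_set = data.datasets['train']
word_vocab = data.vocabs[Const.INPUT]
label_vocab = data.vocabs[Const.TARGET]
print(len(word_vocab), train_set[0][Const.INPUT])  # vocab size, indexed words of one instance
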
Example #6
    def process(self, paths):
        """
        
        :param paths: 
        :return:
            Dataset包含以下的field
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                gold_word_pairs:
                seg_targets:
                seg_masks:
                char_labels:
                char_heads:
                pun_masks:
                gold_label_word_pairs:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process_dataset(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
            dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
            dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count) == 0:
                char_label_vocab.from_dataset(dataset, field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
            # bind the label vocab locally instead of rebinding the module-level global
            labeled_pair_fn = partial(add_label_word_pairs, label_vocab=char_label_vocab)
            new_dataset.apply(labeled_pair_fn, new_field_name='gold_label_word_pairs', ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process_dataset(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars')
        bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams')
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams')

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                               [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens',
                              'char_labels', 'char_heads', 'pre_chars',
                              'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets',
                               'seg_masks', 'char_labels', 'char_heads',
                               'pun_masks', 'gold_label_word_pairs')

        return data
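
Note the split between the chars/bigrams/trigrams vocabularies and their pre_* counterparts above: the min_freq vocabs feed randomly initialized embeddings, while each pre_* vocab is built purely through no_create_entry_dataset, the usual fastNLP pattern for a vocabulary meant to be matched against pretrained embeddings. Both indexings end up side by side on every instance; a sketch, assuming a bundle returned by this method:

ds = data.datasets['train']
print(ds[0]['chars'][:5])      # indices under the min_freq=2 training vocab
print(ds[0]['pre_chars'][:5])  # indices under the pretrained-style vocab
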
Example #7
    def process(self,
                paths: Union[str, Dict[str, str]],
                char_vocab_opt: VocabularyOption = None,
                char_embed_opt: EmbeddingOption = None,
                bigram_vocab_opt: VocabularyOption = None,
                bigram_embed_opt: EmbeddingOption = None,
                L: int = 4):
        """
        支持的数据格式为一行一个sample,并且用空格隔开不同的词语。例如

        Option::

            共同  创造  美好  的  新  世纪  ——  二○○一年  新年  贺词
            (  二○○○年  十二月  三十一日  )  (  附  图片  1  张  )
            女士  们  ,  先生  们  ,  同志  们  ,  朋友  们  :

        paths支持两种格式,第一种是str,第二种是Dict[str, str].

        Option::

            # 1. str类型
            # 1.1 传入具体的文件路径
            data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # 将读取data.txt的内容
            # 包含以下的内容data.vocabs['chars']:Vocabulary对象,
            #             data.vocabs['target']: Vocabulary对象,根据encoding_type可能会没有该值
            #             data.embeddings['chars']: Embedding对象. 只有提供了预训练的词向量的路径才有该项
            #             data.datasets['train']: DataSet对象
            #                   包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            # 1.2 传入一个目录, 里面必须包含train.txt文件
            data = SigHanLoader('bmes').process('path/to/cws/') #将尝试在该目录下读取 train.txt, test.txt以及dev.txt
            # 包含以下的内容data.vocabs['chars']: Vocabulary对象
            #             data.vocabs['target']:Vocabulary对象
            #             data.embeddings['chars']: 仅在提供了预训练embedding路径的情况下,为Embedding对象;
            #             data.datasets['train']: DataSet对象
            #                    包含的field有:
            #                       raw_chars: list[str], 每个元素是一个汉字
            #                       chars: list[int], 每个元素是汉字对应的index
            #                       target: list[int], 根据encoding_type有对应的变化
            #             data.datasets['dev']: DataSet对象,如果文件夹下包含了dev.txt;内容与data.datasets['train']一样

            # 2. dict类型, key是文件的名称,value是对应的读取路径. 必须包含'train'这个key
            paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'}
            data = SigHanLoader(paths).process(paths)
            # 结果与传入目录时是一致的,但是可以传入多个数据集。data.datasets中的key将与这里传入的一致

        :param paths: 支持传入目录,文件路径,以及dict。
        :param char_vocab_opt: 用于构建chars的vocabulary参数,默认为min_freq=2
        :param char_embed_opt: 用于读取chars的Embedding的参数,默认不读取pretrained的embedding
        :param bigram_vocab_opt: 用于构建bigram的vocabulary参数,默认不使用bigram, 仅在指定该参数的情况下会带有bigrams这个field。
            为List[int], 每个instance长度与chars一样, abcde的bigram为ab bc cd de e<eos>
        :param bigram_embed_opt: 用于读取预训练bigram的参数,仅在传入bigram_vocab_opt有效
        :param L: 当target_type为shift_relay时传入的segment长度
        :return:
        """
        # it is recommended to validate paths with check_dataloader_paths
        paths = check_dataloader_paths(paths)
        datasets = {}
        data = DataBundle()
        bigram = bigram_vocab_opt is not None
        for name, path in paths.items():
            dataset = self.load(path, bigram=bigram)
            datasets[name] = dataset
        input_fields = []
        target_fields = []
        # build the char vocab
        char_vocab = Vocabulary(
            min_freq=2) if char_vocab_opt is None else Vocabulary(
                **char_vocab_opt)
        char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
        char_vocab.index_dataset(*datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name='chars')
        data.vocabs[Const.CHAR_INPUT] = char_vocab
        input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
        target_fields.append(Const.TARGET)
        # build the target vocab
        if self.target_type == 'bmes':
            target_vocab = Vocabulary(unknown=None, padding=None)
            target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 +
                                      ['S'])
            target_vocab.index_dataset(*datasets.values(), field_name='target')
            data.vocabs[Const.TARGET] = target_vocab
        if char_embed_opt is not None:
            char_embed = EmbedLoader.load_with_vocab(**char_embed_opt,
                                                     vocab=char_vocab)
            data.embeddings['chars'] = char_embed
        if bigram:
            bigram_vocab = Vocabulary(**bigram_vocab_opt)
            bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
            bigram_vocab.index_dataset(*datasets.values(),
                                       field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            if bigram_embed_opt is not None:
                bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt,
                                                           vocab=bigram_vocab)
                data.embeddings['bigrams'] = bigram_embed
            input_fields.append('bigrams')
        if self.target_type == 'shift_relay':
            func = partial(self._clip_target, L=L)
            for name, dataset in datasets.items():
                res = dataset.apply_field(func, field_name='target')
                relay_target = [res_i[0] for res_i in res]
                relay_mask = [res_i[1] for res_i in res]
                dataset.add_field('relay_target',
                                  relay_target,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
                dataset.add_field('relay_mask',
                                  relay_mask,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
            input_fields.append('end_seg_mask')
            target_fields.append('start_seg_mask')
        # add the datasets to the DataBundle
        for name, dataset in datasets.items():
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)
            data.datasets[name] = dataset

        return data
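
Building on the docstring's usage examples, a sketch of enabling bigrams. The option values and the import path are assumptions; VocabularyOption fields can differ across fastNLP versions:

from fastNLP.core.vocabulary import VocabularyOption  # import path may vary by version

data = SigHanLoader('bmes').process(paths, bigram_vocab_opt=VocabularyOption(min_freq=5))
# data.datasets['train'] now also carries a 'bigrams' input field and
# data.vocabs['bigrams'] holds its vocabulary; pass bigram_embed_opt as well
# to load pretrained bigram vectors into data.embeddings['bigrams'].
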
Example #8
    def process(self, paths, bigrams=False, trigrams=False):
        """

        :param paths:
        :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d<eos>]
        :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
        :return: DataBundle
            包含以下的fields
                raw_chars: List[str]
                chars: List[int]
                seq_len: int, 字的长度
                bigrams: List[int], optional
                trigrams: List[int], optional
                target: List[int]
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            if bigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2
                    for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])
                ],
                                    field_name='raw_chars',
                                    new_field_name='bigrams')

            if trigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2 + c3
                    for c1, c2, c3 in zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                          raw_chars[2:] + ['<eos>'] * 2)
                ],
                                    field_name='raw_chars',
                                    new_field_name='trigrams')
            data.datasets[name] = dataset

        char_vocab = Vocabulary().from_dataset(
            data.datasets['train'],
            field_name='raw_chars',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        char_vocab.index_dataset(*data.datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name=Const.CHAR_INPUT)
        data.vocabs[Const.CHAR_INPUT] = char_vocab

        target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(
            data.datasets['train'], field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        if bigrams:
            bigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='bigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data.datasets.values(),
                                       field_name='bigrams',
                                       new_field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            input_fields.append('bigrams')

        if trigrams:
            trigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='trigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            trigram_vocab.index_dataset(*data.datasets.values(),
                                        field_name='trigrams',
                                        new_field_name='trigrams')
            data.vocabs['trigrams'] = trigram_vocab
            input_fields.append('trigrams')

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
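
The zip tricks in the two apply_field lambdas pad the tail of the sequence with '<eos>' so that every character yields exactly one n-gram. A standalone check of the construction:

raw_chars = ['a', 'b', 'c', 'd']
bigrams = [c1 + c2 for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])]
trigrams = [c1 + c2 + c3
            for c1, c2, c3 in zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                  raw_chars[2:] + ['<eos>'] * 2)]
print(bigrams)   # ['ab', 'bc', 'cd', 'd<eos>']
print(trigrams)  # ['abc', 'bcd', 'cd<eos>', 'd<eos><eos>']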