Пример #1
0
    def process(self, data_bundle: DataBundle):
        """
        Run the character-level preprocessing steps on ``data_bundle``.

        Steps, in order: ``_add_chars_field`` is called with ``lower=False``;
        the target field is rewritten through ``self.encoding_func``; every
        digit character in the char-input field is replaced by ``'0'``; when
        ``self.bigrams`` is truthy a ``'bigrams'`` field is derived from the
        char-input field; the collected fields are handed to ``_indexize``;
        ``add_seq_len`` is called on every dataset; finally the input and
        target flags are set.

        :param data_bundle: the bundle to process.
        :return: the ``data_bundle`` object that was passed in.
        """
        def _zero_out_digits(tokens):
            # Rebuild each token character by character, digits becoming '0'.
            return [''.join('0' if ch.isdigit() else ch for ch in token)
                    for token in tokens]

        def _adjacent_pairs(tokens):
            # Glue each token to its successor; the last one is glued to '<eos>'.
            return [left + right
                    for left, right in zip(tokens, tokens[1:] + ['<eos>'])]

        _add_chars_field(data_bundle, lower=False)

        data_bundle.apply_field(self.encoding_func,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        # Normalise all digits to 0.
        data_bundle.apply_field(_zero_out_digits,
                                field_name=Const.CHAR_INPUT,
                                new_field_name=Const.CHAR_INPUT)

        # Collect the names of the fields that act as model inputs.
        input_field_names = [Const.CHAR_INPUT]
        if self.bigrams:
            data_bundle.apply_field(_adjacent_pairs,
                                    field_name=Const.CHAR_INPUT,
                                    new_field_name='bigrams')
            input_field_names.append('bigrams')

        # index
        _indexize(data_bundle,
                  input_field_names=input_field_names,
                  target_field_names=Const.TARGET)

        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(Const.CHAR_INPUT)

        shared_fields = [Const.TARGET, Const.INPUT_LEN]
        data_bundle.set_input(*(shared_fields + input_field_names))
        data_bundle.set_target(*shared_fields)

        return data_bundle
Пример #2
0
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        Run the word-level preprocessing steps on ``data_bundle``.

        The target field of every dataset is rewritten through
        ``self.convert_tag``; ``_add_words_field`` is called with
        ``lower=self.lower``; when ``self.word_shape`` is truthy a
        ``'word_shapes'`` field is derived from ``'raw_words'`` and flagged as
        input; digits in the input field are replaced by ``'0'``; the bundle is
        passed to ``_indexize``; ``add_seq_len`` is called on every dataset;
        finally the input and target flags are set.

        :param data_bundle: the bundle to process.
        :return: the ``data_bundle`` object that was passed in.
        """
        def _zero_out_digits(tokens):
            # Rebuild each token character by character, digits becoming '0'.
            return [''.join('0' if ch.isdigit() else ch for ch in token)
                    for token in tokens]

        # Convert the tag scheme of each dataset.
        for dataset in data_bundle.datasets.values():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # Normalise all digits to 0.
        data_bundle.apply_field(_zero_out_digits,
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # index
        _indexize(data_bundle)

        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(Const.TARGET, Const.INPUT, Const.INPUT_LEN)
        data_bundle.set_target(Const.TARGET, Const.INPUT_LEN)
        return data_bundle
Пример #3
0
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        Process DataSets whose fields look like

        .. csv-table::
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"

        :param ~fastNLP.DataBundle data_bundle: every DataSet in the bundle must
            provide the ``raw_words`` field and the tag field read through
            ``Const.TARGET``, both holding List[str]. The bundle that is passed
            in is the one that gets modified.
        :return DataBundle: the ``data_bundle`` object that was passed in.
        """
        def _zero_out_digits(tokens):
            # Rebuild each token character by character, digits becoming '0'.
            return [''.join('0' if ch.isdigit() else ch for ch in token)
                    for token in tokens]

        # Convert the tags.
        for dataset in data_bundle.datasets.values():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # Normalise all digits to 0.
        data_bundle.apply_field(_zero_out_digits,
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # index
        _indexize(data_bundle)

        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(Const.TARGET, Const.INPUT, Const.INPUT_LEN)
        data_bundle.set_target(Const.TARGET, Const.INPUT_LEN)

        return data_bundle
Пример #4
0
    def process(self, data_bundle):
        """
        Run the character-level preprocessing steps on ``data_bundle``.

        The raw character field is copied to the char-input field. When
        ``self.bigram`` is truthy, a ``'bigrams'`` field is derived for every
        dataset, a dedicated ``Vocabulary`` is built from the ``'train'``
        dataset (the remaining datasets are passed as
        ``no_create_entry_dataset``), used to index ``'bigrams'`` in all
        datasets and registered on the bundle. Afterwards ``_add_chars_field``
        and ``_indexize`` are called, the target pad value is set to
        ``self.target_pad_val``, ``add_seq_len`` is called on every dataset and
        the input and target flags are set.

        :param data_bundle: the bundle to process.
        :return: the ``data_bundle`` object that was passed in.
        """
        def _adjacent_pairs(tokens):
            # Glue each token to its successor; the last one is glued to '<eos>'.
            return [left + right
                    for left, right in zip(tokens, tokens[1:] + ['<eos>'])]

        data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)

        input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
        target_fields = [C.TARGET, C.INPUT_LEN]

        if self.bigram:
            for ds in data_bundle.datasets.values():
                ds.apply_field(_adjacent_pairs,
                               field_name=C.CHAR_INPUT,
                               new_field_name='bigrams')

            # Only the training split is used to build the vocabulary; the other
            # splits are supplied as no_create_entry datasets.
            train_set = data_bundle.get_dataset('train')
            other_sets = [
                ds for split, ds in data_bundle.datasets.items()
                if split != 'train'
            ]
            bigram_vocab = Vocabulary()
            bigram_vocab.from_dataset(train_set,
                                      field_name='bigrams',
                                      no_create_entry_dataset=other_sets)
            bigram_vocab.index_dataset(*data_bundle.datasets.values(),
                                       field_name='bigrams')
            data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
            input_fields.append('bigrams')

        _add_chars_field(data_bundle, lower=False)

        # index
        _indexize(data_bundle,
                  input_field_names=C.CHAR_INPUT,
                  target_field_names=C.TARGET)

        for ds in data_bundle.datasets.values():
            ds.set_pad_val(C.TARGET, self.target_pad_val)
            ds.add_seq_len(C.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
Пример #5
0
 def process(self, data_bundle):
     """
     Index the bundle and attach the aspect-related fields.

     ``_add_words_field`` and ``_indexize`` (over the input field, ``"pos"``
     and ``"deprel"``) are applied first. For every dataset ``add_seq_len`` is
     called and the ``"asp"`` field is indexed into ``"aspect"`` with the
     vocabulary registered for the input field. ``self._get_post`` and
     ``self._get_mask`` then produce the ``"post"`` and ``"aspmask"`` fields,
     and the input and target flags are set.

     :param data_bundle: the bundle to process.
     :return: the bundle returned by ``_indexize`` after the steps above.
     """
     data_bundle = _add_words_field(data_bundle)

     indexed_inputs = [Const.INPUT, "pos", "deprel"]
     data_bundle = _indexize(data_bundle=data_bundle,
                             input_field_names=indexed_inputs)

     for dataset in data_bundle.datasets.values():
         dataset.add_seq_len(Const.INPUT)
         # The aspect terms are indexed with the same vocabulary as the input.
         word_vocab = data_bundle.get_vocab(Const.INPUT)
         word_vocab.index_dataset(dataset,
                                  field_name="asp",
                                  new_field_name="aspect")

     data_bundle.apply(self._get_post, new_field_name="post")
     data_bundle.apply(self._get_mask, new_field_name="aspmask")

     model_inputs = (Const.INPUT, Const.INPUT_LEN, "pos", "dephead", "deprel",
                     "aspect", "fidx", "tidx", "post", "aspmask")
     data_bundle.set_input(*model_inputs)
     data_bundle.set_target(Const.TARGET)
     return data_bundle
Пример #6
0
    def process(self, data_bundle: DataBundle):
        """
        Process DataSets that provide the following fields

        .. csv-table::
            :header: "raw_chars", "target"

            "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
            "...", "..."

        The tag map is derived from the tags found in the bundle itself (an
        identity mapping), the ``raw_chars`` field is tokenized into ``chars``,
        optional ``bigrams`` / ``trigrams`` fields are derived, every input
        field is indexed, the sequence length is added and the input / target
        flags are set.

        :param data_bundle: the bundle to process.
        :return: the processed bundle.
        """
        # The tag map is no longer fixed: it is built from the tags that
        # actually occur in the data, each tag mapping to itself.
        targets_vocabs = get_data_bundle_tags(data_bundle)
        self.tag_map = {tag_name: tag_name for tag_name in targets_vocabs}
        data_bundle = self._granularize(data_bundle=data_bundle,
                                        tag_map=self.tag_map)

        # CWS (tokenize)
        data_bundle = self._tokenize(data_bundle=data_bundle,
                                     field_name='raw_chars',
                                     new_field_name='chars')
        input_field_names = [Const.CHAR_INPUT]

        # n-grams: the tail of the sequence is padded with '<eos>' so that the
        # n-gram fields keep the same length as the character field.
        if self.bigrams:
            for _, dataset in data_bundle.iter_datasets():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='bigrams')
            input_field_names.append('bigrams')
        if self.trigrams:
            for _, dataset in data_bundle.iter_datasets():
                dataset.apply_field(
                    lambda chars: [
                        c1 + c2 + c3 for c1, c2, c3 in zip(
                            chars,
                            chars[1:] + ['<eos>'],
                            chars[2:] + ['<eos>'] * 2)
                    ],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='trigrams')
            input_field_names.append('trigrams')

        # index: every input field has to be indexed, not only the characters.
        # Previously only Const.CHAR_INPUT was passed, which left 'bigrams' and
        # 'trigrams' as raw strings even though they are flagged as inputs.
        data_bundle = _indexize(data_bundle=data_bundle,
                                input_field_names=input_field_names)

        # add length
        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(field_name=Const.CHAR_INPUT,
                                new_field_name=Const.INPUT_LEN)

        # The target is deliberately not part of the input fields.
        input_fields = [Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET]

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
Пример #7
0
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        Process DataSets that contain a ``raw_words`` column

        .. csv-table::
           :header: "raw_words"

           "上海 浦东 开发 与 法制 建设 同步"
           "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )"
           "..."

        The raw words are copied to the char-input field, optionally rewritten
        by the alpha / digit span replacers, tokenized, turned into relay
        targets and segment masks, flattened into a single character list,
        optionally extended with bigrams, indexed, clipped with
        ``_clip_target`` and finally flagged as input / target.

        :param data_bundle: the bundle to process.
        :return: the ``data_bundle`` object that was passed in.
        """
        data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT)

        if self.replace_num_alpha:
            data_bundle.apply_field(_find_and_replace_alpha_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)
            data_bundle.apply_field(_find_and_replace_digit_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)

        self._tokenize(data_bundle)

        # Field names are registered once, outside the per-dataset loops, so
        # they are not duplicated for every dataset in the bundle.
        input_field_names = [Const.CHAR_INPUT, 'end_seg_mask']
        target_field_names = ['start_seg_mask']
        # Fields handed to _indexize; 'bigrams' is only added when it exists.
        index_field_names = ['chars']

        for dataset in data_bundle.datasets.values():
            # All three derived fields are computed from the per-word lengths,
            # so they must be produced before the words are flattened below.
            dataset.apply_field(
                lambda chars: _word_lens_to_relay(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name=Const.TARGET)
            dataset.apply_field(
                lambda chars: _word_lens_to_start_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='start_seg_mask')
            dataset.apply_field(
                lambda chars: _word_lens_to_end_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='end_seg_mask')
            # Flatten the list of words into one flat list of characters.
            dataset.apply_field(lambda chars: list(chain(*chars)),
                                field_name=Const.CHAR_INPUT,
                                new_field_name=Const.CHAR_INPUT)

        if self.bigrams:
            for dataset in data_bundle.datasets.values():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='bigrams')
            input_field_names.append('bigrams')
            index_field_names.append('bigrams')

        # Previously 'bigrams' was always passed here, even when
        # self.bigrams is false and the field had never been created.
        _indexize(data_bundle, index_field_names, [])

        func = partial(_clip_target, L=self.L)
        for dataset in data_bundle.datasets.values():
            # Each result is indexed as a pair: [0] relay target, [1] relay mask.
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target',
                              relay_target,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
            dataset.add_field('relay_mask',
                              relay_mask,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
        input_field_names.extend(['relay_target', 'relay_mask'])

        input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names
        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(Const.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle