Example #1
    def convert_texts_to_ids(self, batch_text):
        """将一个batch的明文text转成id
        :param batch_text:
        :return:
        """
        src_ids = []
        for text in batch_text:
            if self.field_config.need_convert:
                tokens = self.tokenizer.tokenize(text)
                src_id = self.tokenizer.convert_tokens_to_ids(tokens)
            else:
                # Input is already a sequence of ids, either as a
                # space-separated string or as an iterable of id strings.
                if isinstance(text, str):
                    text = text.split(" ")
                src_id = [int(i) for i in text]

            # Apply the truncation policy, reserving two slots for the
            # boundary tokens added below.
            if len(src_id) > self.field_config.max_seq_len - 2:
                src_id = truncation_words(src_id, self.field_config.max_seq_len - 2, self.field_config.truncation_type)
            # This variant marks both sequence boundaries with the unk token id
            # instead of dedicated [CLS]/[SEP] ids.
            unk_id = self.tokenizer.vocabulary.vocab_dict[self.field_config.tokenizer_info["unk_token"]]
            src_id.insert(0, unk_id)
            src_id.append(unk_id)
            src_ids.append(src_id)
        
        return_list = []
        padded_ids, mask_ids, batch_seq_lens = pad_batch_data(src_ids,
                                                              pad_idx=self.field_config.padding_id,
                                                              return_input_mask=True,
                                                              return_seq_lens=True)
        return_list.append(padded_ids)
        return_list.append(mask_ids)
        return_list.append(batch_seq_lens)

        return return_list
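Every example on this page delegates to a truncation_words helper that is not shown. A minimal sketch of what it presumably does, assuming truncation_type 0 keeps the head of the sequence and any other value keeps the tail (these semantics are an assumption, not confirmed by the source):

def truncation_words(words, max_seq_len, truncation_type):
    """Trim a token list to max_seq_len (sketch; semantics assumed).

    truncation_type == 0: keep the first max_seq_len tokens (head).
    otherwise:            keep the last max_seq_len tokens (tail).
    """
    if len(words) <= max_seq_len:
        return words
    if truncation_type == 0:
        return words[:max_seq_len]
    return words[-max_seq_len:]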
Example #2
    def convert_texts_to_ids(self, batch_text):
        """将一个batch的明文text转成id
        :param batch_text:
        :return:
        """
        src_ids = []
        for text in batch_text:
            if self.field_config.need_convert:
                tokens = self.tokenizer.tokenize(text)
                src_id = self.tokenizer.convert_tokens_to_ids(tokens)
            else:
                src_id = text.split(" ")

            # Apply the truncation policy
            if len(src_id) > self.field_config.max_seq_len:
                src_id = truncation_words(src_id, self.field_config.max_seq_len, self.field_config.truncation_type)
            src_ids.append(src_id)

        return_list = []
        padded_ids, batch_seq_lens = pad_batch_data(src_ids,
                                                    pad_idx=self.field_config.padding_id,
                                                    return_input_mask=False,
                                                    return_seq_lens=True)
        return_list.append(padded_ids)
        return_list.append(batch_seq_lens)

        return return_list
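pad_batch_data is also external. A minimal numpy sketch that matches every call site in these examples (parameter names are taken from the calls; the exact array layout of the real helper is an assumption):

import numpy as np

def pad_batch_data(insts, insts_data_type="int64", pad_idx=0,
                   return_input_mask=False, return_seq_lens=False):
    """Pad variable-length id lists to a dense [batch, max_len] array (sketch)."""
    max_len = max(len(inst) for inst in insts)
    padded = np.array(
        [list(inst) + [pad_idx] * (max_len - len(inst)) for inst in insts],
        dtype=insts_data_type)
    result = [padded]
    if return_input_mask:
        # 1.0 over real tokens, 0.0 over padding.
        mask = np.array(
            [[1.0] * len(inst) + [0.0] * (max_len - len(inst)) for inst in insts],
            dtype="float32")
        result.append(mask)
    if return_seq_lens:
        result.append(np.array([len(inst) for inst in insts], dtype="int64"))
    return result if len(result) > 1 else result[0]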
Example #3
    def convert_texts_to_ids(self, batch_text):
        """ 明文序列化
        :return: id_list
        """
        src_ids = []
        for text in batch_text:
            if self.tokenizer and self.field_config.need_convert:
                tokens = self.tokenizer.tokenize(text)
                src_id = self.tokenizer.convert_tokens_to_ids(tokens)
            else:
                src_id = text.split(" ")

            # Apply the truncation policy
            if len(src_id) > self.field_config.max_seq_len:
                src_id = truncation_words(src_id,
                                          self.field_config.max_seq_len,
                                          self.field_config.truncation_type)
            src_ids.append(src_id)

        data_type = "int64" if self.field_config.data_type == DataShape.INT else "float32"

        padded_ids, batch_seq_lens = pad_batch_data(
            src_ids,
            insts_data_type=data_type,
            pad_idx=self.field_config.padding_id,
            return_input_mask=False,
            return_seq_lens=True)
        return_list = []
        return_list.append(padded_ids)
        return_list.append(batch_seq_lens)
        return return_list
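Example #3 is the only variant that honors the field's declared data type: DataShape.INT fields are padded as int64 and everything else as float32. A hedged usage sketch (the reader object and its configuration are hypothetical stand-ins):

# Hypothetical call; `reader` wraps a Field with need_convert=False,
# max_seq_len=8, padding_id=0 and data_type=DataShape.INT.
padded_ids, seq_lens = reader.convert_texts_to_ids(["1 2 3", "4 5"])
# padded_ids -> [[1, 2, 3], [4, 5, 0]] as int64; seq_lens -> [3, 2]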
Example #4
def convert_texts_to_ids(batch_text_a, tokenizer=None, max_seq_len=512, truncation_type=0, padding_id=0):
    src_ids = []
    position_ids = []
    task_ids = []
    sentence_ids = []

    for text in batch_text_a:
        tokens_text = tokenizer.tokenize(text)
        # Apply the truncation policy, reserving two slots for [CLS] and [SEP]
        if len(tokens_text) > max_seq_len - 2:
            tokens_text = truncation_words(tokens_text, max_seq_len - 2, truncation_type)
        tokens = ["[CLS]"] + tokens_text + ["[SEP]"]
        src_id = tokenizer.convert_tokens_to_ids(tokens)

        src_ids.append(src_id)
        pos_id = list(range(len(src_id)))
        task_id = [0] * len(src_id)
        sentence_id = [0] * len(src_id)
        position_ids.append(pos_id)
        task_ids.append(task_id)
        sentence_ids.append(sentence_id)

    return_list = []
    padded_ids, input_mask, batch_seq_lens = pad_batch_data(src_ids,
                                                            pad_idx=padding_id,
                                                            return_input_mask=True,
                                                            return_seq_lens=True)
    sent_ids_batch = pad_batch_data(sentence_ids, pad_idx=padding_id)
    pos_ids_batch = pad_batch_data(position_ids, pad_idx=padding_id)
    task_ids_batch = pad_batch_data(task_ids, pad_idx=padding_id)

    return_list.append(padded_ids)  # append src_ids
    return_list.append(sent_ids_batch)  # append sent_ids
    return_list.append(pos_ids_batch)  # append pos_ids
    return_list.append(input_mask)  # append mask
    return_list.append(task_ids_batch)  # append task_ids
    return_list.append(batch_seq_lens)  # append seq_lens

    return return_list
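A hedged usage sketch for the standalone variant above. Note that tokenizer defaults to None but is dereferenced unconditionally, so a real tokenizer (anything exposing tokenize and convert_tokens_to_ids) must always be passed; my_tokenizer below is hypothetical:

features = convert_texts_to_ids(["hello world", "a longer example sentence"],
                                tokenizer=my_tokenizer,
                                max_seq_len=128,
                                truncation_type=0,
                                padding_id=0)
padded_ids, sent_ids, pos_ids, input_mask, task_ids, seq_lens = features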
Example #5
def convert_text_to_id(text, field_config):
    """将一个明文样本转换成id
    :param text: 明文文本
    :param field_config : Field类型
    :return:
    """
    if not text:
        raise ValueError("text input is None or empty")
    if not isinstance(field_config, Field):
        raise TypeError("field_config must be an instance of Field")

    if field_config.need_convert:
        tokenizer = field_config.tokenizer
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
    else:
        ids = text.split(" ")

    # Apply the truncation policy
    if len(ids) > field_config.max_seq_len:
        ids = truncation_words(ids, field_config.max_seq_len, field_config.truncation_type)

    return ids
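A hedged usage sketch for the single-sample helper; my_field stands in for a real, configured Field instance:

ids = convert_text_to_id("some plain text", my_field)  # list of token ids

# The guards above reject bad inputs early:
# convert_text_to_id("", my_field)      -> ValueError
# convert_text_to_id("text", object())  -> TypeError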
Example #6
    def convert_texts_to_ids(self, batch_text):
        """将一个batch的明文text转成id
        :param batch_text:
        :return:
        """
        src_ids = []
        position_ids = []
        task_ids = []
        sentence_ids = []
        for text in batch_text:
            if self.field_config.need_convert:
                tokens_text = self.tokenizer.tokenize(text)
                # Apply the truncation policy, reserving two slots for [CLS] and [SEP]
                if len(tokens_text) > self.field_config.max_seq_len - 2:
                    tokens_text = truncation_words(
                        tokens_text, self.field_config.max_seq_len - 2,
                        self.field_config.truncation_type)
                tokens = ["[CLS]"] + tokens_text + ["[SEP]"]
                src_id = self.tokenizer.convert_tokens_to_ids(tokens)
            else:
                # Input is already a sequence of ids, either as a
                # space-separated string or as an iterable of id strings.
                if isinstance(text, str):
                    text = text.split(" ")
                src_id = [int(i) for i in text]
                if len(src_id) > self.field_config.max_seq_len - 2:
                    src_id = truncation_words(
                        src_id, self.field_config.max_seq_len - 2,
                        self.field_config.truncation_type)
                src_id.insert(0, self.tokenizer.covert_token_to_id("[CLS]"))
                src_id.append(self.tokenizer.covert_token_to_id("[SEP]"))

            src_ids.append(src_id)
            pos_id = list(range(len(src_id)))
            task_id = [0] * len(src_id)
            sentence_id = [0] * len(src_id)
            position_ids.append(pos_id)
            task_ids.append(task_id)
            sentence_ids.append(sentence_id)

        return_list = []
        padded_ids, input_mask, batch_seq_lens = pad_batch_data(
            src_ids,
            pad_idx=self.field_config.padding_id,
            return_input_mask=True,
            return_seq_lens=True)
        sent_ids_batch = pad_batch_data(sentence_ids,
                                        pad_idx=self.field_config.padding_id)
        pos_ids_batch = pad_batch_data(position_ids,
                                       pad_idx=self.field_config.padding_id)
        task_ids_batch = pad_batch_data(task_ids,
                                        pad_idx=self.field_config.padding_id)

        return_list.append(padded_ids)  # append src_ids
        return_list.append(sent_ids_batch)  # append sent_ids
        return_list.append(pos_ids_batch)  # append pos_ids
        return_list.append(input_mask)  # append mask
        return_list.append(task_ids_batch)  # append task_ids
        return_list.append(batch_seq_lens)  # append seq_lens

        return return_list
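To make the auxiliary features concrete, here is a worked illustration for one converted 5-token sequence ([CLS] t1 t2 t3 [SEP]); the WordPiece ids are hypothetical, while the position, task, and sentence ids follow directly from the loop above:

src_id      = [101, 7592, 2088, 1037, 102]  # hypothetical WordPiece ids
pos_id      = [0, 1, 2, 3, 4]               # list(range(len(src_id)))
task_id     = [0, 0, 0, 0, 0]               # single-task setup: all zeros
sentence_id = [0, 0, 0, 0, 0]               # single segment: all zeros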
Example #7
    def convert_texts_to_ids(self, batch_text):
        """将一个batch的明文text转成id
        :param batch_text:
        :return:
        """
        src_ids = []
        position_ids = []
        task_ids = []
        sentence_ids = []
        batch_text_a, batch_text_b = batch_text
        assert len(batch_text_a) == len(batch_text_b)

        for text_a, text_b in zip(batch_text_a, batch_text_b):
            if self.field_config.need_convert:
                tokens_text_a = self.tokenizer.tokenize(text_a)
                tokens_text_b = self.tokenizer.tokenize(text_b)
                # Truncate the pair, reserving three slots for [CLS] and two [SEP]s
                truncate_seq_pair(tokens_text_a, tokens_text_b,
                                  self.field_config.max_seq_len - 3)
                text_a_len, text_b_len = len(tokens_text_a), len(tokens_text_b)
                tokens_text = tokens_text_a + ["[SEP]"] + tokens_text_b
                tokens = ["[CLS]"] + tokens_text + ["[SEP]"]
                src_id = self.tokenizer.convert_tokens_to_ids(tokens)
            else:
                # Inputs are already id strings; convert them to ints and join
                # the pair with the real [SEP]/[CLS] ids so the result matches
                # the tokenized branch. truncate_seq_pair already guarantees
                # the combined length fits, so no second truncation is needed.
                src_a_id = [int(i) for i in text_a.split(" ")]
                src_b_id = [int(i) for i in text_b.split(" ")]
                truncate_seq_pair(src_a_id, src_b_id,
                                  self.field_config.max_seq_len - 3)
                text_a_len, text_b_len = len(src_a_id), len(src_b_id)
                sep_id = self.tokenizer.covert_token_to_id("[SEP]")
                src_id = src_a_id + [sep_id] + src_b_id
                src_id.insert(0, self.tokenizer.covert_token_to_id("[CLS]"))
                src_id.append(sep_id)

            src_ids.append(src_id)
            pos_id = list(range(len(src_id)))
            task_id = [0] * len(src_id)
            sentence_id = [0] * (text_a_len + 2) + [1] * (text_b_len + 1)
            position_ids.append(pos_id)
            task_ids.append(task_id)
            sentence_ids.append(sentence_id)

        return_list = []

        padded_ids, input_mask, batch_seq_lens = pad_batch_data(
            src_ids,
            pad_idx=self.field_config.padding_id,
            return_input_mask=True,
            return_seq_lens=True)
        sent_ids_batch = pad_batch_data(sentence_ids,
                                        pad_idx=self.field_config.padding_id)
        pos_ids_batch = pad_batch_data(position_ids,
                                       pad_idx=self.field_config.padding_id)
        task_ids_batch = pad_batch_data(task_ids,
                                        pad_idx=self.field_config.padding_id)

        return_list.append(padded_ids)  # append src_ids
        return_list.append(sent_ids_batch)  # append sent_ids
        return_list.append(pos_ids_batch)  # append pos_ids
        return_list.append(input_mask)  # append mask
        return_list.append(task_ids_batch)  # append task_ids
        return_list.append(batch_seq_lens)  # append seq_lens

        return return_list
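Example #7 additionally relies on truncate_seq_pair, which is not shown either. A minimal sketch, assuming it follows the well-known BERT pair-truncation loop (trim the longer sequence one token at a time, in place, until the pair fits):

def truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate a token pair in place until their combined length fits (sketch)."""
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()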