def convert_texts_to_ids(self, batch_text): """将一个batch的明文text转成id :param batch_text: :return: """ src_ids = [] for text in batch_text: if self.field_config.need_convert: tokens = self.tokenizer.tokenize(text) src_id = self.tokenizer.convert_tokens_to_ids(tokens) else: if isinstance(text, str): src_id = text.split(" ") src_id = [int(i) for i in text] # 加上截断策略 if len(src_id) > self.field_config.max_seq_len - 2: src_id = truncation_words(src_id, self.field_config.max_seq_len - 2, self.field_config.truncation_type) unk_id = self.tokenizer.vocabulary.vocab_dict[self.field_config.tokenizer_info["unk_token"]] src_id.insert(0, unk_id) src_id.append(unk_id) src_ids.append(src_id) return_list = [] padded_ids, mask_ids, batch_seq_lens = pad_batch_data(src_ids, pad_idx=self.field_config.padding_id, return_input_mask=True, return_seq_lens=True) return_list.append(padded_ids) return_list.append(mask_ids) return_list.append(batch_seq_lens) return return_list
def convert_texts_to_ids(self, batch_text): """将一个batch的明文text转成id :param batch_text: :return: """ src_ids = [] for text in batch_text: if self.field_config.need_convert: tokens = self.tokenizer.tokenize(text) src_id = self.tokenizer.convert_tokens_to_ids(tokens) else: src_id = text.split(" ") # 加上截断策略 if len(src_id) > self.field_config.max_seq_len: src_id = truncation_words(src_id, self.field_config.max_seq_len, self.field_config.truncation_type) src_ids.append(src_id) return_list = [] padded_ids, batch_seq_lens = pad_batch_data(src_ids, pad_idx=self.field_config.padding_id, return_input_mask=False, return_seq_lens=True) return_list.append(padded_ids) return_list.append(batch_seq_lens) return return_list
def convert_texts_to_ids(self, batch_text): """ 明文序列化 :return: id_list """ src_ids = [] for text in batch_text: if self.tokenizer and self.field_config.need_convert: tokens = self.tokenizer.tokenize(text) src_id = self.tokenizer.convert_tokens_to_ids(tokens) else: src_id = text.split(" ") # 加上截断策略 if len(src_id) > self.field_config.max_seq_len: src_id = truncation_words(src_id, self.field_config.max_seq_len, self.field_config.truncation_type) src_ids.append(src_id) data_type = "int64" if self.field_config.data_type == DataShape.INT else "float32" padded_ids, batch_seq_lens = pad_batch_data( src_ids, insts_data_type=data_type, pad_idx=self.field_config.padding_id, return_input_mask=False, return_seq_lens=True) return_list = [] return_list.append(padded_ids) return_list.append(batch_seq_lens) return return_list
def convert_texts_to_ids(batch_text_a, tokenizer=None, max_seq_len=512,
                         truncation_type=0, padding_id=0):
    src_ids = []
    position_ids = []
    task_ids = []
    sentence_ids = []
    for text in batch_text_a:
        tokens_text = tokenizer.tokenize(text)
        # apply the truncation strategy, reserving room for [CLS] and [SEP]
        if len(tokens_text) > max_seq_len - 2:
            tokens_text = truncation_words(tokens_text, max_seq_len - 2, truncation_type)
        tokens = []
        tokens.append("[CLS]")
        for token in tokens_text:
            tokens.append(token)
        tokens.append("[SEP]")
        src_id = tokenizer.convert_tokens_to_ids(tokens)

        src_ids.append(src_id)
        pos_id = list(range(len(src_id)))
        task_id = [0] * len(src_id)
        sentence_id = [0] * len(src_id)
        position_ids.append(pos_id)
        task_ids.append(task_id)
        sentence_ids.append(sentence_id)

    return_list = []
    padded_ids, input_mask, batch_seq_lens = pad_batch_data(src_ids,
                                                            pad_idx=padding_id,
                                                            return_input_mask=True,
                                                            return_seq_lens=True)
    sent_ids_batch = pad_batch_data(sentence_ids, pad_idx=padding_id)
    pos_ids_batch = pad_batch_data(position_ids, pad_idx=padding_id)
    task_ids_batch = pad_batch_data(task_ids, pad_idx=padding_id)

    return_list.append(padded_ids)      # append src_ids
    return_list.append(sent_ids_batch)  # append sent_ids
    return_list.append(pos_ids_batch)   # append pos_ids
    return_list.append(input_mask)      # append mask
    return_list.append(task_ids_batch)  # append task_ids
    return_list.append(batch_seq_lens)  # append seq_lens

    return return_list
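# Hypothetical usage of the standalone variant above. `WhitespaceTokenizer` is
# a toy stand-in defined here for illustration only; the repository's real
# tokenizer (with tokenize / convert_tokens_to_ids) would be passed instead.
class WhitespaceTokenizer(object):
    """Toy tokenizer: splits on spaces and hashes tokens to ids (illustrative)."""
    def tokenize(self, text):
        return text.split(" ")

    def convert_tokens_to_ids(self, tokens):
        return [hash(t) % 30000 for t in tokens]

features = convert_texts_to_ids(["hello world", "paddle nlp"],
                                tokenizer=WhitespaceTokenizer(),
                                max_seq_len=128)
padded_ids, sent_ids, pos_ids, input_mask, task_ids, seq_lens = features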
def convert_text_to_id(text, field_config):
    """Convert a single plain-text sample into ids.
    :param text: plain text
    :param field_config: a Field instance
    :return:
    """
    if not text:
        raise ValueError("text input is None")
    if not isinstance(field_config, Field):
        raise TypeError("field_config must be an instance of Field")

    if field_config.need_convert:
        tokenizer = field_config.tokenizer
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
    else:
        ids = text.split(" ")

    # apply the truncation strategy
    if len(ids) > field_config.max_seq_len:
        ids = truncation_words(ids, field_config.max_seq_len, field_config.truncation_type)
    return ids
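# Hypothetical usage of convert_text_to_id, assuming a Field instance that
# carries need_convert, max_seq_len, truncation_type, and (when need_convert is
# set) a tokenizer; construction details are configured elsewhere in the repo,
# so the example is left as commented illustration.
# field = Field(...)                        # configured elsewhere
# field.need_convert = False                # ids arrive pre-tokenized
# ids = convert_text_to_id("12 7 301 5", field)
# assert len(ids) <= field.max_seq_len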
def convert_texts_to_ids(self, batch_text): """将一个batch的明文text转成id :param batch_text: :return: """ src_ids = [] position_ids = [] task_ids = [] sentence_ids = [] for text in batch_text: if self.field_config.need_convert: tokens_text = self.tokenizer.tokenize(text) # 加上截断策略 if len(tokens_text) > self.field_config.max_seq_len - 2: tokens_text = truncation_words( tokens_text, self.field_config.max_seq_len - 2, self.field_config.truncation_type) tokens = [] tokens.append("[CLS]") for token in tokens_text: tokens.append(token) tokens.append("[SEP]") src_id = self.tokenizer.convert_tokens_to_ids(tokens) else: if isinstance(text, str): src_id = text.split(" ") src_id = [int(i) for i in text] if len(src_id) > self.field_config.max_seq_len - 2: src_id = truncation_words( src_id, self.field_config.max_seq_len - 2, self.field_config.truncation_type) src_id.insert(0, self.tokenizer.covert_token_to_id("[CLS]")) src_id.append(self.tokenizer.covert_token_to_id("[SEP]")) src_ids.append(src_id) pos_id = list(range(len(src_id))) task_id = [0] * len(src_id) sentence_id = [0] * len(src_id) position_ids.append(pos_id) task_ids.append(task_id) sentence_ids.append(sentence_id) return_list = [] padded_ids, input_mask, batch_seq_lens = pad_batch_data( src_ids, pad_idx=self.field_config.padding_id, return_input_mask=True, return_seq_lens=True) sent_ids_batch = pad_batch_data(sentence_ids, pad_idx=self.field_config.padding_id) pos_ids_batch = pad_batch_data(position_ids, pad_idx=self.field_config.padding_id) task_ids_batch = pad_batch_data(task_ids, pad_idx=self.field_config.padding_id) return_list.append(padded_ids) # append src_ids return_list.append(sent_ids_batch) # append sent_ids return_list.append(pos_ids_batch) # append pos_ids return_list.append(input_mask) # append mask return_list.append(task_ids_batch) # append task_ids return_list.append(batch_seq_lens) # append seq_lens return return_list
def convert_texts_to_ids(self, batch_text): """将一个batch的明文text转成id :param batch_text: :return: """ src_ids = [] position_ids = [] task_ids = [] sentence_ids = [] batch_text_a, batch_text_b = batch_text assert len(batch_text_a) == len(batch_text_b) for text_a, text_b in zip(batch_text_a, batch_text_b): if self.field_config.need_convert: tokens_text_a = self.tokenizer.tokenize(text_a) tokens_text_b = self.tokenizer.tokenize(text_b) # 加上截断策略 truncate_seq_pair(tokens_text_a, tokens_text_b, self.field_config.max_seq_len - 3) text_a_len, text_b_len = len(tokens_text_a), len(tokens_text_b) tokens_text = tokens_text_a + ["[SEP]"] + tokens_text_b tokens = [] tokens.append("[CLS]") for token in tokens_text: tokens.append(token) tokens.append("[SEP]") src_id = self.tokenizer.convert_tokens_to_ids(tokens) else: src_a_id = text_a.split(" ") src_b_id = text_b.split(" ") truncate_seq_pair(src_a_id, src_b_id, self.field_config.max_seq_len - 3) text_a_len, text_b_len = len(src_a_id), len(src_b_id) src_id = src_a_id + ["[SEP]"] + src_b_id if len(src_id) > self.field_config.max_seq_len - 2: src_id = truncation_words( src_id, self.field_config.max_seq_len - 2, self.field_config.truncation_type) src_id.insert(0, self.tokenizer.covert_token_to_id("[CLS]")) src_id.append(self.tokenizer.covert_token_to_id("[SEP]")) src_ids.append(src_id) pos_id = list(range(len(src_id))) task_id = [0] * len(src_id) sentence_id = [0] * (text_a_len + 2) + [1] * (text_b_len + 1) position_ids.append(pos_id) task_ids.append(task_id) sentence_ids.append(sentence_id) return_list = [] padded_ids, input_mask, batch_seq_lens = pad_batch_data( src_ids, pad_idx=self.field_config.padding_id, return_input_mask=True, return_seq_lens=True) sent_ids_batch = pad_batch_data(sentence_ids, pad_idx=self.field_config.padding_id) pos_ids_batch = pad_batch_data(position_ids, pad_idx=self.field_config.padding_id) task_ids_batch = pad_batch_data(task_ids, pad_idx=self.field_config.padding_id) return_list.append(padded_ids) # append src_ids return_list.append(sent_ids_batch) # append sent_ids return_list.append(pos_ids_batch) # append pos_ids return_list.append(input_mask) # append mask return_list.append(task_ids_batch) # append task_ids return_list.append(batch_seq_lens) # append seq_lens return return_list