def get_bert_input_for_sim(self, sentence_series1, sentence_series2, label_series, batch_size, max_seq_length,
                           bert_prained_path, unique_label_list=None, shuffle=True, language="chinese"):
    """
    Build a BERT data iterator for sentence-pair similarity, encoding the two sentences separately.

    :param sentence_series1: iterable of first sentences
    :param sentence_series2: iterable of second sentences
    :param label_series: iterable of labels
    :param batch_size: batch size of the returned iterator
    :param max_seq_length: maximum sequence length
    :param bert_prained_path: path to the pretrained BERT model
    :param unique_label_list: list of all distinct labels; built from label_series if None
    :param shuffle: whether to shuffle the data
    :param language: language of the text to process
    :return: a DataIter batch iterator
    """
    sentence_list1, sentence_list2, label_list = list(sentence_series1), list(sentence_series2), list(label_series)
    # build label_to_id
    if unique_label_list is not None:
        label_to_id = {k: v for v, k in enumerate(unique_label_list)}
    else:
        label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}
    # get the BERT tokenizer
    tokenizer = self._get_bert_tokenizer(bert_prained_path)
    # encode sentence1: token ids, token type ids, attention masks
    token_ids_list1, token_type_ids_list1, attention_masks_list1 = list(), list(), list()
    for sentence in sentence_list1:
        data_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
        token_ids_list1.append(data_ids["token_ids"])
        token_type_ids_list1.append(data_ids["token_type_ids"])
        attention_masks_list1.append(data_ids["attention_masks"])
    # encode sentence2: token ids, token type ids, attention masks
    token_ids_list2, token_type_ids_list2, attention_masks_list2 = list(), list(), list()
    for sentence in sentence_list2:
        data_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
        token_ids_list2.append(data_ids["token_ids"])
        token_type_ids_list2.append(data_ids["token_type_ids"])
        attention_masks_list2.append(data_ids["attention_masks"])
    # convert labels to ids
    label_id_list = [label_to_id[lab] for lab in label_list]
    # assemble the data dict
    data_dict = {
        "token_ids1": token_ids_list1,
        "token_type_ids1": token_type_ids_list1,
        "attention_masks1": attention_masks_list1,
        "token_ids2": token_ids_list2,
        "token_type_ids2": token_type_ids_list2,
        "attention_masks2": attention_masks_list2,
        "label_ids": label_id_list,
    }
    dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle)
    return dataiter
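
# Minimal usage sketch for get_bert_input_for_sim. Everything below is an assumption for
# illustration only: `processor` stands for an instance of this class, and the CSV column
# names and the local bert-base-chinese path are made up, not part of this repo.
#
#   import pandas as pd
#   df = pd.read_csv("train.csv")
#   train_iter = processor.get_bert_input_for_sim(
#       df["sentence1"], df["sentence2"], df["label"],
#       batch_size=32, max_seq_length=128,
#       bert_prained_path="./bert-base-chinese",
#       shuffle=True,
#   )
#   # each batch carries the two sentences encoded separately plus the label ids
#   for batch in train_iter:
#       ...
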
def get_bert_input_for_double_sentence_add_feature(self, sentence_series1, sentence_series2, features_list,
                                                   label_series, batch_size, max_seq_length, bert_prained_path,
                                                   unique_label_list=None, shuffle=False, language="chinese",
                                                   drop_last=True):
    """
    Build a BERT data iterator for a jointly encoded sentence pair with extra handcrafted features per sample.

    :param sentence_series1: iterable of first sentences
    :param sentence_series2: iterable of second sentences
    :param features_list: iterable of extra feature vectors, one per sample
    :param label_series: iterable of labels
    :param batch_size: batch size of the returned iterator
    :param max_seq_length: maximum sequence length
    :param bert_prained_path: path to the pretrained BERT model
    :param unique_label_list: list of all distinct labels; built from label_series if None
    :param shuffle: whether to shuffle the data
    :param language: language of the text to process
    :param drop_last: whether to drop the last incomplete batch
    :return: a DataIter batch iterator
    """
    sentence_list1, sentence_list2, label_list, features_list = \
        list(sentence_series1), list(sentence_series2), list(label_series), list(features_list)
    # build label_to_id
    if unique_label_list is not None:
        label_to_id = {k: v for v, k in enumerate(unique_label_list)}
    else:
        label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}
    # get the BERT tokenizer
    tokenizer = self._get_bert_tokenizer(bert_prained_path)
    # encode each sentence pair jointly: token ids, token type ids, attention masks
    token_ids_list, token_type_ids_list, attention_masks_list = list(), list(), list()
    for sentence1, sentence2 in zip(sentence_list1, sentence_list2):
        data_ids = self._deal_two_sentence_for_bert(sentence1, sentence2, language, max_seq_length, tokenizer)
        token_ids_list.append(data_ids["token_ids"])
        token_type_ids_list.append(data_ids["token_type_ids"])
        attention_masks_list.append(data_ids["attention_masks"])
    # convert labels to ids
    label_ids_list = [label_to_id[label] for label in label_list]
    # assemble the data dict
    data_dict = {
        "token_ids": token_ids_list,
        "token_type_ids": token_type_ids_list,
        "attention_masks": attention_masks_list,
        "label_ids": label_ids_list,
        "features_list": features_list
    }
    dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle,
                                                                                   drop_last=drop_last)
    return dataiter
def get_word2vec_input_for_seq(self, sentence_series, label_series, batch_size, max_seq_length, word2id,
                               label_to_id=None, shuffle=True, drop_last=True, language="chinese"):
    """
    Build a word2vec data iterator for sequence labelling (one label per token).

    :param sentence_series: iterable of sentences (space-separated tokens)
    :param label_series: iterable of label sequences aligned with the tokens
    :param batch_size: batch size of the returned iterator
    :param max_seq_length: maximum sequence length
    :param word2id: mapping from token to id
    :param label_to_id: mapping from label to id; built from label_series if None
    :param shuffle: whether to shuffle the data
    :param drop_last: whether to drop the last incomplete batch
    :param language: language of the text to process
    :return: a DataIter batch iterator
    """
    sentence_list, label_list = list(sentence_series), list(label_series)
    # build label_to_id, reserving id 0 for padding
    if label_to_id is None:
        label_to_id = {k: v + 1 for v, k in
                       enumerate(self._get_unique_lable_list(list(chain.from_iterable(label_list))))}
        label_to_id["<PAD>"] = 0
    # convert sentences to token ids and labels to label ids
    token_ids_list, label_ids_list, token_len_list = list(), list(), list()
    for sentence, label in zip(sentence_list, label_list):
        dict_ids = self._deal_one_sentence_one_label_for_word2vec(sentence, label, max_seq_length, word2id,
                                                                  label_to_id, language)
        token_ids_list.append(dict_ids["token_ids"])
        label_ids_list.append(dict_ids["label_ids"])
        token_len_list.append(dict_ids["token_len"])
    # truncate the raw sentences and labels to the maximum sequence length
    raw_sentence = list()
    raw_label = list()
    for sentence, label in zip(sentence_list, label_list):
        sentence = sentence.split(" ")
        label = label.split(" ")
        if len(sentence) > max_seq_length - 2:
            raw_sentence.append(sentence[:max_seq_length - 2])
            raw_label.append(label[:max_seq_length - 2])
        else:
            raw_sentence.append(sentence)
            raw_label.append(label)
    # assemble the data dict
    data_dict = {
        "raw_x": raw_sentence,
        "raw_y": raw_label,
        "token_ids": token_ids_list,
        "label_ids": label_ids_list,
        "token_len": token_len_list
    }
    dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle, drop_last)
    return dataiter
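
# Minimal usage sketch for get_word2vec_input_for_seq. Hypothetical data only: sentences and
# labels are space-separated strings (as implied by the split(" ") calls above), the word2id
# vocabulary and its special tokens are made up, and label_to_id is passed explicitly since the
# exact format expected by _deal_one_sentence_one_label_for_word2vec is not shown here.
#
#   sentences = ["我 在 北京 工作", "他 去 上海"]
#   labels    = ["O O B-LOC O", "O O B-LOC"]
#   word2id   = {"<PAD>": 0, "<UNK>": 1, "我": 2, "在": 3, "北京": 4, "工作": 5, "他": 6, "去": 7, "上海": 8}
#   label_map = {"<PAD>": 0, "O": 1, "B-LOC": 2}
#   train_iter = processor.get_word2vec_input_for_seq(
#       sentences, labels, batch_size=2, max_seq_length=16,
#       word2id=word2id, label_to_id=label_map,
#   )
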
def get_bert_input_for_double_sen(self, sentence_series1, sentence_series2, label_series, batch_size, max_seq_length,
                                  bert_prained_path, unique_label_list=None, shuffle=True, language="chinese",
                                  drop_last=False):
    """
    Build a BERT data iterator for a sentence pair encoded jointly as one input sequence.

    :param sentence_series1: iterable of first sentences
    :param sentence_series2: iterable of second sentences
    :param label_series: iterable of labels (ints or numeric strings)
    :param batch_size: batch size of the returned iterator
    :param max_seq_length: maximum sequence length
    :param bert_prained_path: path to the pretrained BERT model
    :param unique_label_list: list of all distinct labels; built from label_series if None
    :param shuffle: whether to shuffle the data
    :param language: language of the text to process
    :param drop_last: whether to drop the last incomplete batch
    :return: a DataIter batch iterator
    """
    sentence_list1, sentence_list2, label_list = list(sentence_series1), list(sentence_series2), list(label_series)
    # build label_to_id
    if unique_label_list is not None:
        label_to_id = {k: v for v, k in enumerate(unique_label_list)}
    else:
        label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}
    # get the BERT tokenizer
    tokenizer = self._get_bert_tokenizer(bert_prained_path)
    # encode each sentence pair jointly: token ids, token type ids, attention masks
    token_ids_list, token_type_ids_list, attention_masks_list = list(), list(), list()
    for sentence1, sentence2 in zip(sentence_list1, sentence_list2):
        data_ids = self._deal_two_sentence_for_bert(sentence1, sentence2, language, max_seq_length, tokenizer)
        token_ids_list.append(data_ids["token_ids"])
        token_type_ids_list.append(data_ids["token_type_ids"])
        attention_masks_list.append(data_ids["attention_masks"])
    # convert labels to ids; labels may arrive as ints or numeric strings
    label_id_list = list()
    for lab in label_list:
        if isinstance(lab, int):
            label_id_list.append(label_to_id[lab])
        else:
            label_id_list.append(label_to_id[int(lab)])
    # assemble the data dict
    data_dict = {
        "token_ids": token_ids_list,
        "token_type_ids": token_type_ids_list,
        "attention_masks": attention_masks_list,
        "label_ids": label_id_list,
    }
    dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_double_sentence_iter_for_bert(data_dict, shuffle,
                                                                                                   drop_last=drop_last)
    return dataiter
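
# Minimal usage sketch for get_bert_input_for_double_sen (hypothetical data). Unlike
# get_bert_input_for_sim above, each pair is encoded jointly by _deal_two_sentence_for_bert
# into a single input, and labels may be ints or numeric strings (they are normalised with int()).
#
#   pairs1 = ["今天天气不错", "他喜欢足球"]
#   pairs2 = ["今天是晴天", "他讨厌运动"]
#   labels = ["1", "0"]
#   train_iter = processor.get_bert_input_for_double_sen(
#       pairs1, pairs2, labels,
#       batch_size=2, max_seq_length=64,
#       bert_prained_path="./bert-base-chinese",
#       unique_label_list=[0, 1],
#   )
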
def get_bert_input_for_cls_add_feature(self, sentence_series, features_list, label_series, batch_size, max_seq_length,
                                       bert_prained_path, unique_label_list=None, shuffle=False, language="chinese",
                                       drop_last=True):
    """
    Build a BERT data iterator for single-sentence classification with extra handcrafted features per sample.

    :param sentence_series: pandas.Series of sentences
    :param features_list: iterable of extra feature vectors, one per sample
    :param label_series: pandas.Series of labels
    :param batch_size: int, batch size of the returned iterator
    :param max_seq_length: int, maximum sequence length
    :param bert_prained_path: path to the pretrained BERT model
    :param unique_label_list: list of all distinct labels; built from label_series if None
    :param shuffle: whether to shuffle the data
    :param language: language of the text to process
    :param drop_last: whether to drop the last incomplete batch
    :return: a DataIter batch iterator
    """
    sentence_list, label_list, features_list = list(sentence_series), list(label_series), list(features_list)
    # build label_to_id
    if unique_label_list is not None:
        label_to_id = {k: v for v, k in enumerate(unique_label_list)}
    else:
        label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}
    # get the BERT tokenizer
    tokenizer = self._get_bert_tokenizer(bert_prained_path)
    # encode each sentence: token ids, token type ids, attention masks
    token_ids_list, token_type_ids_list, attention_masks_list = list(), list(), list()
    for sentence in sentence_list:
        dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
        token_ids_list.append(dict_ids["token_ids"])
        token_type_ids_list.append(dict_ids["token_type_ids"])
        attention_masks_list.append(dict_ids["attention_masks"])
    # convert labels to ids
    label_ids_list = [label_to_id[label] for label in label_list]
    # assemble the data dict
    data_dict = {
        "token_ids": token_ids_list,
        "token_type_ids": token_type_ids_list,
        "attention_masks": attention_masks_list,
        "label_ids": label_ids_list,
        "features_list": features_list
    }
    dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle,
                                                                                   drop_last=drop_last)
    return dataiter
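
# Minimal usage sketch for get_bert_input_for_cls_add_feature. Hypothetical column names and
# feature vectors; features_list is assumed to hold one numeric feature vector per row and is
# passed through to the batches unchanged under the "features_list" key.
#
#   import pandas as pd
#   df = pd.read_csv("train.csv")
#   features = df[["len_ratio", "overlap"]].values.tolist()
#   train_iter = processor.get_bert_input_for_cls_add_feature(
#       df["text"], features, df["label"],
#       batch_size=16, max_seq_length=128,
#       bert_prained_path="./bert-base-chinese",
#       unique_label_list=[0, 1],
#   )
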
def get_bert_input_for_context_cls(self, sentence_list_f, sentence_list_m, sentence_list_b, label_list, batch_size,
                                   max_seq_length, bert_prained_path, unique_label_list, shuffle=True,
                                   language="chinese", drop_last=False):
    """
    Build a BERT data iterator for context classification; the f/m/b sentence lists are encoded
    separately, and any of them may be None to skip that group.

    :param sentence_list_f: iterable of sentences for the "f" group, or None
    :param sentence_list_m: iterable of sentences for the "m" group, or None
    :param sentence_list_b: iterable of sentences for the "b" group, or None
    :param label_list: iterable of labels
    :param batch_size: batch size of the returned iterator
    :param max_seq_length: maximum sequence length
    :param bert_prained_path: path to the pretrained BERT model
    :param unique_label_list: list of all distinct labels
    :param shuffle: whether to shuffle the data
    :param language: language of the text to process
    :param drop_last: whether to drop the last incomplete batch
    :return: a DataIter batch iterator
    """
    # build label_to_id
    label_to_id = {k: v for v, k in enumerate(unique_label_list)}
    # get the BERT tokenizer
    tokenizer = self._get_bert_tokenizer(bert_prained_path)
    # data dict holding all model inputs
    data_dict = dict()
    # encode the f-group sentences: token ids, token type ids, attention masks
    if sentence_list_f is not None:
        token_ids_list_f = list()
        token_type_ids_list_f, attention_masks_list_f = list(), list()
        for sentence in sentence_list_f:
            dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            token_ids_list_f.append(dict_ids["token_ids"])
            token_type_ids_list_f.append(dict_ids["token_type_ids"])
            attention_masks_list_f.append(dict_ids["attention_masks"])
        data_dict["token_ids_f"] = token_ids_list_f
        data_dict["token_type_ids_f"] = token_type_ids_list_f
        data_dict["attention_masks_f"] = attention_masks_list_f
    # encode the m-group sentences: token ids, token type ids, attention masks
    if sentence_list_m is not None:
        token_ids_list_m = list()
        token_type_ids_list_m, attention_masks_list_m = list(), list()
        for sentence in sentence_list_m:
            dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            token_ids_list_m.append(dict_ids["token_ids"])
            token_type_ids_list_m.append(dict_ids["token_type_ids"])
            attention_masks_list_m.append(dict_ids["attention_masks"])
        data_dict["token_ids_m"] = token_ids_list_m
        data_dict["token_type_ids_m"] = token_type_ids_list_m
        data_dict["attention_masks_m"] = attention_masks_list_m
    # encode the b-group sentences: token ids, token type ids, attention masks
    if sentence_list_b is not None:
        token_ids_list_b = list()
        token_type_ids_list_b, attention_masks_list_b = list(), list()
        for sentence in sentence_list_b:
            dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            token_ids_list_b.append(dict_ids["token_ids"])
            token_type_ids_list_b.append(dict_ids["token_type_ids"])
            attention_masks_list_b.append(dict_ids["attention_masks"])
        data_dict["token_ids_b"] = token_ids_list_b
        data_dict["token_type_ids_b"] = token_type_ids_list_b
        data_dict["attention_masks_b"] = attention_masks_list_b
    # convert labels to ids
    label_ids_list = [label_to_id[label] for label in label_list]
    data_dict["label_ids"] = label_ids_list
    dataiter = DataIter(batch_size, max_seq_length, num_works=self.num_works).get_iter_for_bert(data_dict, shuffle,
                                                                                                drop_last=drop_last)
    return dataiter
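
# Minimal usage sketch for get_bert_input_for_context_cls (hypothetical data). The _f/_m/_b
# arguments are treated here simply as three context groups that are encoded separately; the
# variable names front_sents/target_sents and the label names are assumptions, and any group
# can be passed as None to leave it out of the batches.
#
#   train_iter = processor.get_bert_input_for_context_cls(
#       sentence_list_f=front_sents,
#       sentence_list_m=target_sents,
#       sentence_list_b=None,           # skip the third group
#       label_list=labels,
#       batch_size=16, max_seq_length=64,
#       bert_prained_path="./bert-base-chinese",
#       unique_label_list=["neg", "pos"],
#   )
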