Example #1
    def get_bert_input_for_sim(self, sentence_series1, sentence_series2, label_series, batch_size, max_seq_length,
                               bert_prained_path, unique_label_list=None, shuffle=True, language="chinese"):
        """

        :param sentence_series1:
        :param sentence_series2:
        :param label_series:
        :param batch_size:
        :param max_seq_length:
        :param bert_prained_path:
        :param unique_label_list:
        :param shuffle:
        :param language:
        :return:
        """
        sentence_list1, sentence_list2, label_list = list(sentence_series1), list(sentence_series2), list(label_series)

        # get label_to_id
        if unique_label_list is not None:
            label_to_id = {k: v for v, k in enumerate(unique_label_list)}
        else:
            label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}

        # get bert tokenizer
        tokenizer = self._get_bert_tokenizer(bert_prained_path)

        # deal data for sentence1
        token_ids_list1, token_type_ids_list1, attention_masks_list1 = list(), list(), list()
        for sentence in sentence_list1:
            data_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            # data
            token_ids_list1.append(data_ids["token_ids"])
            token_type_ids_list1.append(data_ids["token_type_ids"])
            attention_masks_list1.append(data_ids["attention_masks"])

        # deal data for sentence2
        token_ids_list2, token_type_ids_list2, attention_masks_list2 = list(), list(), list()
        for sentence in sentence_list2:
            data_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            # data
            token_ids_list2.append(data_ids["token_ids"])
            token_type_ids_list2.append(data_ids["token_type_ids"])
            attention_masks_list2.append(data_ids["attention_masks"])

        # deal data for label
        label_id_list = [label_to_id[lab] for lab in label_list]

        # data dict
        data_dict = {
            "token_ids1": token_ids_list1,
            "token_type_ids1": token_type_ids_list1,
            "attention_masks1": attention_masks_list1,
            "token_ids2": token_ids_list2,
            "token_type_ids2": token_type_ids_list2,
            "attention_masks2": attention_masks_list2,
            "label_ids": label_id_list,
        }
        dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle)
        return dataiter
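A minimal usage sketch for this method. The enclosing class is not shown in the snippet, so DataProcessor, the column names, and the model path below are all illustrative assumptions:

    # Hypothetical usage: DataProcessor stands in for whatever class defines
    # get_bert_input_for_sim; the column names and BERT path are made up.
    import pandas as pd

    df = pd.DataFrame({
        "sentence1": ["今天天气很好", "我喜欢看书"],
        "sentence2": ["今天天气不错", "我讨厌读书"],
        "label": [1, 0],
    })
    processor = DataProcessor()
    data_iter = processor.get_bert_input_for_sim(
        df["sentence1"], df["sentence2"], df["label"],
        batch_size=2, max_seq_length=32,
        bert_prained_path="/path/to/bert-base-chinese",
    )
    for batch in data_iter:
        pass  # feed each batch to a two-tower similarity model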
Example #2
    def get_bert_input_for_double_sentence_add_feature(self, sentence_series1, sentence_series2, features_list,
                                                       label_series, batch_size, max_seq_length, bert_prained_path,
                                                       unique_label_list=None, shuffle=False, language="chinese",
                                                       drop_last=True):
        """
        :param sentence_series1:
        :param sentence_series2:
        :param features_list:
        :param label_series:
        :param batch_size:
        :param max_seq_length:
        :param bert_prained_path:
        :param unique_label_list:
        :param shuffle:
        :param language:
        :param drop_last:
        :return:
        """
        sentence_list1, sentence_list2, label_list, features_list = list(sentence_series1), list(sentence_series2), list(label_series), list(features_list)

        # get label_to_id
        if unique_label_list is not None:
            label_to_id = {k: v for v, k in enumerate(unique_label_list)}
        else:
            label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}

        # get bert tokenizer
        tokenizer = self._get_bert_tokenizer(bert_prained_path)

        # convert sentences to token_id & token type id & attention mask
        token_ids_list = list()
        token_type_ids_list, attention_masks_list = list(), list()

        for sentence1, sentence2 in zip(sentence_list1, sentence_list2):
            data_ids = self._deal_two_sentence_for_bert(sentence1, sentence2, language, max_seq_length, tokenizer)

            # data
            token_ids_list.append(data_ids["token_ids"])
            token_type_ids_list.append(data_ids["token_type_ids"])
            attention_masks_list.append(data_ids["attention_masks"])

        # convert label id
        label_ids_list = [label_to_id[label] for label in label_list]

        # data dict
        data_dict = {
            "token_ids": token_ids_list,
            "token_type_ids": token_type_ids_list,
            "attention_masks": attention_masks_list,
            "label_ids": label_ids_list,
            "features_list": features_list
        }
        dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle,
                                                                                       drop_last=drop_last)
        return dataiter
Example #3
    def get_word2vec_input_for_seq(self, sentence_series, label_series, batch_size, max_seq_length,
                                   word2id, label_to_id=None, shuffle=True, drop_last=True, language="chinese"):
        """
        :param sentence_series:
        :param label_series:
        :param batch_size:
        :param max_seq_length:
        :param word2id:
        :param label_to_id:
        :param shuffle:
        :param drop_last:
        :param language:
        :return:
        """
        sentence_list, label_list = list(sentence_series), list(label_series)

        # get label_to_id (labels are space-separated tag strings; reserve id 0 for padding)
        if label_to_id is None:
            unique_labels = self._get_unique_lable_list(list(chain.from_iterable(label.split(" ") for label in label_list)))
            label_to_id = {k: v + 1 for v, k in enumerate(unique_labels)}
            label_to_id["<PAD>"] = 0

        # convert sentences to token ids and labels to label ids via word2id / label_to_id
        token_ids_list, label_ids_list, token_len_list = list(), list(), list()
        for sentence, label in zip(sentence_list, label_list):
            dict_ids = self._deal_one_sentence_one_label_for_word2vec(sentence, label, max_seq_length, word2id, label_to_id, language)
            token_ids_list.append(dict_ids["token_ids"])
            label_ids_list.append(dict_ids["label_ids"])
            token_len_list.append(dict_ids["token_len"])

        # truncate the raw sentences and labels to max_seq_length - 2
        raw_sentence = list()
        raw_label = list()
        for sentence, label in zip(sentence_list, label_list):
            sentence = sentence.split(" ")
            label = label.split(" ")
            if len(sentence) > max_seq_length - 2:
                raw_sentence.append(sentence[:max_seq_length - 2])
                raw_label.append(label[:max_seq_length - 2])
            else:
                raw_sentence.append(sentence)
                raw_label.append(label)

        # data dict
        data_dict = {
            "raw_x": raw_sentence,
            "raw_y": raw_label,
            "token_ids": token_ids_list,
            "label_ids": label_ids_list,
            "token_len": token_len_list
        }
        dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle, drop_last=drop_last)
        return dataiter
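A hypothetical call for a sequence-labeling (e.g. NER) setup, assuming space-separated tokens and tags as the truncation loop above implies; word2id, the sample data, and the processor class are placeholders:

    # Hypothetical usage: word2id, DataProcessor and the data are illustrative.
    import pandas as pd

    word2id = {"<PAD>": 0, "<UNK>": 1, "我": 2, "在": 3, "北": 4, "京": 5}
    sentences = pd.Series(["我 在 北 京"])
    labels = pd.Series(["O O B-LOC I-LOC"])
    processor = DataProcessor()
    data_iter = processor.get_word2vec_input_for_seq(
        sentences, labels, batch_size=1, max_seq_length=16, word2id=word2id,
    )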
Example #4
    def get_bert_input_for_double_sen(self, sentence_series1, sentence_series2, label_series, batch_size,
                                      max_seq_length, bert_prained_path, unique_label_list=None, shuffle=True,
                                      language="chinese", drop_last=False):
        """
        :param sentence_series1:
        :param sentence_series2:
        :param label_series:
        :param batch_size:
        :param max_seq_length:
        :param bert_prained_path:
        :param unique_label_list:
        :param shuffle:
        :param language:
        :return:
        """
        sentence_list1, sentence_list2, label_list = list(sentence_series1), list(sentence_series2), list(label_series)

        # get label_to_id
        if unique_label_list is not None:
            label_to_id = {k: v for v, k in enumerate(unique_label_list)}
        else:
            label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}

        # get bert tokenizer
        tokenizer = self._get_bert_tokenizer(bert_prained_path)

        # deal data for sentence pairs
        token_ids_list, token_type_ids_list, attention_masks_list = list(), list(), list()
        for sentence1, sentence2 in zip(sentence_list1, sentence_list2):
            data_ids = self._deal_two_sentence_for_bert(sentence1, sentence2, language, max_seq_length, tokenizer)

            # data
            token_ids_list.append(data_ids["token_ids"])
            token_type_ids_list.append(data_ids["token_type_ids"])
            attention_masks_list.append(data_ids["attention_masks"])

        # deal data for label (labels may arrive as strings, so coerce them to int before lookup)
        label_id_list = [label_to_id[lab if isinstance(lab, int) else int(lab)] for lab in label_list]

        # data dict
        data_dict = {
            "token_ids": token_ids_list,
            "token_type_ids": token_type_ids_list,
            "attention_masks": attention_masks_list,
            "label_ids": label_id_list,
        }
        dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_double_sentence_iter_for_bert(data_dict, shuffle, drop_last=drop_last)
        return dataiter
Example #5
    def get_bert_input_for_cls_add_feature(self, sentence_series, features_list, label_series, batch_size, max_seq_length,
                                           bert_prained_path, unique_label_list=None, shuffle=False, language="chinese", drop_last=True):
        """
        :param sentence_series: pandas.Series
        :param label_series: pandas.Series
        :param batch_size: int
        :param max_seq_length: int
        :param bert_prained_path: bert 预训练模型的路径
        :param unique_label_list: 所有不同的label的集合
        :param shuffle: 是否对数据的顺序进行打乱
        :param language: 处理的文本的语言类型
        :return:
        """
        sentence_list, label_list, features_list = list(sentence_series), list(label_series), list(features_list)

        # get label_to_id
        if unique_label_list is not None:
            label_to_id = {k: v for v, k in enumerate(unique_label_list)}
        else:
            label_to_id = {k: v for v, k in enumerate(self._get_unique_lable_list(label_list))}

        # get bert tokenizer
        tokenizer = self._get_bert_tokenizer(bert_prained_path)

        # convert sentences to token_id & token type id & attention mask
        token_ids_list = list()
        token_type_ids_list, attention_masks_list = list(), list()
        for sentence in sentence_list:
            dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
            token_ids_list.append(dict_ids["token_ids"])
            token_type_ids_list.append(dict_ids["token_type_ids"])
            attention_masks_list.append(dict_ids["attention_masks"])

        # convert label id
        label_ids_list = [label_to_id[label] for label in label_list]

        # data dict
        data_dict = {
            "token_ids": token_ids_list,
            "token_type_ids": token_type_ids_list,
            "attention_masks": attention_masks_list,
            "label_ids": label_ids_list,
            "features_list": features_list
        }
        dataiter = DataIter(batch_size, max_seq_length, num_works=4).get_iter_for_bert(data_dict, shuffle, drop_last=drop_last)
        return dataiter
Example #6
    def get_bert_input_for_context_cls(self, sentence_list_f, sentence_list_m, sentence_list_b, label_list, batch_size,
                                       max_seq_length, bert_prained_path, unique_label_list, shuffle=True, language="chinese", drop_last=False):
        """
        :param sentence_list_f:
        :param sentence_list_m:
        :param sentence_list_b:
        :param label_list:
        :param batch_size:
        :param max_seq_length:
        :param bert_prained_path:
        :param unique_label_list:
        :param shuffle:
        :param language:
        :return:
        """
        # get label_to_id
        label_to_id = {k: v for v, k in enumerate(unique_label_list)}

        # get bert tokenizer
        tokenizer = self._get_bert_tokenizer(bert_prained_path)

        # data dict
        data_dict = dict()

        # convert each available context window to token ids, token type ids and attention masks
        for suffix, sentence_list in (("f", sentence_list_f), ("m", sentence_list_m), ("b", sentence_list_b)):
            if sentence_list is None:
                continue
            token_ids_list, token_type_ids_list, attention_masks_list = list(), list(), list()
            for sentence in sentence_list:
                dict_ids = self._deal_one_sentence_for_bert(sentence, language, max_seq_length, tokenizer)
                token_ids_list.append(dict_ids["token_ids"])
                token_type_ids_list.append(dict_ids["token_type_ids"])
                attention_masks_list.append(dict_ids["attention_masks"])
            data_dict["token_ids_" + suffix] = token_ids_list
            data_dict["token_type_ids_" + suffix] = token_type_ids_list
            data_dict["attention_masks_" + suffix] = attention_masks_list

        # convert label id
        label_ids_list = [label_to_id[label] for label in label_list]

        data_dict["label_ids"] = label_ids_list

        dataiter = DataIter(batch_size, max_seq_length, num_works=self.num_works).get_iter_for_bert(data_dict, shuffle, drop_last=drop_last)
        return dataiter
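A sketch of how this context-aware variant might be called, assuming the three lists hold the preceding, current and following sentence of each sample; passing None for a list skips that context window, as the guards above show:

    # Hypothetical usage: DataProcessor and the sample data are illustrative.
    processor = DataProcessor()
    data_iter = processor.get_bert_input_for_context_cls(
        sentence_list_f=["前面一句话。"],  # preceding context
        sentence_list_m=["当前这句话。"],  # current sentence
        sentence_list_b=None,              # no following context
        label_list=["positive"],
        batch_size=1, max_seq_length=64,
        bert_prained_path="/path/to/bert-base-chinese",
        unique_label_list=["negative", "positive"],
    )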