def make_dataset(sample_train, sample_dev, label2index, tokenizer, cut_off=False):
    log_obj.info(">>>>>>>>>>>>> sample counts=%s %s <<<<<<<<<<<<<<" % (len(sample_train), len(sample_dev)))
    # keep only samples whose label is known to this task
    sample_train = list(filter(lambda x: x["label"] in label2index, sample_train))
    sample_dev = list(filter(lambda x: x["label"] in label2index, sample_dev))
    log_obj.info(">>>>>>>>>>>>> sample counts after filtering=%s %s <<<<<<<<<<" % (len(sample_train), len(sample_dev)))
    train_examples = _make_examples(sample_train)
    test_examples = _make_examples(sample_dev)
    if cut_off:
        train_examples = train_examples[:properties.BATCH_SIZE]
        test_examples = test_examples[:properties.EVAL_BATCH_SIZE]
    # label-to-index conversion happens inside convert_examples_to_features
    train_ds = tokenize_utils.convert_examples_to_features(train_examples, tokenizer, label2index)
    test_ds = tokenize_utils.convert_examples_to_features(test_examples, tokenizer, label2index)
    # .repeat(-1) would make the dataset repeat indefinitely; we iterate per epoch instead
    train_ds = train_ds.shuffle(2 * properties.BATCH_SIZE).batch(properties.BATCH_SIZE)
    test_ds = test_ds.batch(properties.EVAL_BATCH_SIZE)
    return train_ds, test_ds, len(train_examples), len(test_examples)
def process(path, train_to_path, dev_to_path, extend_sample_map):
    """
    :param extend_sample_map: per-label resampling ratio
    """
    # split into training and dev sets, label by label
    model0_sample = utils.load_json_file(path)
    sample_dic = {}
    for i in model0_sample:
        if i["label"] in sample_dic:
            sample_dic[i["label"]].append(i)
        else:
            sample_dic[i["label"]] = [i]
    train_examples = []
    test_examples = []
    for l, lis in sample_dic.items():
        a, b = train_test_split(lis, test_size=properties.test_dev_size)
        train_examples.extend(a)
        test_examples.extend(b)
    print("train vs dev=", len(train_examples), len(test_examples))
    # resample the training set according to extend_sample_map
    train_dic = {}
    for i in train_examples:
        if i["label"] in train_dic:
            train_dic[i["label"]].append(i)
        else:
            train_dic[i["label"]] = [i]
    for k, v in train_dic.items():
        print(k, len(v))
    for label, ratio in extend_sample_map.items():
        if ratio <= 1:
            # downsample: keep a random fraction of the bucket
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            train_dic[label] = tmp_lis[:int(len(tmp_lis) * ratio)]
        else:
            # oversample: duplicate the bucket, then trim to the exact ratio
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            for j in range(math.ceil(ratio)):
                train_dic[label].extend(tmp_lis)
            train_dic[label] = train_dic[label][:int(len(tmp_lis) * ratio)]
    print("after resampling")
    for k, v in train_dic.items():
        print(k, len(v))
    train_examples = []
    for l, lis in train_dic.items():
        train_examples.extend(lis)
    train_examples = shuffle_list(train_examples)
    test_examples = shuffle_list(test_examples)
    log_obj.info("split train : dev = %s : %s" % (len(train_examples), len(test_examples)))
    utils.dump_json_file(train_to_path, train_examples)
    utils.dump_json_file(dev_to_path, test_examples)
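# Usage sketch for process(). "data/tn_all.json" is a hypothetical raw dump
# (the real source path is not shown in this excerpt); the {'114': 3} map
# mirrors the call in the training script. Ratios > 1 oversample a label,
# ratios <= 1 downsample it.
def _demo_process():
    process("data/tn_all.json", "data/tn_train.json", "data/tn_dev.json",
            extend_sample_map={"114": 3})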
def predict(self, input_lis, task_name):
    ds = self.make_dataset(input_lis)
    if ds is None:
        return []
    y_predict = []
    for _, inputs in enumerate(ds):
        if _ % 100 == 0:
            log_obj.info("batch %s", _)
        outputs = self.model(inputs[0], task_name, training=False)
        res = tf.math.argmax(outputs, axis=-1)
        y_predict.extend(res.numpy().tolist())
    assert len(y_predict) == len(input_lis), "len not match"
    return y_predict
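# Sketch of mapping predict()'s integer outputs back to label strings. The
# index2label dict is assumed to be the inverse of the label2index map used
# when the dataset was built; it is not defined in this excerpt.
def indices_to_labels(y_predict, index2label):
    return [index2label[idx] for idx in y_predict]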
def _make_examples(sample):
    all_labels = [x["label"] for x in sample]
    all_labels = Counter(all_labels)
    log_obj.info("label distribution: %s" % (all_labels))
    examples = []
    for _, x in enumerate(sample):
        text_a, text_b = utils.trunc_seq_pair(x["text_a"],
                                              x.get("text_b", None),
                                              max_len=properties.MAX_LEN)
        examples.append(
            InputExample(x.get("id", None), text_a, text_b=text_b, label=x.get("label", None)))
    return examples
def inspect_data(path):
    sample = utils.load_json_file(path)
    key_fq = Counter([i["label"] for i in sample])
    sorted_key_fq = sorted(key_fq.items(), key=lambda x: x[1], reverse=True)
    log_obj.info("total samples: " + str(len(sample)))
    log_obj.info("total classes: " + str(len(key_fq)))
    log_obj.info(key_fq)
    log_obj.info(sorted_key_fq)
    return sorted_key_fq
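# Sketch of how inspect_data() can feed the extend_sample_map passed to
# process(): aim every label at a rough target count. The helper name and the
# target_count default are assumptions, not part of the original pipeline.
def suggest_extend_sample_map(path, target_count=1000):
    sorted_key_fq = inspect_data(path)
    return {label: target_count / count for label, count in sorted_key_fq}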
def train_a_dataset(model, train_ds, test_ds, task_name, EPOCHS=10, to_model_path=""):
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy("train_acc")
    dev_loss = tf.keras.metrics.Mean(name='dev_loss')
    dev_acc = tf.keras.metrics.SparseCategoricalAccuracy("dev_acc")
    dev_recall = tf.keras.metrics.Mean(name='dev_recall')
    dev_precision = tf.keras.metrics.Mean(name='dev_precision')
    dev_f1 = tf.keras.metrics.Mean(name='dev_f1')

    @tf.function
    def train_step(inputs):
        with tf.GradientTape() as tape:
            # inputs[0] is the dict of the three BERT inputs, inputs[1] the labels;
            # outputs are logits
            outputs = model(inputs[0], task=task_name, training=True)
            loss = loss_func(inputs[1], outputs)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_loss(loss)
        train_acc(inputs[1], outputs)

    def dev_step(inputs):
        outputs = model(inputs[0], task=task_name, training=False)
        loss = loss_func(inputs[1], outputs)
        dev_loss(loss)
        dev_acc(inputs[1], outputs)
        predict_labels = tf.argmax(outputs, axis=-1)
        labels = inputs[1]
        dev_precision(metrics.precision_score(labels.numpy(), predict_labels.numpy(), average='macro'))
        dev_recall(metrics.recall_score(labels.numpy(), predict_labels.numpy(), average='macro'))
        dev_f1(metrics.f1_score(labels.numpy(), predict_labels.numpy(), average='macro'))

    for e in range(EPOCHS):
        log_obj.info(">>>>>>epoch=%s<<<<<<" % (e))
        train_loss.reset_states()
        train_acc.reset_states()
        dev_loss.reset_states()
        dev_acc.reset_states()
        dev_precision.reset_states()
        dev_recall.reset_states()
        dev_f1.reset_states()
        for _, batch_inputs in enumerate(train_ds):
            train_step(batch_inputs)
            if _ % 800 == 0:  # log every 800 training batches
                log_obj.info('train Batch={} loss={:.4f} acc={:.4f}'.format(
                    _, train_loss.result(), train_acc.result()))
        for _, batch_dev_inputs in enumerate(test_ds):
            dev_step(batch_dev_inputs)
            if _ % 300 == 0:  # log every 300 dev batches
                log_obj.info(
                    'dev Batch={} loss={:.4f} acc={:.4f} precision={:.4f} recall={:.4f} f1={:.4f}'
                    .format(_, dev_loss.result(), dev_acc.result(),
                            dev_precision.result(), dev_recall.result(), dev_f1.result()))
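# to_model_path is accepted by train_a_dataset() but not used in the excerpt
# above; a minimal sketch of persisting the fine-tuned weights afterwards,
# assuming the model is a tf.keras.Model subclass:
def save_if_requested(model, to_model_path):
    if to_model_path:
        model.save_weights(to_model_path)
        log_obj.info("saved weights to %s" % to_model_path)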
"data/tn_train.json", "data/tn_dev.json", extend_sample_map={'114': 3}) oce_train = utils.load_json_file("data/oce_train.json") oce_dev = utils.load_json_file("data/oce_dev.json") ocn_train = utils.load_json_file("data/ocn_train.json") ocn_dev = utils.load_json_file("data/ocn_dev.json") tn_train = utils.load_json_file("data/tn_train.json") tn_dev = utils.load_json_file("data/tn_dev.json") bert_dense_model = BertTrmHeadModel(bert_model_path, oce_cls_num=len(oce_label2index), ocn_cls_num=len(ocn_label2index), tn_cls_num=len(tn_label2index)) log_obj.info( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>oce<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<") train_ds, test_ds, trn, ten = train_func.make_dataset( oce_train, oce_dev, label2index=oce_label2index, tokenizer=bert_dense_model.tokenizer, cut_off=do_cut_samples) train_func.train_a_dataset(bert_dense_model, train_ds, test_ds, task_name="oce", EPOCHS=2, to_model_path=to_model_path) log_obj.info( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>tn<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
def convert_examples_to_features(
        examples,
        tokenizer,
        label_map,
        max_length=properties.MAX_LEN,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
        return_tfdataset=True):
    """
    Converts a list of ``InputExample`` into ``InputFeatures``.

    Args:
        examples: List of ``InputExample`` to convert.
        tokenizer: Tokenizer instance that will tokenize the examples.
        label_map: Dict mapping label strings to integer ids.
        max_length: Maximum sequence length; shorter sequences are padded to it.
        pad_token: Padding token id.
        pad_token_segment_id: Segment id for padding positions (usually 0, but
            can vary, e.g. 4 for XLNet).
        mask_padding_with_zero: If ``True``, the attention mask is ``1`` for real
            tokens and ``0`` for padding; if ``False``, the convention is inverted.
        return_tfdataset: If ``True``, return a ``tf.data.Dataset`` of
            (inputs dict, label) pairs; otherwise return the list of features.

    Returns:
        A ``tf.data.Dataset`` when ``return_tfdataset`` is ``True``, otherwise a
        list of ``InputFeatures`` which can be fed to the model.
    """
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 1000 == 0:
            log_obj.info("Writing example %d/%d" % (ex_index, len(examples)))
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       return_token_type_ids=True,
                                       truncation=True)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        # Unknown labels fall back to 0 (e.g. at inference time when no label is set).
        label = 0
        if example.label in label_map:
            label = label_map[example.label]

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if return_tfdataset:
        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )
    return features
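# Minimal usage sketch for convert_examples_to_features(), assuming a
# HuggingFace tokenizer is available locally; the texts and the label_map are
# made up for illustration.
def _demo_convert_examples():
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    examples = [InputExample("demo-0", "this movie is great", text_b=None, label="positive"),
                InputExample("demo-1", "terrible service", text_b=None, label="negative")]
    ds = convert_examples_to_features(examples, tokenizer, {"positive": 0, "negative": 1})
    for inputs, label in ds.batch(2).take(1):
        # each tensor is padded to properties.MAX_LEN
        print(inputs["input_ids"].shape, label.numpy())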