Example #1
def make_dataset(sample_train,
                 sample_dev,
                 label2index,
                 tokenizer,
                 cut_off=False):
    log_obj.info(">>>>>>>>>>>>>样本数目=%s %s<<<<<<<<<<<<<<" %
                 (len(sample_train), len(sample_dev)))
    sample_train = list(
        filter(lambda x: x["label"] in label2index, sample_train))  # true的留下
    sample_dev = list(filter(lambda x: x["label"] in label2index,
                             sample_dev))  # true的留下
    log_obj.info(">>>>>>>>>>>>>过滤后样本数目=%s %s<<<<<<<<<<" %
                 (len(sample_train), len(sample_dev)))
    train_examples = _make_examples(sample_train)
    test_examples = _make_examples(sample_dev)
    if cut_off:
        train_examples = train_examples[:properties.BATCH_SIZE]
        test_examples = test_examples[:properties.EVAL_BATCH_SIZE]

    # label-to-index conversion happens here
    train_ds = tokenize_utils.convert_examples_to_features(
        train_examples, tokenizer, label2index)
    test_ds = tokenize_utils.convert_examples_to_features(
        test_examples, tokenizer, label2index)
    train_ds = train_ds.shuffle(2 * properties.BATCH_SIZE).batch(
        properties.BATCH_SIZE)  # adding .repeat(-1) here would repeat the dataset indefinitely
    test_ds = test_ds.batch(properties.EVAL_BATCH_SIZE)
    return train_ds, test_ds, len(train_examples), len(test_examples)
Example #2
def process(path, train_to_path, dev_to_path, extend_sample_map):
    """
    :param extend_sample_map:  过采样比例
    """
    # 划分训练与验证集
    model0_sample = utils.load_json_file(path)
    sample_dic = {}
    for i in model0_sample:
        if i["label"] in sample_dic:
            sample_dic[i["label"]].append(i)
        else:
            sample_dic[i["label"]] = [i]

    train_examples = []
    test_examples = []
    for l, lis in sample_dic.items():
        a, b = train_test_split(lis, test_size=properties.test_dev_size)
        train_examples.extend(a)
        test_examples.extend(b)
    print("train vs dev=", len(train_examples), len(test_examples))

    # oversample (or downsample) the training set per label
    train_dic = {}
    for i in train_examples:
        if i["label"] in train_dic:
            train_dic[i["label"]].append(i)
        else:
            train_dic[i["label"]] = [i]

    for k, v in train_dic.items():
        print(k, len(v))
    for label, ratio in extend_sample_map.items():
        if ratio <= 1:
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            train_dic[label] = tmp_lis[:int(len(tmp_lis) * ratio)]
        else:
            tmp_lis = copy.deepcopy(train_dic[label])
            tmp_lis = shuffle_list(tmp_lis)
            for j in range(math.ceil(ratio)):
                train_dic[label].extend(tmp_lis)
            train_dic[label] = train_dic[label][:int(len(tmp_lis) * ratio)]
    print("重新采样后")
    for k, v in train_dic.items():
        print(k, len(v))
    train_examples = []
    for l, lis in train_dic.items():
        train_examples.extend(lis)

    train_examples = shuffle_list(train_examples)
    test_examples = shuffle_list(test_examples)
    log_obj.info("划分 训练集 : 验证集 = %s : %s" %
                 (len(train_examples), len(test_examples)))

    utils.dump_json_file(train_to_path, train_examples)
    utils.dump_json_file(dev_to_path, test_examples)
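
A minimal usage sketch for ``process``; the source file path below is an assumption, while the output paths and the oversampling map mirror the truncated call shown in Example #7:

# Hypothetical usage sketch (the input path "data/tn_all.json" is assumed, not from the source):
process("data/tn_all.json",            # all labeled samples for the tn task
        "data/tn_train.json",          # where the training split is written
        "data/tn_dev.json",            # where the dev split is written
        extend_sample_map={'114': 3})  # oversample label '114' roughly three-fold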
Example #3
def predict(self, input_lis, task_name):
    ds = self.make_dataset(input_lis)
    if ds is None:
        return []
    y_predict = []
    for batch_idx, inputs in enumerate(ds):
        if batch_idx % 100 == 0:
            log_obj.info("batch %s", batch_idx)
        outputs = self.model(inputs[0], task_name, training=False)
        res = tf.math.argmax(outputs, axis=-1)
        y_predict.extend(res.numpy().tolist())
    assert len(y_predict) == len(input_lis), "len not match"
    return y_predict
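
For reference, a sketch of turning the predicted class indices back into label strings; ``classifier`` (an instance of the class this method belongs to) and the reuse of ``tn_label2index`` are assumptions for illustration:

# Hypothetical usage: run prediction and map class indices back to label strings.
samples = utils.load_json_file("data/tn_dev.json")
pred_indices = classifier.predict(samples, task_name="tn")  # classifier is an assumed instance
index2label = {v: k for k, v in tn_label2index.items()}     # invert the label->index map
pred_labels = [index2label[i] for i in pred_indices]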
Example #4
def _make_examples(sample):
    all_labels = [x["label"] for x in sample]
    all_labels = Counter(all_labels)
    log_obj.info("label分类统计%s" % (all_labels))
    examples = []
    for _, x in enumerate(sample):
        text_a, text_b = utils.trunc_seq_pair(x["text_a"],
                                              x.get("text_b", None),
                                              max_len=properties.MAX_LEN)
        examples.append(
            InputExample(x.get("id", None),
                         text_a,
                         text_b=text_b,
                         label=x.get("label", None)))
    return examples
Example #5
def inspect_data(path):
    sample = utils.load_json_file(path)
    key_fq = Counter([i["label"] for i in sample])
    sorted_key_fq = sorted(key_fq.items(), key=lambda x: x[1], reverse=True)
    log_obj.info("样本总数" + str(len(sample)))
    log_obj.info("类别总数" + str(len(key_fq)))
    log_obj.info(key_fq)
    log_obj.info(sorted_key_fq)
    return sorted_key_fq
Example #6
def train_a_dataset(model,
                    train_ds,
                    test_ds,
                    task_name,
                    EPOCHS=10,
                    to_model_path=""):
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy("train_acc")
    dev_loss = tf.keras.metrics.Mean(name='dev_loss')
    dev_acc = tf.keras.metrics.SparseCategoricalAccuracy("dev_acc")
    dev_recall = tf.keras.metrics.Mean(name='dev_recall')
    dev_precision = tf.keras.metrics.Mean(name='dev_precision')
    dev_f1 = tf.keras.metrics.Mean(name='dev_f1')

    @tf.function
    def train_step(inputs):
        with tf.GradientTape() as tape:
            outputs = model(
                inputs[0], task=task_name,
                training=True)  # inputs[0]: dict of three input tensors, inputs[1]: labels; outputs are logits
            loss = loss_func(inputs[1], outputs)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)
        train_acc(inputs[1], outputs)

    def dev_step(inputs):
        outputs = model(inputs[0], task=task_name, training=False)
        loss = loss_func(inputs[1], outputs)

        dev_loss(loss)
        dev_acc(inputs[1], outputs)

        predict_labels = tf.argmax(outputs, axis=-1)
        labels = inputs[1]
        dev_precision(
            metrics.precision_score(labels.numpy(),
                                    predict_labels.numpy(),
                                    average='macro'))
        dev_recall(
            metrics.recall_score(labels.numpy(),
                                 predict_labels.numpy(),
                                 average='macro'))
        dev_f1(
            metrics.f1_score(labels.numpy(),
                             predict_labels.numpy(),
                             average='macro'))

    for e in range(EPOCHS):
        log_obj.info(">>>>>>epoch=%s<<<<<<" % (e))
        train_loss.reset_states()
        train_acc.reset_states()

        dev_loss.reset_states()
        dev_acc.reset_states()
        dev_precision.reset_states()
        dev_recall.reset_states()
        dev_f1.reset_states()

        for batch_idx, batch_inputs in enumerate(train_ds):
            train_step(batch_inputs)
            if batch_idx % 800 == 0:  # log every 800 batches
                log_obj.info('train Batch={} loss={:.4f} acc={:.4f}'.format(
                    batch_idx, train_loss.result(), train_acc.result()))

        for batch_idx, batch_dev_inputs in enumerate(test_ds):
            dev_step(batch_dev_inputs)
            if batch_idx % 300 == 0:  # log every 300 batches
                log_obj.info(
                    'dev Batch={} loss={:.4f} acc={:.4f} precision={:.4f} recall={:.4f} f1={:.4f}'
                    .format(batch_idx, dev_loss.result(), dev_acc.result(),
                            dev_precision.result(), dev_recall.result(),
                            dev_f1.result()))
Example #7
                             "data/tn_train.json",
                             "data/tn_dev.json",
                             extend_sample_map={'114': 3})

oce_train = utils.load_json_file("data/oce_train.json")
oce_dev = utils.load_json_file("data/oce_dev.json")
ocn_train = utils.load_json_file("data/ocn_train.json")
ocn_dev = utils.load_json_file("data/ocn_dev.json")
tn_train = utils.load_json_file("data/tn_train.json")
tn_dev = utils.load_json_file("data/tn_dev.json")

bert_dense_model = BertTrmHeadModel(bert_model_path,
                                    oce_cls_num=len(oce_label2index),
                                    ocn_cls_num=len(ocn_label2index),
                                    tn_cls_num=len(tn_label2index))
log_obj.info(
    ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>oce<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
train_ds, test_ds, trn, ten = train_func.make_dataset(
    oce_train,
    oce_dev,
    label2index=oce_label2index,
    tokenizer=bert_dense_model.tokenizer,
    cut_off=do_cut_samples)
train_func.train_a_dataset(bert_dense_model,
                           train_ds,
                           test_ds,
                           task_name="oce",
                           EPOCHS=2,
                           to_model_path=to_model_path)

log_obj.info(
    ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>tn<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
Example #8
def convert_examples_to_features(
        examples,
        tokenizer,
        label_map,
        max_length=properties.MAX_LEN,
        #     pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
        return_tfdataset=True
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """

    features = []
    for (ex_index, example) in enumerate(examples):

        if ex_index % 1000 == 0:
            log_obj.info("Writing example %d/%d" % (ex_index, len(examples)))

        inputs = tokenizer.encode_plus(
            example.text_a, example.text_b,
            add_special_tokens=True, max_length=max_length,
            return_token_type_ids=True, truncation=True
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)

        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length
        )
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length
        )

        label = 0
        if example.label in label_map:
            label = label_map[example.label]

        # if ex_index < 3:
        #     print("*** Example ***")
        #     print("guid: %s" % (example.guid))
        #     print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        #     print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
        #     print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
        #     print("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
            )
        )

    if return_tfdataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
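
For reference, a minimal sketch of consuming the returned dataset directly; the batch size here is an arbitrary assumption, and Example #1 shows the shuffling/batching actually used for training:

# Hypothetical usage: build the feature dataset and inspect the shape of one batch.
ds = convert_examples_to_features(examples, tokenizer, label_map)
for inputs, labels in ds.batch(8).take(1):
    print(inputs["input_ids"].shape)       # (8, max_length)
    print(inputs["attention_mask"].shape)  # (8, max_length)
    print(labels.shape)                    # (8,)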