Example #1
def main(_):

    graph = tf.Graph()
    with graph.as_default():
        sess_config = tf.ConfigProto()
        import random
        name_to_features = {
                "input_ids":
                    tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_mask":
                    tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "segment_ids":
                    tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "masked_lm_positions":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_ids":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_weights":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.float32),
                "label_ids":
                    tf.FixedLenFeature([], tf.int64),
                }

        params = Bunch({})
        params.epoch = 1
        params.batch_size = FLAGS.batch_size
        def parse_folder(path):
            files = os.listdir(path)
            output = []
            for file_name in files:
                output.append(os.path.join(path, file_name))
            random.shuffle(output)
            return output
        print(params["batch_size"], "===batch size===")
        jd_test = parse_folder(FLAGS.train_result_file)
        input_fn = tf_data_utils.train_input_fn(jd_test, tf_data_utils._decode_record, name_to_features, params)
        
        sess = tf.Session(config=sess_config)
        
        init_op = tf.local_variables_initializer()
        sess.run(init_op)
        
        i = 0
        cnt = 0
        while True:
            try:
                features = sess.run(input_fn)
                i += 1
                cnt += 1
            except tf.errors.OutOfRangeError:
                print("End of dataset")
                break
        print(i*FLAGS.batch_size)
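Note: `tf_data_utils.train_input_fn` itself is not shown on this page. As a hedged sketch, assuming it follows the standard TF 1.x `tf.data` pattern implied by how Example #1 uses it (it returns feature tensors that `sess.run` iterates until `tf.errors.OutOfRangeError`), a compatible implementation might look like this; the body below is an assumption, not the library's actual code:

import tensorflow as tf

def train_input_fn(input_files, decode_fn, name_to_features, params):
    # Build a dataset over one or more TFRecord files.
    dataset = tf.data.TFRecordDataset(input_files)
    # Parse each serialized tf.Example with the supplied decoder.
    dataset = dataset.map(lambda record: decode_fn(record, name_to_features))
    dataset = dataset.shuffle(buffer_size=1024)
    dataset = dataset.repeat(params.epoch)
    dataset = dataset.batch(params.batch_size)
    # TF 1.x graph mode: return the next-element tensors; running them in a
    # session raises tf.errors.OutOfRangeError once the data is exhausted.
    return dataset.make_one_shot_iterator().get_next()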
Example #2
    # NOTE: the source page begins this snippet mid-function; the header below
    # matches the identical decoders in the other examples on this page.
    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        for name in ["input_ids", "input_mask", "segment_ids"]:
            example[name] = tf.reshape(example[name], [-1, max_seq_length])
        return example

    params = Bunch({})
    params.epoch = FLAGS.epoch
    params.batch_size = FLAGS.batch_size
    train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
                                                  _decode_record,
                                                  name_to_features, params)
    eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file, _decode_record,
                                                name_to_features, params)

    [train_op, train_loss, train_per_example_loss,
     train_logits] = model_train_fn(train_features, [],
                                    tf.estimator.ModeKeys.TRAIN)
    [_, eval_loss, eval_per_example_loss,
     eval_logits] = model_eval_fn(eval_features, [],
                                  tf.estimator.ModeKeys.EVAL)
    result = metric_fn(eval_features, eval_logits, eval_loss)

    model_io_fn.set_saver()

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
Example #3
def train_eval_fn(FLAGS, init_checkpoint, train_file, dev_file, checkpoint_dir,
                  **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = model_config_parser(FLAGS)

        train_size = int(FLAGS.train_size)
        init_lr = FLAGS.init_lr

        distillation_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        if FLAGS.use_tpu:
            warmup_ratio = config.get('warmup', 0.1)

            num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)

            num_warmup_steps = int(num_train_steps * warmup_ratio)

            print('==num warmup steps==', num_warmup_steps)

            print(" model type {}".format(FLAGS.model_type))

            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')
            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("***** train steps : %d", num_train_steps)
            max_eval_steps = int(int(FLAGS.eval_size) / FLAGS.batch_size)

            clip_norm_scale = 1.0
            lr_scale = 1.0
            lr = init_lr

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "use_tpu": 1
            })

        else:
            warmup_ratio = config.get('warmup', 0.1)
            worker_count = kargs.get('worker_count', 1)
            task_index = kargs.get('task_index', 0)
            is_chief = kargs.get('is_chief', 0)

            if FLAGS.if_shard == "0":
                train_size = FLAGS.train_size
                epoch = int(FLAGS.epoch / worker_count)
            elif FLAGS.if_shard == "1":
                print("==number of gpus==", kargs.get('num_gpus', 1))
                train_size = int(FLAGS.train_size / worker_count /
                                 kargs.get('num_gpus', 1))
                # train_size = int(FLAGS.train_size)
                epoch = FLAGS.epoch
            else:
                train_size = int(FLAGS.train_size / worker_count)
                epoch = FLAGS.epoch

            num_train_steps = int(train_size / FLAGS.batch_size * epoch)
            if config.get('ln_type', 'postln') == 'postln':
                num_warmup_steps = int(num_train_steps * warmup_ratio)
            elif config.get('ln_type', 'postln') == 'preln':
                # pre-layer-norm models are typically stable without warmup
                num_warmup_steps = 0
            else:
                num_warmup_steps = int(num_train_steps * warmup_ratio)
            print('==num warmup steps==', num_warmup_steps)

            num_storage_steps = min(int(train_size / FLAGS.batch_size), 10000)
            if num_storage_steps <= 100:
                num_storage_steps = 500

            num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

            print(
                "num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
                format(num_train_steps, num_eval_steps, num_storage_steps))

            print(" model type {}".format(FLAGS.model_type))

            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')

            if worker_count * kargs.get("num_gpus", 1) >= 2:
                clip_norm_scale = 1.0
                lr_scale = 0.8
            else:
                clip_norm_scale = 1.0
                lr_scale = 1.0
            lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
            if lr >= 1e-3:
                lr = 1e-3
            print('==init lr==', lr)
            if FLAGS.opt_type == "hvd" and hvd:
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type == "all_reduce":
                checkpoint_dir = checkpoint_dir
            elif FLAGS.opt_type == "collective_reduce":
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
                checkpoint_dir = checkpoint_dir if task_index == 0 else None

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "worker_count": worker_count,
                "gpu_count": worker_count * kargs.get("num_gpus", 1),
                "opt_type": FLAGS.opt_type,
                "is_chief": is_chief,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "epoch": FLAGS.epoch,
                "strategy": FLAGS.distribution_strategy,
                "use_tpu": 0
            })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            model_config_dict[task_type] = model_config_parser(
                Bunch(distillation_config[task_type]))
            print(task_type, distillation_config[task_type],
                  '=====task model config======')
            num_labels_dict[task_type] = distillation_config[task_type][
                "num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets,
                distillation_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = distillation_config[task_type][
                "load_pretrained"]
            exclude_scope_dict[task_type] = distillation_config[task_type][
                "exclude_scope"]
            not_storage_params_dict[task_type] = distillation_config[
                task_type]["not_storage_params"]
            target_dict[task_type] = distillation_config[task_type]["target"]

        tf.logging.info("***** use tpu ***** %s", str(FLAGS.use_tpu))
        model_fn = classifier_model_fn_builder(
            model_config_dict,
            num_labels_dict,
            init_checkpoint_dict,
            load_pretrained_dict,
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            use_tpu=FLAGS.use_tpu,
            **kargs)

        if FLAGS.use_tpu:
            from data_generator import tf_data_utils
            estimator = tf.contrib.tpu.TPUEstimator(
                use_tpu=True,
                model_fn=model_fn,
                config=kargs.get('run_config', {}),
                train_batch_size=FLAGS.batch_size,
                eval_batch_size=FLAGS.batch_size)
            tf.logging.info("****** do train ******* %s", str(FLAGS.do_train))
            if FLAGS.do_train:
                tf.logging.info("***** Running training *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                input_features = tf_data_utils.electra_input_fn_builder(
                    train_file,
                    FLAGS.max_length,
                    FLAGS.max_predictions_per_seq,
                    True,
                    num_cpu_threads=4)
                estimator.train(input_fn=input_features,
                                max_steps=num_train_steps)
            else:
                tf.logging.info("***** Running evaluation *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                eval_input_fn = tf_data_utils.electra_input_fn_builder(
                    input_files=dev_file,
                    max_seq_length=FLAGS.max_length,
                    max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                    is_training=False)
                tf.logging.info("***** Begining Running evaluation *****")
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=max_eval_steps)
                output_eval_file = os.path.join(checkpoint_dir,
                                                "eval_results.txt")
                with tf.gfile.GFile(output_eval_file, "w") as writer:
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
        else:
            from data_generator import distributed_tf_data_utils as tf_data_utils
            name_to_features = {
                "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_ori_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "masked_lm_positions": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_ids": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_weights": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.float32),
                "next_sentence_labels": tf.FixedLenFeature([], tf.int64),
            }

            def _decode_record(record, name_to_features):
                """Decodes a record to a TensorFlow example.
				"""
                example = tf.parse_single_example(record, name_to_features)

                # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
                # So cast all int64 to int32.
                for name in list(example.keys()):
                    t = example[name]
                    if t.dtype == tf.int64:
                        t = tf.to_int32(t)
                    example[name] = t

                return example

            def _decode_batch_record(record, name_to_features):
                example = tf.parse_example(record, name_to_features)
                # for name in list(example.keys()):
                # 	t = example[name]
                # 	if t.dtype == tf.int64:
                # 		t = tf.to_int32(t)
                # 	example[name] = t

                return example

            params = Bunch({})
            params.epoch = FLAGS.epoch
            params.batch_size = FLAGS.batch_size

            if kargs.get("run_config", None):
                if kargs.get("parse_type", "parse_single") == "parse_single":
                    train_features = lambda: tf_data_utils.all_reduce_train_input_fn(
                        train_file,
                        _decode_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_input_fn(
                        dev_file,
                        _decode_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                elif kargs.get("parse_type", "parse_single") == "parse_batch":
                    print("==apply parse example==")
                    train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
                        train_file,
                        _decode_batch_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
                        dev_file,
                        _decode_batch_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)

            else:
                train_features = lambda: tf_data_utils.train_input_fn(
                    train_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

                eval_features = lambda: tf_data_utils.eval_input_fn(
                    dev_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

            train_hooks = []
            eval_hooks = []

            sess_config = tf.ConfigProto(allow_soft_placement=False,
                                         log_device_placement=False)
            if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
                print("==no need for hook==")
            elif FLAGS.opt_type == "pai_soar" and pai:
                print("no need for hook")
            elif FLAGS.opt_type == "hvd" and hvd:
                sess_config.gpu_options.allow_growth = True
                sess_config.gpu_options.visible_device_list = str(
                    hvd.local_rank())
                print("==no need fo hook==")
            else:
                print("==no need for hooks==")

            if kargs.get("run_config", None):
                run_config = kargs.get("run_config", None)
                run_config = run_config.replace(
                    save_checkpoints_steps=num_storage_steps)
                print("==run config==", run_config.save_checkpoints_steps)
            else:
                run_config = tf.estimator.RunConfig(
                    model_dir=checkpoint_dir,
                    save_checkpoints_steps=num_storage_steps,
                    session_config=sess_config)

            if kargs.get("profiler", "profiler") == "profiler":
                if checkpoint_dir:
                    hooks = tf.train.ProfilerHook(
                        save_steps=100,
                        save_secs=None,
                        output_dir=os.path.join(checkpoint_dir, "profiler"),
                    )
                    train_hooks.append(hooks)
                    print("==add profiler hooks==")

            model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                     model_dir=checkpoint_dir,
                                                     config=run_config)

            train_begin_time = time.time()
            tf.logging.info("==training distribution_strategy=={}".format(
                kargs.get("distribution_strategy", "MirroredStrategy")))
            if kargs.get("distribution_strategy",
                         "MirroredStrategy") == "MirroredStrategy":
                print("==apply single machine multi-card training==")

                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)

                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)

                model_estimator.train(input_fn=train_features,
                                      max_steps=num_train_steps,
                                      hooks=train_hooks)
                # tf.estimator.train(model_estimator, train_spec)

                train_end_time = time.time()
                print("==training time==", train_end_time - train_begin_time)
                tf.logging.info("==training time=={}".format(train_end_time -
                                                             train_begin_time))
                eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                        steps=num_eval_steps)
                print(eval_results)

            elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                    "ParameterServerStrategy", "CollectiveAllReduceStrategy"
            ]:
                print("==apply multi-machine machine multi-card training==")
                try:
                    print(os.environ['TF_CONFIG'], "==tf_run_config==")
                except KeyError:
                    print("==no TF_CONFIG in environment==")
                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)

                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)

                # tf.estimator.train(model_estimator, train_spec) # tf 1.12 doesn't need evaluate

                tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                                eval_spec)
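To make the step bookkeeping in `train_eval_fn` concrete, here is the same arithmetic with hypothetical flag values (the numbers are illustrative, not from the source):

train_size = 100000   # hypothetical FLAGS.train_size
batch_size = 32       # hypothetical FLAGS.batch_size
epoch = 3             # hypothetical FLAGS.epoch
warmup_ratio = 0.1

num_train_steps = int(train_size / batch_size * epoch)        # 9375
num_warmup_steps = int(num_train_steps * warmup_ratio)        # 937
num_storage_steps = min(int(train_size / batch_size), 10000)  # 3125
# 3125 > 100, so the floor of 500 storage steps does not kick in here.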
Example #4
    # NOTE: the source page begins this snippet mid-function; the header below
    # matches the identical decoders in the other examples on this page.
    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        for name in ["input_ids", "input_mask", "segment_ids"]:
            example[name] = tf.reshape(example[name], [-1, max_seq_length])
        return example

    params = Bunch({})
    params.epoch = 5
    params.batch_size = 6
    train_features = tf_data_utils.train_input_fn(
        "/data/xuht/concat/data/train.tfrecords", _decode_record,
        name_to_features, params)
    eval_features = tf_data_utils.eval_input_fn(
        "/data/xuht/concat/data/test.tfrecords", _decode_record,
        name_to_features, params)

    [train_op, train_loss, train_per_example_loss,
     train_logits] = model_train_fn(train_features, [],
                                    tf.estimator.ModeKeys.TRAIN)
    [_, eval_loss, eval_per_example_loss,
     eval_logits] = model_eval_fn(eval_features, [],
                                  tf.estimator.ModeKeys.EVAL)
    result = metric_fn(eval_features, eval_logits, eval_loss)

    model_io_fn.set_saver()
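`Bunch` is imported from elsewhere and never defined on this page. The examples rely on it accepting a dict and supporting both attribute access (`params.epoch = 5`) and key access (`params["batch_size"]`). A minimal stand-in with that behavior, for readers who want to run the snippets:

class Bunch(dict):
    """A dict whose keys are also readable/writable as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

params = Bunch({})
params.epoch = 5              # attribute style, as used in the examples
params.batch_size = 6
assert params["epoch"] == 5   # key style works on the same object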
Example #5
def main(_):
	graph = tf.Graph()
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	with graph.as_default():
		import json

		os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
		sess = tf.Session()

		config = json.load(open(FLAGS.config_file, "r"))

		student_config = Bunch(config)
		student_config.use_one_hot_embeddings = True
		student_config.scope = "student/bert"
		student_config.dropout_prob = 0.1
		student_config.label_type = "single_label"
		student_config.init_checkpoint = FLAGS.student_init_checkpoint

		temperature = 2.0  # 1.0
		distill_ratio = 0.7
		true_label_ratio = 0.3

		student_config.temperature = temperature
		student_config.distill_ratio = distill_ratio
		student_config.num_hidden_layers = 6

		json.dump(student_config, open(FLAGS.model_output+"/student_config.json", "w"))

		teacher_config = Bunch(config)
		teacher_config.use_one_hot_embeddings = True
		teacher_config.scope = "teacher/bert"
		teacher_config.dropout_prob = 0.1
		teacher_config.label_type = "single_label"
		teacher_config.init_checkpoint = FLAGS.teacher_init_checkpoint

		json.dump(teacher_config, open(FLAGS.model_output+"/teacher_config.json", "w"))

		model_config_dict = {"student":student_config, "teacher":teacher_config}
		init_checkpoint_dict = {"student":FLAGS.student_init_checkpoint,
							   "teacher":FLAGS.teacher_init_checkpoint}

		num_train_steps = int(
			FLAGS.train_size / FLAGS.batch_size * FLAGS.epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":1e-5, 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps})

		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)
		
		num_choice = FLAGS.num_classes
		max_seq_length = FLAGS.max_length

		model_train_fn = distillation.distillation_model_fn(
			model_config_dict=model_config_dict,
			num_labels=num_choice,
			init_checkpoint_dict=init_checkpoint_dict,
			model_reuse=None,
			load_pretrained={"teacher":True, "student":True},
			model_io_fn=model_io_fn,
			model_io_config=model_io_config,
			opt_config=opt_config,
			student_input_name=["a", "b"],
			teacher_input_name=["a", "b"],
			unlabel_input_name=["ua", "ub"],
			temperature=temperature,
			exclude_scope_dict={"student":"student", "teacher":"teacher"},
			not_storage_params=["adam_m", "adam_v"],
			distillation_weight={"label":distill_ratio, 
								"unlabel":distill_ratio,
								"true_label":true_label_ratio},
			if_distill_unlabeled=False
		)

		model_eval_fn = distillation.distillation_model_fn(
			model_config_dict=model_config_dict,
			num_labels=num_choice,
			init_checkpoint_dict=init_checkpoint_dict,
			model_reuse=True,
			load_pretrained={"teacher":True, "student":True},
			model_io_fn=model_io_fn,
			model_io_config=model_io_config,
			opt_config=opt_config,
			student_input_name=["a", "b"],
			teacher_input_name=["a", "b"],
			unlabel_input_name=["ua", "ub"],
			temperature=temperature,
			exclude_scope_dict={"student":"student", "teacher":"teacher"},
			not_storage_params=["adam_m", "adam_v"],
			distillation_weight={"label":distill_ratio, 
								"unlabel":distill_ratio,
								"true_label":true_label_ratio},
			if_distill_unlabeled=False
		)

		def metric_fn(features, logits, loss):
			print(logits.get_shape(), "===logits shape===")
			pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
			prob = tf.nn.softmax(logits)
			correct = tf.equal(
				tf.cast(pred_label, tf.int32),
				tf.cast(features["label_ids"], tf.int32)
			)
			accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
			return {"accuracy":accuracy, "loss":loss, "pred_label":pred_label, "label_ids":features["label_ids"]}
		
		name_to_features = {
				"input_ids_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_ids_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"label_ids":
						tf.FixedLenFeature([], tf.int64),
		}
		
		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			return example

		params = Bunch({})
		params.epoch = FLAGS.epoch
		params.batch_size = FLAGS.batch_size
		# train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
		#                             _decode_record, name_to_features, params)
		# eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
		#                             _decode_record, name_to_features, params)

		train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
									_decode_record, name_to_features, params)
		eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
									_decode_record, name_to_features, params)

		[train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)
		[_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
		result = metric_fn(eval_features, eval_logits, eval_loss)
		
		init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
		sess.run(init_op)
		
		def eval_fn(result):
			i = 0
			total_accuracy = 0
			label, label_id = [], []
			while True:
				try:
					eval_result = sess.run(result)
					total_accuracy += eval_result["accuracy"]
					label_id.extend(eval_result["label_ids"])
					label.extend(eval_result["pred_label"])
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break
			f1 = f1_score(label_id, label, average="macro")
			accuracy = accuracy_score(label_id, label)
			print("test accuracy accuracy {} {} f1 {}".format(total_accuracy/i, 
				accuracy, f1))
			return total_accuracy/ i, f1
		
		def train_fn(op, loss):
			i = 0
			cnt = 0
			total_loss = 0.0
			while True:
				try:
					[_, train_loss] = sess.run([op, loss])
					total_loss += train_loss
					i += 1
					cnt += 1
					if np.mod(i, num_storage_steps) == 0:
						print(total_loss/cnt)
						# model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc_{}.ckpt".format(int(i/8000)))
						model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc_{}.ckpt".format(int(i/num_storage_steps)))
						
						print("==successful storing model=={}".format(int(i/num_storage_steps)))
						total_loss = 0
						cnt = 0
				except tf.errors.OutOfRangeError:
					break
		print("===========begin to train============")        
		train_fn(train_op, train_loss)
		print("===========begin to eval============")
		accuracy, f1 = eval_fn(result)
		print("==accuracy {} f1 {}==".format(accuracy, f1))
		# model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc.ckpt")

		model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc.ckpt")
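Example #5 configures `distillation.distillation_model_fn` with `temperature`, `distill_ratio`, and `true_label_ratio`, but that function's body is not shown here. A common way those knobs combine is the standard temperature-scaled distillation loss; the sketch below shows that formulation under the assumption that the repository does something similar, not as its actual implementation:

import tensorflow as tf

def distillation_loss(student_logits, teacher_logits, label_ids,
                      temperature=2.0, distill_ratio=0.7,
                      true_label_ratio=0.3):
    # Soft targets from the teacher, softened by the temperature.
    teacher_probs = tf.nn.softmax(teacher_logits / temperature)
    soft_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.stop_gradient(teacher_probs),
            logits=student_logits / temperature))
    # The usual T^2 factor keeps gradient magnitudes comparable across T.
    soft_loss *= temperature ** 2
    # Hard loss against the true labels.
    hard_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label_ids, logits=student_logits))
    return distill_ratio * soft_loss + true_label_ratio * hard_loss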
Example #6
def main(_):

	hvd.init()

	sess_config = tf.ConfigProto()
	sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

	graph = tf.Graph()
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	with graph.as_default():
		import json
		
		# config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
		
		config = json.load(open(FLAGS.config_file, "r"))

		init_checkpoint = FLAGS.init_checkpoint
		print("===init checkoutpoint==={}".format(init_checkpoint))

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "bert"
		config.dropout_prob = 0.1
		config.label_type = "single_label"
		config.lm_ratio = 0.0
		config.task_ratio = 1.0

		json.dump(config, open(FLAGS.model_output+"/config.json", "w"))

		init_lr = 1e-5

		if FLAGS.if_shard == "0":
			train_size = FLAGS.train_size
			epoch = int(FLAGS.epoch / hvd.size())
		elif FLAGS.if_shard == "1":
			train_size = int(FLAGS.train_size/hvd.size())
			epoch = FLAGS.epoch

		sess = tf.Session(config=sess_config)

		num_train_steps = int(
			train_size / FLAGS.batch_size * epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(train_size / FLAGS.batch_size)

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":init_lr/(hvd.size()), 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps})

		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)

		optimizer_fn = optimizer.Optimizer(opt_config)
		
		num_choice = FLAGS.num_classes
		max_seq_length = FLAGS.max_length
		max_predictions_per_seq = FLAGS.max_predictions_per_seq

		model_train_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=None, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)


		model_eval_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=True, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)
		
		name_to_features = {
				"input_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"masked_lm_positions":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_ids":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_weights":
					tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
				"label_ids":
					tf.FixedLenFeature([], tf.int64),
				}

		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			return example 

		params = Bunch({})
		params.epoch = epoch
		params.batch_size = FLAGS.batch_size

		def parse_folder(path):
			files = os.listdir(path)
			output = []
			for file_name in files:
				output.append(os.path.join(path, file_name))
			random.shuffle(output)
			return output

		train_features = tf_data_utils.train_input_fn(
									parse_folder(FLAGS.train_file),
									_decode_record, name_to_features, params)
		train_dict = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)

		eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
		eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)

		model_io_fn.set_saver()
		
		init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
		sess.run(init_op)

		sess.run(hvd.broadcast_global_variables(0))
		
		def eval_fn(op_dict):
			i = 0
			eval_total_dict = {}
			while True:
				try:
					eval_result = sess.run(op_dict)
					for key in eval_result:
						if key in ["probabilities", "label_ids"]:
							eval_total_dict.setdefault(key, []).extend(
								eval_result[key])
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break

			for key in eval_result:
				if key not in ["probabilities", "label_ids"]:
					eval_total_dict[key] = eval_result[key]

			label_id = eval_total_dict["label_ids"]
			label = np.argmax(np.array(eval_total_dict["probabilities"]), axis=-1)

			macro_f1 = f1_score(label_id, label, average="macro")
			micro_f1 = f1_score(label_id, label, average="micro")
			accuracy = accuracy_score(label_id, label)

			print("test accuracy {} macro_f1 score {} micro_f1 {} masked_lm_accuracy {} sentence_f {}".format(accuracy, 
																		macro_f1,  micro_f1, 
																		eval_total_dict["masked_lm_accuracy"],
																		eval_total_dict["sentence_f"]))
			return eval_total_dict

		def run_eval(steps):
			import _pickle as pkl
			eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
			eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
			sess.run(tf.local_variables_initializer())
			eval_final_dict = eval_fn(eval_dict)
			if hvd.rank() == 0:
				pkl.dump(eval_final_dict, open(FLAGS.model_output+"/eval_dict_{}.pkl".format(steps), "wb"))
			return eval_final_dict
		
		def train_fn(op_dict):
			i = 0
			cnt = 0
			loss_dict = {}
			monitoring_train = []
			monitoring_eval = []
			while True:
				try:
					train_result = sess.run(op_dict)
					for key in train_result:
						if key == "train_op":
							continue
						else:
							if np.isnan(train_result[key]):
								print(key, "got nan loss")
								break
							else:
								if key in loss_dict:
									loss_dict[key] += train_result[key]
								else:
									loss_dict[key] = train_result[key]
					
					i += 1
					cnt += 1
					
					if np.mod(i, num_storage_steps) == 0:
						string = ""
						for key in loss_dict:
							tmp = key + " " + str(loss_dict[key]/cnt) + "\t"
							string += tmp
						print(string)
						monitoring_train.append(loss_dict)

						eval_final_dict = run_eval(int(i/num_storage_steps))
						monitoring_eval.append(eval_final_dict)

						for key in loss_dict:
							loss_dict[key] = 0.0
						if hvd.rank() == 0:
							model_io_fn.save_model(sess, FLAGS.model_output+"/model_{}.ckpt".format(int(i/num_storage_steps)))
							print("==successful storing model=={}".format(int(i/num_storage_steps)))
						cnt = 0

				except tf.errors.OutOfRangeError:
					if hvd.rank() == 0:
						import _pickle as pkl
						pkl.dump({"train":monitoring_train,
							"eval":monitoring_eval}, open(FLAGS.model_output+"/monitoring.pkl", "wb"))

					break
		print("===========begin to train============")        
		train_fn(train_dict)
		if hvd.rank() == 0:
			model_io_fn.save_model(sess, FLAGS.model_output+"/model.ckpt")
			print("===========begin to eval============")
			eval_finial_dict = run_eval("final")