def main(_):
    graph = tf.Graph()
    with graph.as_default():
        import random
        sess_config = tf.ConfigProto()

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "masked_lm_positions": tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
            "masked_lm_ids": tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
            "masked_lm_weights": tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.float32),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        params = Bunch({})
        params.epoch = 1
        params.batch_size = FLAGS.batch_size

        def parse_folder(path):
            """Lists all files under `path` and returns them in shuffled order."""
            files = os.listdir(path)
            output = [os.path.join(path, file_name) for file_name in files]
            random.shuffle(output)
            return output

        print(params["batch_size"], "===batch size===")

        jd_test = parse_folder(FLAGS.train_result_file)
        input_fn = tf_data_utils.train_input_fn(
            jd_test, tf_data_utils._decode_record, name_to_features, params)

        sess = tf.Session(config=sess_config)
        init_op = tf.group(tf.local_variables_initializer())
        sess.run(init_op)

        # Drain the input pipeline once to count how many batches it yields.
        i = 0
        cnt = 0
        while True:
            try:
                features = sess.run(input_fn)
                i += 1
                cnt += 1
            except tf.errors.OutOfRangeError:
                print("End of dataset")
                break
        print(i * FLAGS.batch_size)
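# Hypothetical entry point for the record-counting script above; the original
# launcher is not shown here, but TF 1.x scripts of this form are normally run
# via tf.app.run(), which parses FLAGS and then calls main:
if __name__ == "__main__":
    tf.app.run(main)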
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 features to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t
    for name in ["input_ids", "input_mask", "segment_ids"]:
        example[name] = tf.reshape(example[name], [-1, max_seq_length])
    return example

params = Bunch({})
params.epoch = FLAGS.epoch
params.batch_size = FLAGS.batch_size

train_features = tf_data_utils.train_input_fn(
    FLAGS.train_file, _decode_record, name_to_features, params)
eval_features = tf_data_utils.eval_input_fn(
    FLAGS.dev_file, _decode_record, name_to_features, params)

[train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(
    train_features, [], tf.estimator.ModeKeys.TRAIN)
[_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(
    eval_features, [], tf.estimator.ModeKeys.EVAL)
result = metric_fn(eval_features, eval_logits, eval_loss)

model_io_fn.set_saver()

init_op = tf.group(tf.global_variables_initializer(),
                   tf.local_variables_initializer())
def train_eval_fn(FLAGS, init_checkpoint, train_file, dev_file, checkpoint_dir,
                  **kargs):
    graph = tf.Graph()
    with graph.as_default():
        import json

        config = model_config_parser(FLAGS)
        train_size = int(FLAGS.train_size)
        init_lr = FLAGS.init_lr

        distillation_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        if FLAGS.use_tpu:
            warmup_ratio = config.get('warmup', 0.1)
            num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
            num_warmup_steps = int(num_train_steps * warmup_ratio)
            print('==num warmup steps==', num_warmup_steps)
            print(" model type {}".format(FLAGS.model_type))
            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')
            tf.logging.info("***** train steps : %d", num_train_steps)
            max_eval_steps = int(int(FLAGS.eval_size) / FLAGS.batch_size)

            clip_norm_scale = 1.0
            lr_scale = 1.0
            lr = init_lr

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "use_tpu": 1
            })
        else:
            warmup_ratio = config.get('warmup', 0.1)
            worker_count = kargs.get('worker_count', 1)
            task_index = kargs.get('task_index', 0)
            is_chief = kargs.get('is_chief', 0)

            if FLAGS.if_shard == "0":
                train_size = FLAGS.train_size
                epoch = int(FLAGS.epoch / worker_count)
            elif FLAGS.if_shard == "1":
                print("==number of gpus==", kargs.get('num_gpus', 1))
                train_size = int(FLAGS.train_size / worker_count /
                                 kargs.get('num_gpus', 1))
                epoch = FLAGS.epoch
            else:
                train_size = int(FLAGS.train_size / worker_count)
                epoch = FLAGS.epoch

            num_train_steps = int(train_size / FLAGS.batch_size * epoch)
            # Pre-LN models are stable without warmup; post-LN models keep
            # the standard warmup schedule.
            if config.get('ln_type', 'postln') == 'preln':
                num_warmup_steps = 0
            else:
                num_warmup_steps = int(num_train_steps * warmup_ratio)
            print('==num warmup steps==', num_warmup_steps)

            num_storage_steps = min([int(train_size / FLAGS.batch_size), 10000])
            if num_storage_steps <= 100:
                num_storage_steps = 500

            num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

            print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}"
                  .format(num_train_steps, num_eval_steps, num_storage_steps))
            print(" model type {}".format(FLAGS.model_type))
            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')

            if worker_count * kargs.get("num_gpus", 1) >= 2:
                clip_norm_scale = 1.0
                lr_scale = 0.8
            else:
                clip_norm_scale = 1.0
                lr_scale = 1.0
            lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
            if lr >= 1e-3:
                lr = 1e-3
            print('==init lr==', lr)

            # Only the chief (task 0) writes checkpoints, except for
            # all-reduce where every worker keeps its own copy.
            if FLAGS.opt_type == "hvd" and hvd:
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type == "all_reduce":
                checkpoint_dir = checkpoint_dir
            elif FLAGS.opt_type == "collective_reduce":
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type in ["ps", "ps_sync"]:
                checkpoint_dir = checkpoint_dir if task_index == 0 else None

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "worker_count": worker_count,
                "gpu_count": worker_count * kargs.get("num_gpus", 1),
                "opt_type": FLAGS.opt_type,
                "is_chief": is_chief,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "epoch": FLAGS.epoch,
                "strategy": FLAGS.distribution_strategy,
                "use_tpu": 0
            })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        # Per-task configuration for multi-task training, keyed by task type.
        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            task_config = distillation_config[task_type]
            model_config_dict[task_type] = model_config_parser(Bunch(task_config))
            print(task_type, task_config, '=====task model config======')
            num_labels_dict[task_type] = task_config["num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets, task_config["init_checkpoint"])
            load_pretrained_dict[task_type] = task_config["load_pretrained"]
            exclude_scope_dict[task_type] = task_config["exclude_scope"]
            not_storage_params_dict[task_type] = task_config["not_storage_params"]
            target_dict[task_type] = task_config["target"]

        tf.logging.info("***** use tpu ***** %s", str(FLAGS.use_tpu))
        model_fn = classifier_model_fn_builder(
            model_config_dict,
            num_labels_dict,
            init_checkpoint_dict,
            load_pretrained_dict,
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            use_tpu=FLAGS.use_tpu,
            **kargs)

        if FLAGS.use_tpu:
            from data_generator import tf_data_utils

            estimator = tf.contrib.tpu.TPUEstimator(
                use_tpu=True,
                model_fn=model_fn,
                config=kargs.get('run_config', {}),
                train_batch_size=FLAGS.batch_size,
                eval_batch_size=FLAGS.batch_size)

            tf.logging.info("****** do train ******* %s", str(FLAGS.do_train))
            if FLAGS.do_train:
                tf.logging.info("***** Running training *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                input_features = tf_data_utils.electra_input_fn_builder(
                    train_file,
                    FLAGS.max_length,
                    FLAGS.max_predictions_per_seq,
                    True,
                    num_cpu_threads=4)
                estimator.train(input_fn=input_features,
                                max_steps=num_train_steps)
            else:
                tf.logging.info("***** Running evaluation *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                eval_input_fn = tf_data_utils.electra_input_fn_builder(
                    input_files=dev_file,
                    max_seq_length=FLAGS.max_length,
                    max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                    is_training=False)
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=max_eval_steps)
                output_eval_file = os.path.join(checkpoint_dir,
                                                "eval_results.txt")
                with tf.gfile.GFile(output_eval_file, "w") as writer:
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
        else:
            from data_generator import distributed_tf_data_utils as tf_data_utils

            name_to_features = {
                "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_ori_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "masked_lm_positions": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_ids": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_weights": tf.FixedLenFeature(
                    [FLAGS.max_predictions_per_seq], tf.float32),
                "next_sentence_labels": tf.FixedLenFeature([], tf.int64),
            }

            def _decode_record(record, name_to_features):
                """Decodes a single record to a TensorFlow example."""
                example = tf.parse_single_example(record, name_to_features)
                # tf.Example only supports tf.int64, but the TPU only supports
                # tf.int32, so cast all int64 features to int32.
                for name in list(example.keys()):
                    t = example[name]
                    if t.dtype == tf.int64:
                        t = tf.to_int32(t)
                    example[name] = t
                return example

            def _decode_batch_record(record, name_to_features):
                """Decodes a whole batch of records; int64 features are kept as-is."""
                example = tf.parse_example(record, name_to_features)
                return example

            params = Bunch({})
            params.epoch = FLAGS.epoch
            params.batch_size = FLAGS.batch_size

            if kargs.get("run_config", None):
                if kargs.get("parse_type", "parse_single") == "parse_single":
                    train_features = lambda: tf_data_utils.all_reduce_train_input_fn(
                        train_file, _decode_record, name_to_features, params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_input_fn(
                        dev_file, _decode_record, name_to_features, params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                elif kargs.get("parse_type", "parse_single") == "parse_batch":
                    print("==apply parse example==")
                    train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
                        train_file, _decode_batch_record, name_to_features, params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
                        dev_file, _decode_batch_record, name_to_features, params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
            else:
                train_features = lambda: tf_data_utils.train_input_fn(
                    train_file, _decode_record, name_to_features, params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)
                eval_features = lambda: tf_data_utils.eval_input_fn(
                    dev_file, _decode_record, name_to_features, params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

            train_hooks = []
            eval_hooks = []

            sess_config = tf.ConfigProto(allow_soft_placement=False,
                                         log_device_placement=False)
            if FLAGS.opt_type in ["ps", "ps_sync"]:
                print("==no need for hook==")
            elif FLAGS.opt_type == "pai_soar" and pai:
                print("==no need for hook==")
            elif FLAGS.opt_type == "hvd" and hvd:
                sess_config.gpu_options.allow_growth = True
                sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
                print("==no need for hook==")
            else:
                print("==no need for hooks==")

            if kargs.get("run_config", None):
                run_config = kargs.get("run_config", None)
                run_config = run_config.replace(
                    save_checkpoints_steps=num_storage_steps)
                print("==run config==", run_config.save_checkpoints_steps)
            else:
                run_config = tf.estimator.RunConfig(
                    model_dir=checkpoint_dir,
                    save_checkpoints_steps=num_storage_steps,
                    session_config=sess_config)

            if kargs.get("profiler", "profiler") == "profiler":
                if checkpoint_dir:
                    hooks = tf.train.ProfilerHook(
                        save_steps=100,
                        save_secs=None,
                        output_dir=os.path.join(checkpoint_dir, "profiler"))
                    train_hooks.append(hooks)
                    print("==add profiler hooks==")

            model_estimator = tf.estimator.Estimator(
                model_fn=model_fn,
                model_dir=checkpoint_dir,
                config=run_config)

            train_begin_time = time.time()
            tf.logging.info("==training distribution_strategy=={}".format(
                kargs.get("distribution_strategy", "MirroredStrategy")))

            if kargs.get("distribution_strategy",
                         "MirroredStrategy") == "MirroredStrategy":
                print("==apply single-machine multi-card training==")
                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)
                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)
                model_estimator.train(input_fn=train_features,
                                      max_steps=num_train_steps,
                                      hooks=train_hooks)
                train_end_time = time.time()
                print("==training time==", train_end_time - train_begin_time)
                tf.logging.info("==training time=={}".format(
                    train_end_time - train_begin_time))
                eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                        steps=num_eval_steps)
                print(eval_results)
            elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                    "ParameterServerStrategy", "CollectiveAllReduceStrategy"]:
                print("==apply multi-machine multi-card training==")
                try:
                    print(os.environ['TF_CONFIG'], "==tf_run_config==")
                except KeyError:
                    print("==no TF_CONFIG set==")
                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)
                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)
                tf.estimator.train_and_evaluate(model_estimator,
                                                train_spec, eval_spec)
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 features to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t
    for name in ["input_ids", "input_mask", "segment_ids"]:
        example[name] = tf.reshape(example[name], [-1, max_seq_length])
    return example

params = Bunch({})
params.epoch = 5
params.batch_size = 6

train_features = tf_data_utils.train_input_fn(
    "/data/xuht/concat/data/train.tfrecords",
    _decode_record, name_to_features, params)
eval_features = tf_data_utils.eval_input_fn(
    "/data/xuht/concat/data/test.tfrecords",
    _decode_record, name_to_features, params)

[train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(
    train_features, [], tf.estimator.ModeKeys.TRAIN)
[_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(
    eval_features, [], tf.estimator.ModeKeys.EVAL)
result = metric_fn(eval_features, eval_logits, eval_loss)

model_io_fn.set_saver()
def main(_):
    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session()

        config = json.load(open(FLAGS.config_file, "r"))

        student_config = Bunch(config)
        student_config.use_one_hot_embeddings = True
        student_config.scope = "student/bert"
        student_config.dropout_prob = 0.1
        student_config.label_type = "single_label"
        student_config.init_checkpoint = FLAGS.student_init_checkpoint

        temperature = 2.0  # previously 1.0
        distill_ratio = 0.7
        true_label_ratio = 0.3

        student_config.temperature = temperature
        student_config.distill_ratio = distill_ratio
        student_config.num_hidden_layers = 6

        json.dump(student_config,
                  open(FLAGS.model_output + "/student_config.json", "w"))

        teacher_config = Bunch(config)
        teacher_config.use_one_hot_embeddings = True
        teacher_config.scope = "teacher/bert"
        teacher_config.dropout_prob = 0.1
        teacher_config.label_type = "single_label"
        teacher_config.init_checkpoint = FLAGS.teacher_init_checkpoint

        json.dump(teacher_config,
                  open(FLAGS.model_output + "/teacher_config.json", "w"))

        model_config_dict = {"student": student_config,
                             "teacher": teacher_config}
        init_checkpoint_dict = {"student": FLAGS.student_init_checkpoint,
                                "teacher": FLAGS.teacher_init_checkpoint}

        num_train_steps = int(FLAGS.train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({"init_lr": 1e-5,
                            "num_train_steps": num_train_steps,
                            "num_warmup_steps": num_warmup_steps})
        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        model_train_fn = distillation.distillation_model_fn(
            model_config_dict=model_config_dict,
            num_labels=num_choice,
            init_checkpoint_dict=init_checkpoint_dict,
            model_reuse=None,
            load_pretrained={"teacher": True, "student": True},
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            student_input_name=["a", "b"],
            teacher_input_name=["a", "b"],
            unlabel_input_name=["ua", "ub"],
            temperature=temperature,
            exclude_scope_dict={"student": "student", "teacher": "teacher"},
            not_storage_params=["adam_m", "adam_v"],
            distillation_weight={"label": distill_ratio,
                                 "unlabel": distill_ratio,
                                 "true_label": true_label_ratio},
            if_distill_unlabeled=False)

        model_eval_fn = distillation.distillation_model_fn(
            model_config_dict=model_config_dict,
            num_labels=num_choice,
            init_checkpoint_dict=init_checkpoint_dict,
            model_reuse=True,
            load_pretrained={"teacher": True, "student": True},
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            student_input_name=["a", "b"],
            teacher_input_name=["a", "b"],
            unlabel_input_name=["ua", "ub"],
            temperature=temperature,
            exclude_scope_dict={"student": "student", "teacher": "teacher"},
            not_storage_params=["adam_m", "adam_v"],
            distillation_weight={"label": distill_ratio,
                                 "unlabel": distill_ratio,
                                 "true_label": true_label_ratio},
            if_distill_unlabeled=False)

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {"accuracy": accuracy,
                    "loss": loss,
                    "pred_label": pred_label,
                    "label_ids": features["label_ids"]}

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 features to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = tf_data_utils.train_input_fn(
            FLAGS.train_file, _decode_record, name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(
            FLAGS.dev_file, _decode_record, name_to_features, params)

        [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(
            train_features, [], tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(
            eval_features, [], tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        def eval_fn(result):
            """Drains the eval pipeline and computes accuracy / macro F1."""
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy {} accuracy {} f1 {}".format(
                total_accuracy / i, accuracy, f1))
            return total_accuracy / i, f1

        def train_fn(op, loss):
            """Runs the train op until the input pipeline is exhausted,
            checkpointing every num_storage_steps steps."""
            i = 0
            cnt = 0
            total_loss = 0.0
            while True:
                try:
                    [_, train_loss] = sess.run([op, loss])
                    total_loss += train_loss
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        print(total_loss / cnt)
                        model_io_fn.save_model(
                            sess,
                            FLAGS.model_output + "/oqmrc_{}.ckpt".format(
                                int(i / num_storage_steps)))
                        print("==successfully stored model=={}".format(
                            int(i / num_storage_steps)))
                        total_loss = 0
                        cnt = 0
                except tf.errors.OutOfRangeError:
                    break

        print("===========begin to train============")
        train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
        model_io_fn.save_model(sess, FLAGS.model_output + "/oqmrc.ckpt")
def main(_):
    hvd.init()

    sess_config = tf.ConfigProto()
    # Pin each Horovod process to its own local GPU.
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json
        config = json.load(open(FLAGS.config_file, "r"))

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkpoint==={}".format(init_checkpoint))

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.lm_ratio = 0.0
        config.task_ratio = 1.0

        json.dump(config, open(FLAGS.model_output + "/config.json", "w"))

        init_lr = 1e-5

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / hvd.size())
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / hvd.size())
            epoch = FLAGS.epoch

        sess = tf.Session(config=sess_config)

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)
        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({"init_lr": init_lr / hvd.size(),
                            "num_train_steps": num_train_steps,
                            "num_warmup_steps": num_warmup_steps})
        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)
        optimizer_fn = optimizer.Optimizer(opt_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length
        max_predictions_per_seq = FLAGS.max_predictions_per_seq

        model_train_fn = classifier_fn.classifier_model_fn_builder(
            config, num_choice, init_checkpoint,
            reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            optimizer_fn=optimizer_fn,
            model_io_config=model_io_config,
            opt_config=opt_config)

        model_eval_fn = classifier_fn.classifier_model_fn_builder(
            config, num_choice, init_checkpoint,
            reuse=True,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            optimizer_fn=optimizer_fn,
            model_io_config=model_io_config,
            opt_config=opt_config)

        name_to_features = {
            "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32, so cast all int64 features to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = epoch
        params.batch_size = FLAGS.batch_size

        def parse_folder(path):
            """Lists all files under `path` and returns them in shuffled order."""
            files = os.listdir(path)
            output = []
            for file_name in files:
                output.append(os.path.join(path, file_name))
            random.shuffle(output)
            return output

        train_features = tf_data_utils.train_input_fn(
            parse_folder(FLAGS.train_file),
            _decode_record, name_to_features, params)
        train_dict = model_train_fn(train_features, [],
                                    tf.estimator.ModeKeys.TRAIN)

        eval_features = tf_data_utils.eval_input_fn(
            parse_folder(FLAGS.dev_file),
            _decode_record, name_to_features, params)
        eval_dict = model_eval_fn(eval_features, [],
                                  tf.estimator.ModeKeys.EVAL)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        # Make sure every worker starts from the same weights.
        sess.run(hvd.broadcast_global_variables(0))

        def eval_fn(op_dict):
            """Drains the eval pipeline, accumulating predictions and labels."""
            i = 0
            eval_total_dict = {}
            while True:
                try:
                    eval_result = sess.run(op_dict)
                    for key in eval_result:
                        if key in ["probabilities", "label_ids"]:
                            if key in eval_total_dict:
                                eval_total_dict[key].extend(eval_result[key])
                            else:
                                eval_total_dict[key] = list(eval_result[key])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            # Scalar metrics keep their value from the last batch.
            for key in eval_result:
                if key not in ["probabilities", "label_ids"]:
                    eval_total_dict[key] = eval_result[key]

            label_id = eval_total_dict["label_ids"]
            label = np.argmax(np.array(eval_total_dict["probabilities"]), axis=-1)
            macro_f1 = f1_score(label_id, label, average="macro")
            micro_f1 = f1_score(label_id, label, average="micro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy {} macro_f1 {} micro_f1 {} "
                  "masked_lm_accuracy {} sentence_f {}".format(
                      accuracy, macro_f1, micro_f1,
                      eval_total_dict["masked_lm_accuracy"],
                      eval_total_dict["sentence_f"]))
            return eval_total_dict

        def run_eval(steps):
            import _pickle as pkl
            eval_features = tf_data_utils.eval_input_fn(
                parse_folder(FLAGS.dev_file),
                _decode_record, name_to_features, params)
            eval_dict = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
            sess.run(tf.local_variables_initializer())
            eval_final_dict = eval_fn(eval_dict)
            if hvd.rank() == 0:
                pkl.dump(eval_final_dict,
                         open(FLAGS.model_output + "/eval_dict_{}.pkl".format(steps), "wb"))
            return eval_final_dict

        def train_fn(op_dict):
            i = 0
            cnt = 0
            loss_dict = {}
            monitoring_train = []
            monitoring_eval = []
            while True:
                try:
                    train_result = sess.run(op_dict)
                    for key in train_result:
                        if key == "train_op":
                            continue
                        if np.isnan(train_result[key]):
                            print(train_result[key], "get nan loss")
                            break
                        if key in loss_dict:
                            loss_dict[key] += train_result[key]
                        else:
                            loss_dict[key] = train_result[key]
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        string = ""
                        for key in loss_dict:
                            string += key + " " + str(loss_dict[key] / cnt) + "\t"
                        print(string)
                        monitoring_train.append(loss_dict)

                        eval_final_dict = run_eval(int(i / num_storage_steps))
                        monitoring_eval.append(eval_final_dict)

                        for key in loss_dict:
                            loss_dict[key] = 0.0
                        if hvd.rank() == 0:
                            model_io_fn.save_model(
                                sess,
                                FLAGS.model_output + "/model_{}.ckpt".format(
                                    int(i / num_storage_steps)))
                            print("==successfully stored model=={}".format(
                                int(i / num_storage_steps)))
                        cnt = 0
                except tf.errors.OutOfRangeError:
                    if hvd.rank() == 0:
                        import _pickle as pkl
                        pkl.dump({"train": monitoring_train,
                                  "eval": monitoring_eval},
                                 open(FLAGS.model_output + "/monitoring.pkl", "wb"))
                    break

        print("===========begin to train============")
        train_fn(train_dict)
        if hvd.rank() == 0:
            model_io_fn.save_model(sess, FLAGS.model_output + "/model.ckpt")
        print("===========begin to eval============")
        eval_final_dict = run_eval("final")
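# This script is typically launched with Horovod's launcher, one process per
# GPU (the file name and flag values below are illustrative):
#
#   horovodrun -np 4 python train_classifier_hvd.py \
#       --config_file=... --init_checkpoint=... --train_file=... \
#       --dev_file=... --if_shard=1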