def process_feature(self, feature):
    """Write a paired (a/b) InputFeature to the TFRecordWriter as a tf.train.Example.

    Serializes the a-side and b-side input ids, masks and segment ids plus
    the scalar label id. `qas_id` and `class_ratio` are optional: features
    from some pipelines do not carry them, so they are added best-effort.
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids_a"] = tf_data_utils.create_int_feature(
        feature.input_ids_a)
    features["input_mask_a"] = tf_data_utils.create_int_feature(
        feature.input_mask_a)
    features["segment_ids_a"] = tf_data_utils.create_int_feature(
        feature.segment_ids_a)
    features["input_ids_b"] = tf_data_utils.create_int_feature(
        feature.input_ids_b)
    features["input_mask_b"] = tf_data_utils.create_int_feature(
        feature.input_mask_b)
    features["segment_ids_b"] = tf_data_utils.create_int_feature(
        feature.segment_ids_b)
    features["label_ids"] = tf_data_utils.create_int_feature(
        [feature.label_ids])
    # Narrowed from a bare `except:` — only skip the optional field when
    # the attribute is missing or not coercible, instead of hiding every
    # possible error.
    try:
        features["qas_id"] = tf_data_utils.create_int_feature(
            [feature.guid])
    except (AttributeError, TypeError, ValueError):
        pass
    try:
        features["class_ratio"] = tf_data_utils.create_float_feature(
            [feature.class_ratio])
    except (AttributeError, TypeError, ValueError):
        pass
    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))
    self._writer.write(tf_example.SerializeToString())
def _read_write(self, input_file, output_file, tokenizer, max_length=64,
                bos='<S>', eos='<T>', **kargs):
    """Tokenize each line of `input_file` and write padded id sequences
    to `output_file` as TFRecords.

    Args:
        input_file: text file; the first line is treated as a header and
            skipped, as are blank lines.
        output_file: destination TFRecord path.
        tokenizer: object providing `convert_tokens_to_ids` and `padding`.
        max_length: max content tokens before adding bos/eos (final
            sequence length is max_length + 2).
        bos, eos: boundary tokens wrapped around each sequence.
        **kargs: optional `token_mapping` dict of regex pattern ->
            replacement applied to the cleaned text before tokenization.
    """
    self._writer = tf.python_io.TFRecordWriter(output_file)
    with tf.gfile.Open(input_file, "r") as f:
        for i, line in enumerate(f):
            # Skip the header row and blank lines.
            if not line.strip() or i == 0:
                continue
            content = clean(line.strip())
            # Hoisted: look the mapping up once instead of per key.
            token_mapping = kargs.get('token_mapping', {})
            for key in token_mapping:
                content = re.sub(key, token_mapping.get(key, ""), content)
            word_seq = []
            for word in content.split():
                # Split tokens containing CJK characters into single chars.
                if CN_CHARACTER_REGEX.findall(word):
                    word_seq.extend(list(word))
                else:
                    word_seq.append(word)
            if len(word_seq) > max_length:
                word_seq = word_seq[0:max_length]
            word_seq = [bos] + word_seq + [eos]
            word_id_seq = tokenizer.convert_tokens_to_ids(
                word_seq, max_length + 2)
            seq_mask = [1] * len(word_id_seq)
            word_id_seq = tokenizer.padding(word_id_seq, max_length + 2, 0)
            seq_mask = tokenizer.padding(seq_mask, max_length + 2, 0)
            assert len(word_id_seq) == max_length + 2
            assert len(seq_mask) == max_length + 2
            features = collections.OrderedDict()
            features["input_ids"] = tf_data_utils.create_int_feature(
                word_id_seq)
            features["input_mask"] = tf_data_utils.create_int_feature(
                seq_mask)
            if i <= 30:
                tf.logging.info("*** Example ***")
                tf.logging.info("input_ids: %s" % " ".join(
                    [str(x) for x in word_id_seq]))
                tf.logging.info("input_ids_ori: %s" % " ".join(word_seq))
                # Bug fix: the original referenced an undefined name
                # `input_ids`, which would raise NameError here.
                tf.logging.info("input_ids_length: %s" % (len(word_id_seq)))
            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))
            self._writer.write(tf_example.SerializeToString())
    self._writer.close()
def process_feature(self, feature):
    """Write a masked-LM InputFeature to the TFRecordWriter as a tf.train.Example.

    Serializes input ids/mask/segments, the masked-LM positions/ids/weights
    and the label id. `qas_id` is optional and added best-effort. The record
    is built and written exactly once (the original duplicated the
    build/write in both branches of a try/except).
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids"] = tf_data_utils.create_int_feature(
        feature.input_ids)
    features["input_mask"] = tf_data_utils.create_int_feature(
        feature.input_mask)
    features["segment_ids"] = tf_data_utils.create_int_feature(
        feature.segment_ids)
    features["masked_lm_positions"] = tf_data_utils.create_int_feature(
        feature.masked_lm_positions)
    features["masked_lm_ids"] = tf_data_utils.create_int_feature(
        feature.masked_lm_ids)
    features["masked_lm_weights"] = tf_data_utils.create_float_feature(
        feature.masked_lm_weights)
    features["label_ids"] = tf_data_utils.create_int_feature(
        [feature.label_ids])
    # Narrowed from a bare `except:` — only skip qas_id when the guid is
    # absent or not coercible to an int feature.
    try:
        features["qas_id"] = tf_data_utils.create_int_feature(
            [feature.guid])
    except (AttributeError, TypeError, ValueError):
        pass
    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))
    self._writer.write(tf_example.SerializeToString())
def process_feature(self, feature):
    """Write a distillation-style InputFeature as a tf.train.Example.

    `input_ids_a` and `label_ids` are mandatory. Every other field is
    optional — different data pipelines populate different subsets of
    attributes — so they are added best-effort from a single table
    instead of eight copy-pasted try/except blocks.
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids_a"] = tf_data_utils.create_int_feature(
        feature.input_ids_a)
    features["label_ids"] = tf_data_utils.create_int_feature(
        [feature.label_ids])
    # (output key, feature constructor, lazy value accessor). Accessors
    # are lazy so a missing attribute raises inside the guarded loop.
    optional_fields = [
        ("input_char_ids_a", tf_data_utils.create_int_feature,
         lambda: feature.input_char_ids_a),
        ("input_ids_b", tf_data_utils.create_int_feature,
         lambda: feature.input_ids_b),
        ("input_char_ids_b", tf_data_utils.create_int_feature,
         lambda: feature.input_char_ids_b),
        ("label_probs", tf_data_utils.create_float_feature,
         lambda: feature.label_probs),
        ("label_ratio", tf_data_utils.create_float_feature,
         lambda: [feature.label_ratio]),
        ("distillation_ratio", tf_data_utils.create_float_feature,
         lambda: [feature.distillation_ratio]),
        ("distillation_feature", tf_data_utils.create_float_feature,
         lambda: feature.feature),
        ("adv_ids", tf_data_utils.create_int_feature,
         lambda: [feature.adv_ids]),
    ]
    for key, make_feature, get_value in optional_fields:
        # Narrowed from bare `except: s = 0` — skip only when the
        # attribute is missing or not coercible.
        try:
            features[key] = make_feature(get_value())
        except (AttributeError, TypeError, ValueError):
            pass
    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))
    self._writer.write(tf_example.SerializeToString())
def process_feature(self, feature, **kargs):
    """Write a distillation-style InputFeature as a tf.train.Example.

    Args:
        feature: the InputFeature to serialize. `input_ids_a` and
            `label_ids` are mandatory; the rest are best-effort.
        **kargs: `label_type` — "multi_class" (default) treats
            `label_ids` as a scalar id; any other value treats it as a
            sequence of label ids.
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids_a"] = tf_data_utils.create_int_feature(
        feature.input_ids_a)
    if kargs.get("label_type", "multi_class") == "multi_class":
        # Scalar class id — wrap in a list for the int-feature helper.
        features["label_ids"] = tf_data_utils.create_int_feature(
            [feature.label_ids])
    else:
        # Multi-label case: label_ids is already a sequence.
        features["label_ids"] = tf_data_utils.create_int_feature(
            feature.label_ids)
    # (output key, feature constructor, lazy value accessor); replaces
    # seven copy-pasted try/except blocks with bare excepts.
    optional_fields = [
        ("input_char_ids_a", tf_data_utils.create_int_feature,
         lambda: feature.input_char_ids_a),
        ("input_ids_b", tf_data_utils.create_int_feature,
         lambda: feature.input_ids_b),
        ("input_char_ids_b", tf_data_utils.create_int_feature,
         lambda: feature.input_char_ids_b),
        ("label_probs", tf_data_utils.create_float_feature,
         lambda: feature.label_probs),
        ("label_ratio", tf_data_utils.create_float_feature,
         lambda: [feature.label_ratio]),
        ("distillation_ratio", tf_data_utils.create_float_feature,
         lambda: [feature.distillation_ratio]),
        ("distillation_feature", tf_data_utils.create_float_feature,
         lambda: feature.feature),
    ]
    for key, make_feature, get_value in optional_fields:
        try:
            features[key] = make_feature(get_value())
        except (AttributeError, TypeError, ValueError):
            pass
    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))
    self._writer.write(tf_example.SerializeToString())
def process_feature(self, feature):
    """Write a multiple-choice InputFeature to the TFRecordWriter as a tf.train.Example.

    The selected answer index (`feature.choice`) is stored under
    "label_ids". `qas_id` is optional and added best-effort. The record
    is built and written exactly once (the original duplicated the
    build/write in both branches of a try/except).
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids"] = tf_data_utils.create_int_feature(
        feature.input_ids)
    features["input_mask"] = tf_data_utils.create_int_feature(
        feature.input_mask)
    features["segment_ids"] = tf_data_utils.create_int_feature(
        feature.segment_ids)
    features["label_ids"] = tf_data_utils.create_int_feature(
        [feature.choice])
    # Narrowed from a bare `except:` — only skip qas_id when unique_id is
    # absent or not coercible.
    try:
        features["qas_id"] = tf_data_utils.create_int_feature(
            [feature.unique_id])
    except (AttributeError, TypeError, ValueError):
        pass
    tf_example = tf.train.Example(
        features=tf.train.Features(feature=features))
    self._writer.write(tf_example.SerializeToString())
def process_feature(self, feature):
    """Serialize one reading-comprehension InputFeature and write it out.

    Always emits unique_ids, input_ids, input_mask and segment_ids; when
    the writer is in training mode it also emits the answer span's
    start/end positions.
    """
    self.num_features += 1
    record = collections.OrderedDict()
    record["unique_ids"] = tf_data_utils.create_int_feature(
        [feature.unique_id])
    # The three sequence fields share one conversion path.
    for field_name in ("input_ids", "input_mask", "segment_ids"):
        record[field_name] = tf_data_utils.create_int_feature(
            getattr(feature, field_name))
    if self.is_training:
        record["start_positions"] = tf_data_utils.create_int_feature(
            [feature.start_position])
        record["end_positions"] = tf_data_utils.create_int_feature(
            [feature.end_position])
    example = tf.train.Example(features=tf.train.Features(feature=record))
    self._writer.write(example.SerializeToString())
def main(_):
    """Run evaluation for a single- or multi-task model.

    In "estimator" mode this just launches the monitored estimator; in
    "sess" mode it additionally dumps per-example (label_id, feature,
    prob) triples from the result dict to a TFRecord file.
    """
    print(FLAGS)
    print(tf.__version__, "==tensorflow version==")

    # All paths are rooted at the bucket prefix.
    init_checkpoint = os.path.join(FLAGS.buckets, FLAGS.init_checkpoint)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    checkpoint_dir = os.path.join(FLAGS.buckets, FLAGS.model_output)
    print(init_checkpoint, train_file, dev_file, checkpoint_dir)

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=True)

    # Single-machine pseudo-cluster; this process acts as the evaluator.
    cluster = {'chief': ['localhost:2221'], 'worker': ['localhost:2222']}
    try:
        os.environ['TF_CONFIG'] = json.dumps({
            'cluster': cluster,
            'task': {
                'type': 'evaluator',
                'index': 0
            }
        })
    except (TypeError, ValueError):  # narrowed from bare except
        print("==not tf config env==")

    run_config = tf.estimator.RunConfig(keep_checkpoint_max=5,
                                        model_dir=checkpoint_dir,
                                        session_config=sess_config,
                                        save_checkpoints_secs=None,
                                        save_checkpoints_steps=None,
                                        log_step_count_steps=100)
    task_index = run_config.task_id
    is_chief = run_config.is_chief
    worker_count = 1
    print("==worker_count==", worker_count, "==local_rank==", task_index,
          "==is is_chief==", is_chief)
    target = ""

    if FLAGS.mode == "single_task":
        train_eval_api = train_eval
    elif FLAGS.mode == "multi_task":
        train_eval_api = multitask_train_eval

    # Both run types take the same keyword arguments; build them once
    # instead of duplicating the 19-argument call site.
    run_kwargs = dict(FLAGS=FLAGS,
                      worker_count=worker_count,
                      task_index=task_index,
                      cluster=cluster,
                      is_chief=is_chief,
                      target=target,
                      init_checkpoint=init_checkpoint,
                      train_file=train_file,
                      dev_file=dev_file,
                      checkpoint_dir=checkpoint_dir,
                      run_config=run_config,
                      profiler=FLAGS.profiler,
                      parse_type=FLAGS.parse_type,
                      rule_model=FLAGS.rule_model,
                      train_op=FLAGS.train_op,
                      running_type="eval",
                      input_target=FLAGS.input_target,
                      ues_token_type=FLAGS.ues_token_type,
                      attention_type=FLAGS.attention_type)

    if FLAGS.run_type == "estimator":
        train_eval_api.monitored_estimator(**run_kwargs)
    elif FLAGS.run_type == "sess":
        result_dict = train_eval_api.monitored_sess(**run_kwargs)
        result_log_file = os.path.join(checkpoint_dir, FLAGS.feature_output)
        print(result_log_file, "==result log path==")
        writer = tf.python_io.TFRecordWriter(result_log_file)
        try:
            for label_id, feature, prob in zip(result_dict["label_ids"],
                                               result_dict["feature"],
                                               result_dict["prob"]):
                features = {}
                features["label_id"] = tf_data_utils.create_int_feature(
                    [label_id])
                features["feature"] = tf_data_utils.create_float_feature(
                    feature)
                features["prob"] = tf_data_utils.create_float_feature(prob)
                tf_example = tf.train.Example(features=tf.train.Features(
                    feature=features))
                writer.write(tf_example.SerializeToString())
        except (KeyError, TypeError, ValueError):
            print("===not legal output for writer===")
        finally:
            # Always close so buffered records are flushed — the original
            # only closed the writer on full success, leaking it on error.
            writer.close()
def write2tfrecords():
    """Materialize the multi-task training generator into one TFRecord file.

    Reads the multi-task config, iterates the training generator for
    FLAGS.epoch epochs, and writes one tf.train.Example per generated item
    with per-task loss multipliers and label ids.
    """
    multi_task_config = Bunch(json.load(tf.gfile.Open(
        FLAGS.multi_task_config)))
    generator = create_generator(FLAGS, multi_task_config, "train",
                                 FLAGS.epoch)
    _writer = tf.python_io.TFRecordWriter(
        os.path.join(FLAGS.buckets, FLAGS.model_output))
    # lm_augumentation is read from the first listed task's config.
    problem_config = multi_task_config[FLAGS.multi_task_type.split(",")[0]]
    cnt = 0
    for idx, item in enumerate(tqdm(generator)):
        features = {}
        features["input_ids"] = tf_data_utils.create_int_feature(
            item["input_ids"])
        features["input_mask"] = tf_data_utils.create_int_feature(
            item["input_mask"])
        features["segment_ids"] = tf_data_utils.create_int_feature(
            item["segment_ids"])
        if problem_config["lm_augumentation"]:
            features["masked_lm_positions"] = tf_data_utils.create_int_feature(
                item["masked_lm_positions"])
            features["masked_lm_ids"] = tf_data_utils.create_int_feature(
                item["masked_lm_ids"])
            # NOTE(review): weights are written as an int feature here but
            # as a float feature by the masked-LM writer elsewhere in this
            # file — confirm which dtype the reading pipeline expects
            # before unifying; left unchanged to preserve the schema.
            features["masked_lm_weights"] = tf_data_utils.create_int_feature(
                item["masked_lm_weights"])
        for problem in FLAGS.multi_task_type.split(","):
            problem_dict = multi_task_config[problem]
            problem_type = multi_task_config[problem]["task_type"]
            features["{}_loss_multiplier".format(
                problem)] = tf_data_utils.create_int_feature(
                    [item["{}_loss_multiplier".format(problem)]])
            if problem_type in ['cls_task']:
                # Classification: scalar label id, wrapped in a list.
                features["{}_label_ids".format(
                    problem)] = tf_data_utils.create_int_feature(
                        [item["{}_label_ids".format(problem)]])
            elif problem_type in ['seq2seq_tag_task', 'seq2seq_text_task']:
                # Sequence labels are already a list.
                features["{}_label_ids".format(
                    problem)] = tf_data_utils.create_int_feature(
                        item["{}_label_ids".format(problem)])
        features["task_id"] = tf_data_utils.create_int_feature(
            [item["task_id"]])
        # guid is best-effort; the record is built and written exactly
        # once either way (the original duplicated the build/write in
        # both branches of a try/except with a bare except).
        try:
            features["guid"] = tf_data_utils.create_int_feature([idx])
        except (TypeError, ValueError):
            pass
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))
        _writer.write(tf_example.SerializeToString())
        cnt += 1
    # Close to flush buffered records — the original leaked the writer.
    _writer.close()
    print("==total sample==", cnt)
def process_feature(self, feature, task_type, task_type_dict):
    """Write a multi-task InputFeature to the TFRecordWriter as a tf.train.Example.

    For the feature's own task (`task_type`) the real labels and a loss
    multiplier of 1 are written, plus "task_id" = that task's index;
    every other task in `task_type_dict` gets zero labels and a loss
    multiplier of 0 so all records share a single schema.
    """
    self.num_features += 1
    features = collections.OrderedDict()
    features["input_ids"] = tf_data_utils.create_int_feature(
        feature.input_ids)
    features["input_mask"] = tf_data_utils.create_int_feature(
        feature.input_mask)
    features["segment_ids"] = tf_data_utils.create_int_feature(
        feature.segment_ids)
    for task_index, task in enumerate(task_type_dict):
        if task == task_type:
            features["{}_loss_multiplier".format(
                task)] = tf_data_utils.create_int_feature([1])
            if task_type_dict[task]["task_type"] == "cls_task":
                # Classification: scalar label id.
                features["{}_label_ids".format(
                    task)] = tf_data_utils.create_int_feature(
                        [feature.label_ids])
            elif task_type_dict[task]["task_type"] in ("seq2tag", "mrc"):
                # Sequence labels are already a list.
                features["{}_label_ids".format(
                    task)] = tf_data_utils.create_int_feature(
                        feature.label_ids)
            # Identify which task produced this record.
            features["task_id"] = tf_data_utils.create_int_feature(
                [task_index])
        else:
            # Inactive task: zeroed labels, multiplier 0.
            features["{}_loss_multiplier".format(
                task)] = tf_data_utils.create_int_feature([0])
            if task_type_dict[task]["task_type"] == "cls_task":
                features["{}_label_ids".format(
                    task)] = tf_data_utils.create_int_feature([0])
            elif task_type_dict[task]["task_type"] in ("seq2tag", "mrc"):
                features["{}_label_ids".format(
                    task)] = tf_data_utils.create_int_feature(
                        [0] * len(feature.label_ids))
    if self.num_features == 10:
        print(features.keys())
    # guid is best-effort; the record is built and written exactly once
    # either way (the original duplicated the build/write in both
    # branches of a try/except with a bare except).
    try:
        features["guid"] = tf_data_utils.create_int_feature([feature.guid])
    except (AttributeError, TypeError, ValueError):
        pass
    tf_example = tf.train.Example(features=tf.train.Features(
        feature=features))
    self._writer.write(tf_example.SerializeToString())