def get_scores():
    tf.compat.v1.logging.set_verbosity(tf.logging.INFO)
    tokenization.validate_case_matches_checkpoint(LOWER_CASE, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=LOWER_CASE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

    inputs = read_examples(INPUT_FILE)
    features, all_tokens = convert_examples_to_features(inputs, SEQ_LEN, tokenizer)
    input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids = \
        features_to_vectors(features)
    print(masked_lm_ids)

    tf.reset_default_graph()
    sess = tf.Session()
    model = Model(bert_config)
    sess.run(tf.global_variables_initializer())

    losses = sess.run(model.masked_lm_example_loss, feed_dict={
        model.input_ids: input_ids,
        model.input_mask: input_mask,
        model.token_type: segment_ids,
        model.masked_lm_positions: masked_lm_positions,
        model.masked_lm_ids: masked_lm_ids
    })
    parse_result(losses, all_tokens)

def validate_flags_or_throw(bert_config):
    """Validate the input FLAGS or throw an exception."""
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if FLAGS.do_train:
        if not FLAGS.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if FLAGS.do_predict:
        if not FLAGS.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if FLAGS.max_seq_length <= FLAGS.max_query_length + 3:
        raise ValueError(
            "The max_seq_length (%d) must be greater than max_query_length "
            "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length))

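# Note on the last check above: the "+ 3" accounts for the [CLS] token and the
# two [SEP] tokens in the SQuAD-style packing "[CLS] query [SEP] document [SEP]".
# A minimal sketch of a call site for this validator, assuming the usual
# bert_config_file flag; this is an illustration, not the original main():
def _example_main(_):
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)  # fail fast before any heavy setup
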
def load_models(self):
    """
    Load and initialize saved BERT model.

    :return: Tensorflow Estimator with saved weights.
    """
    bert_config = modeling.BertConfig.from_json_file(self.config)
    tokenization.validate_case_matches_checkpoint(True, self.init_ckpt)
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab,
                                                do_lower_case=True)

    run_config = tf.estimator.RunConfig(
        model_dir=self.out_dir,
        save_summary_steps=0,
        keep_checkpoint_max=1,
        save_checkpoints_steps=0)

    model_fn = ut.model_fn_builder(
        bert_config=bert_config,
        num_labels=self.n_classes,
        init_checkpoint=self.init_ckpt,
        learning_rate=0,
        num_train_steps=0,
        num_warmup_steps=0,
        use_tpu=False,
        use_one_hot_embeddings=False)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": self.batch_size})
    return estimator

def chinese_tokenizer():
    BERT_INIT_CHKPNT = './chinese_L-12_H-768_A-12/bert_model.ckpt'
    BERT_VOCAB = './chinese_L-12_H-768_A-12/vocab.txt'
    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=True)
    return tokenizer

def create_tokenizer_from_hub_module(hp):
    """Create a word-piece tokenizer from the vocab/checkpoint referenced by `hp`.

    :return: tokenization.FullTokenizer
    """
    tokenization.validate_case_matches_checkpoint(True, hp.BERT_INIT_CHKPNT)
    return tokenization.FullTokenizer(vocab_file=hp.BERT_VOCAB, do_lower_case=True)

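# The two helpers above both return a plain bert-tensorflow FullTokenizer.
# A minimal usage sketch; the local checkpoint directory below is an assumption,
# not taken from the original sources:
from bert import tokenization

BERT_DIR = './uncased_L-12_H-768_A-12'  # hypothetical local BERT directory
tokenization.validate_case_matches_checkpoint(True, BERT_DIR + '/bert_model.ckpt')
tokenizer = tokenization.FullTokenizer(vocab_file=BERT_DIR + '/vocab.txt',
                                       do_lower_case=True)

tokens = tokenizer.tokenize("BERT word-piece tokenization example")
ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
print(tokens)
print(ids)
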
def __init__(self):
    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    self.tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                                do_lower_case=True)
    self.max_seq_length = MAX_SEQ_LENGTH
    self.train_data_path = train_data_path
    self.train_tf_record_path = train_tf_record_path
    self.eval_tf_record_path = eval_tf_record_path
    self.train_val_ratio = TRAIN_VAL_RATIO
    self.train_examples = None
    self.eval_examples = None

def __init__(self, model, bert_config_file, vocab_file, init_checkpoint,
             batch_size=32, max_seq_length=128, do_lower_case=False,
             finetune_embedding=False, split_args=False, is_training=False,
             truncation_mode="normal", padding_action='normal', scope=None):
    self.model = model
    self.is_mask_attentional_model = self.model.startswith("mask")
    self.bert_config_file = bert_config_file
    self.vocab_file = vocab_file
    self.init_checkpoint = init_checkpoint
    self.batch_size = batch_size
    self.max_seq_length = max_seq_length
    self.max_arg_length = int(max_seq_length / 2)
    self.do_lower_case = do_lower_case
    self.split_args = split_args
    self.finetune_embedding = finetune_embedding
    self.truncation_mode = truncation_mode
    self.padding_action = padding_action

    # Word-Piece tokenizer
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    # load bert
    tokenization.validate_case_matches_checkpoint(self.do_lower_case,
                                                  self.init_checkpoint)
    self.bert_config = copy.deepcopy(
        modeling.BertConfig.from_json_file(self.bert_config_file))
    self.is_training = is_training
    if not self.is_training:
        self.bert_config.hidden_dropout_prob = 0.0
        self.bert_config.attention_probs_dropout_prob = 0.0

    self._embedding_table = None
    self._vocab = tokenization.load_vocab(self.vocab_file)

    # max_position_embeddings == 512
    if self.max_seq_length > self.bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (self.max_seq_length, self.bert_config.max_position_embeddings))

    self.build()

def read_config(self):
    # tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "multilabel": MultilabelClassfier,
    }
    tokenization.validate_case_matches_checkpoint(self.do_lower_case,
                                                  self.init_checkpoint)
    # if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    #     raise ValueError(
    #         "At least one of `do_train`, `do_eval` or `do_predict' must be True.")
    self.bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)
    if self.max_seq_length > self.bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (self.max_seq_length, self.bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(self.output_dir)

    task_name = self.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    self.processor = processors[task_name]()
    self.label_list = self.processor.get_labels(self.data_dir)

    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=self.vocab_file, do_lower_case=self.do_lower_case)

    tpu_cluster_resolver = None
    if self.use_tpu and self.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            self.tpu_name, zone=self.tpu_zone, project=self.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    self.run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=self.master,
        model_dir=self.output_dir,
        save_checkpoints_steps=self.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=self.iterations_per_loop,
            num_shards=self.num_tpu_cores,
            per_host_input_for_training=is_per_host))

def get_scores(): tf.compat.v1.logging.set_verbosity(tf.logging.INFO) tokenization.validate_case_matches_checkpoint(LOWER_CASE, BERT_INIT_CHKPNT) tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=LOWER_CASE) bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG) # for v in tf.train.list_variables(BERT_INIT_CHKPNT): # print(v) tf.reset_default_graph() sess = tf.Session() model = Model(bert_config) tvars = tf.trainable_variables() print(len(tvars)) (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( tvars, BERT_INIT_CHKPNT) tf.train.init_from_checkpoint(BERT_INIT_CHKPNT, assignment_map) sess.run(tf.global_variables_initializer()) inputs = read_examples(INPUT_FILE) # inputs = [inputs[0]] not_masked_ids, arrays = \ get_all_tokens(inputs, tokenizer, SEQ_LEN) print(arrays[0]) preds, logits, inp = sess.run( [tf.nn.softmax(model.logits), model.logits, model.input_ids], feed_dict={ model.input_ids: arrays[0], model.input_mask: arrays[1], model.token_type: arrays[2] }) print("input:", inp) print("logits:", logits) print("softmax:", preds) print(preds.shape) first_index = 0 sent_probs = [] for ids in not_masked_ids: print(ids) sent_preds = preds[first_index:first_index + len(ids), :, :] word_probs = [sent_preds[i, i + 1, x] for i, x in enumerate(ids)] print(word_probs) sent_prob = np.prod(word_probs) sent_probs.append(sent_prob) first_index += len(ids) print(list(zip(inputs, sent_probs))) probs = np.array(sent_probs) / sum(sent_probs) print(list(zip(inputs, probs)))
def create_estimator():
    tf.logging.set_verbosity(tf.logging.INFO)
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    label_list = get_labels()

    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    run_config = tf.contrib.tpu.RunConfig(
        # cluster=tpu_cluster_resolver,
        # master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        # tpu_config=tf.contrib.tpu.TPUConfig(
        #     iterations_per_loop=FLAGS.iterations_per_loop,
        #     num_shards=FLAGS.num_tpu_cores,
        #     per_host_input_for_training=is_per_host))
    )

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        # learning_rate=FLAGS.learning_rate,
        # num_train_steps=num_train_steps,
        # num_warmup_steps=num_warmup_steps,
        # use_tpu=FLAGS.use_tpu,
        # use_one_hot_embeddings=FLAGS.use_tpu
    )

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        # eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    return estimator

def __init__(self, bert_path, output_path):
    self.BERT_VOCAB = os.path.join(bert_path, 'vocab.txt')
    self.BERT_INIT_CHKPNT = os.path.join(bert_path, 'bert_model.ckpt')
    self.BERT_CONFIG = os.path.join(bert_path, 'bert_config.json')

    tokenization.validate_case_matches_checkpoint(True, self.BERT_INIT_CHKPNT)
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.BERT_VOCAB,
                                                do_lower_case=True)

    self.ID = 'guid'
    self.DATA_COLUMN = 'txt'
    self.LABEL_COLUMNS = [
        'Safety', 'CleanlinessView', 'Information', 'Service', 'Comfort',
        'PersonnelCard', 'Additional'
    ]
    self.MAX_SEQ_LENGTH = 128

    # Compute train and warmup steps from batch size.
    # These hyperparameters are copied from this colab notebook
    # (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
    self.BATCH_SIZE = 32
    self.LEARNING_RATE = 2e-5
    self.NUM_TRAIN_EPOCHS = 1
    # Warmup is a period of time where the learning rate
    # is small and gradually increases -- it usually helps training.
    self.WARMUP_PROPORTION = 0.1
    # Model configs
    self.SAVE_CHECKPOINTS_STEPS = 1000
    self.SAVE_SUMMARY_STEPS = 500

    self.run_config = tf.estimator.RunConfig(
        model_dir=output_path,
        save_summary_steps=self.SAVE_SUMMARY_STEPS,
        keep_checkpoint_max=1,
        save_checkpoints_steps=self.SAVE_CHECKPOINTS_STEPS)

    self.train_file = os.path.join(output_path, "train.tf_record")
    if not os.path.exists(self.train_file):
        open(self.train_file, 'w', encoding='utf8').close()
    self.eval_file = os.path.join(output_path, "eval.tf_record")
    if not os.path.exists(self.eval_file):
        open(self.eval_file, 'w', encoding='utf8').close()
    self.output_eval_file = os.path.join(output_path, "eval_results.txt")

def create_mini_bert_weights(model_dir=None):
    model_dir = model_dir if model_dir is not None else tempfile.TemporaryDirectory().name
    os.makedirs(model_dir, exist_ok=True)

    from bert.loader import StockBertConfig

    bert_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    bert_config = StockBertConfig(
        attention_probs_dropout_prob=0.1,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        hidden_size=8,
        initializer_range=0.02,
        intermediate_size=32,
        max_position_embeddings=32,
        num_attention_heads=2,
        num_hidden_layers=2,
        type_vocab_size=2,
        vocab_size=len(string.ascii_lowercase) * 2 + len(bert_tokens),
    )

    print("creating mini BERT at:", model_dir)

    bert_config_file = os.path.join(model_dir, "bert_config.json")
    bert_vocab_file = os.path.join(model_dir, "vocab.txt")

    with open(bert_config_file, "w") as f:
        f.write(bert_config.to_json_string())
    with open(bert_vocab_file, "w") as f:
        f.write("\n".join(list(string.ascii_lowercase) + bert_tokens))
        # Separator so the last special token and the first "##"-prefixed
        # piece do not end up merged on the same vocab line.
        f.write("\n")
        f.write("\n".join(["##" + tok for tok in list(string.ascii_lowercase)]))

    with tf.Graph().as_default():
        _ = MiniBertFactory.create_stock_bert_graph(bert_config_file, 16)
        saver = tf.compat.v1.train.Saver(max_to_keep=1, save_relative_paths=True)
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            ckpt_path = os.path.join(model_dir, "bert_model.ckpt")
            save_path = saver.save(sess, ckpt_path, write_meta_graph=True)
            print("saving to:", save_path)

    validate_case_matches_checkpoint(True, save_path)
    return save_path

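# Sketch of exercising create_mini_bert_weights() above: build the throw-away
# mini-BERT checkpoint and confirm the expected artifacts were written.
# Assumes only the standard library, TensorFlow, and the function above.
import os
import tensorflow as tf

ckpt_prefix = create_mini_bert_weights()          # e.g. <tmpdir>/bert_model.ckpt
model_dir = os.path.dirname(ckpt_prefix)
assert os.path.exists(os.path.join(model_dir, "bert_config.json"))
assert os.path.exists(os.path.join(model_dir, "vocab.txt"))
assert tf.io.gfile.glob(ckpt_prefix + "*")        # .index / .data checkpoint shards
print("mini BERT checkpoint at:", ckpt_prefix)
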
def bert(validate=True): """ Load BERT similarity model. Parameters ---------- validate: bool, optional (default=True) if True, malaya will check model availability and download if not available. Returns ------- SIMILARITY_BERT : malaya._models._tensorflow_model.SIAMESE_BERT class """ if not isinstance(validate, bool): raise ValueError('validate must be a boolean') try: from bert import tokenization except: raise Exception( 'bert-tensorflow not installed. Please install it using `pip3 install bert-tensorflow` and try again.' ) if validate: check_file(PATH_SIMILARITY['bert'], S3_PATH_SIMILARITY['bert']) else: if not check_available(PATH_SIMILARITY['bert']): raise Exception( 'toxic/bert is not available, please `validate = True`') tokenization.validate_case_matches_checkpoint(True, '') tokenizer = tokenization.FullTokenizer( vocab_file=PATH_SIMILARITY['bert']['vocab'], do_lower_case=True) try: g = load_graph(PATH_SIMILARITY['bert']['model']) except: raise Exception( "model corrupted due to some reasons, please run malaya.clear_cache('similarity/bert') and try again" ) return SIAMESE_BERT( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, maxlen=100, label=['not similar', 'similar'], )
def build_model(self): # Placeholders for input, output BERT_VOCAB = '../chinese_L-12_H-768_A-12/vocab.txt' BERT_INIT_CHKPNT = '../chinese_L-12_H-768_A-12/bert_model.ckpt' BERT_CONFIG = '../chinese_L-12_H-768_A-12/bert_config.json' tokenization.validate_case_matches_checkpoint(True, '') bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG) tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True) bert_config = modeling.BertConfig( vocab_size=self.vocab_size, hidden_size=self.size_layer, num_hidden_layers=self.num_layers, num_attention_heads=self.size_layer // 4, intermediate_size=self.size_layer * 2, ) self.input_ids = tf.placeholder(tf.int32, [None, self.seq_len]) self.input_mask = tf.placeholder(tf.int32, [None, self.seq_len]) self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_len]) self.label_ids = tf.placeholder(tf.int32, [None]) self.is_training = tf.placeholder(tf.bool) use_one_hot_embeddings = False self.loss, self.logits, probabilities, model, self.accuracy = create_model( bert_config, self.is_training, self.input_ids, self.input_mask, self.segment_ids, self.label_ids, self.num_classes, use_one_hot_embeddings, ) global_step = tf.Variable(0, trainable=False, name='Global_Step') self.optimizer = tf.contrib.layers.optimize_loss( self.loss, global_step=global_step, learning_rate=self.learning_rate, optimizer='Adam', clip_gradients=3.0, ) tf.summary.scalar("loss", self.loss) self.summary_op = tf.summary.merge_all() self.saver = tf.train.Saver(tf.global_variables())
def get_processor(self, task_name="pico"):
    processors = {"pico": PICOProcessor}

    tokenization.validate_case_matches_checkpoint(
        config.do_lower_case, config.init_checkpoint_dependency)

    bert_config = modeling.BertConfig.from_json_file(config.bert_config_file)
    if config.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (config.max_seq_length, bert_config.max_position_embeddings))

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    return processor

def get_estimator(self, processor): tokenization.validate_case_matches_checkpoint( config.do_lower_case, config.init_checkpoint_dependency) bert_config = modeling.BertConfig.from_json_file( config.bert_config_file) tokenizer = tokenization.FullTokenizer( vocab_file=config.vocab_file, do_lower_case=config.do_lower_case) tpu_cluster_resolver = None is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=None, model_dir=config.bluebert_dependency_dir, save_checkpoints_steps=1000, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=1000, num_shards=8, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None label_list = processor.get_labels() model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=config.init_checkpoint_dependency, learning_rate=config.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=False, use_one_hot_embeddings=False) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=False, model_fn=model_fn, config=run_config, train_batch_size=config.pred_batch_size, eval_batch_size=config.pred_batch_size, predict_batch_size=config.pred_batch_size) return estimator
def bert(path, s3_path, class_name, label, validate=True): try: from bert import tokenization except: raise Exception( 'bert-tensorflow not installed. Please install it using `pip3 install bert-tensorflow` and try again.' ) if validate: check_file(path['bert'], s3_path['bert']) else: if not check_available(path['bert']): raise Exception( '%s/bert is not available, please `validate = True`' % (class_name)) tokenization.validate_case_matches_checkpoint(False, '') tokenizer = tokenization.FullTokenizer(vocab_file=path['bert']['vocab'], do_lower_case=False) try: g = load_graph(path['bert']['model']) except: raise Exception( "model corrupted due to some reasons, please run malaya.clear_cache('%s/bert') and try again" % (class_name)) if len(label) > 2: selected_class = MULTICLASS_BERT else: selected_class = BINARY_BERT return selected_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, maxlen=100, label=label, )
def test_main():
    ID = 'id'
    DATA_COLUMN = 'content'
    LABEL_COLUMNS = ['environment', 'price_level', 'traffic', 'food']
    num_labels = len(LABEL_COLUMNS)
    use_one_hot_embeddings = False
    MAX_SEQ_LENGTH = 128
    BATCH_SIZE = 4
    os.chdir(r'E:\Toxic_BERT_multi_task')

    # Load the tokenizer and the model
    BERT_VOCAB = 'chinese_L-12_H-768_A-12/vocab.txt'  # model vocabulary
    BERT_INIT_CHKPNT = 'output/model.ckpt'  # pre-trained model weights
    BERT_CONFIG = 'chinese_L-12_H-768_A-12/bert_config.json'  # BERT model architecture

    # Check that the checkpoint is consistent with the casing setting
    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True)
    tokenizer.tokenize('查看中文分词效果。')  # sanity-check Chinese word segmentation

    # test = pd.read_csv('reforcement_test.csv')
    # x_test = test[:100][['id', 'content']]  # testing a small sample
    # x_test = x_test.reset_index(drop=True)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(LABEL_COLUMNS),
                                init_checkpoint=BERT_INIT_CHKPNT,
                                use_one_hot_embeddings=False)
    estimator = tf.estimator.Estimator(model_fn, params={"batch_size": BATCH_SIZE})
    return estimator, tokenizer

def main(_): tf.logging.set_verbosity(tf.logging.INFO) tf.io.gfile.makedirs(FLAGS.output_dir) tf.logging.info("***** FLAGS *****") writer = tf.io.gfile.GFile( f"{FLAGS.output_dir}/{FLAGS.al_query_strategy}_flags.txt", "w+") for key, val in FLAGS.__flags.items(): tf.logging.info(" %s = %s", key, str(val.value)) writer.write("%s = %s\n" % (key, str(val.value))) writer.close() tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2 # Active learning procedure results = active_learning_procedure( FLAGS.al_query_strategy, tokenizer, bert_config, FLAGS.data_dir, FLAGS.output_dir, FLAGS.finetune_module, tpu_cluster_resolver, is_per_host, FLAGS.max_seq_length, FLAGS.use_tpu, FLAGS.predict_batch_size, "train", FLAGS.n_queries, FLAGS.n_instances, FLAGS.sample_size, FLAGS.num_init_train_epochs, FLAGS.num_query_train_epochs, FLAGS.retrain_all, FLAGS.convert_tsv_to_tfrecord)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.input_file_processor == "run_classifier": processors = { "sst-2": rc.SST2Processor, "mnli": rc.MnliProcessor, } elif FLAGS.input_file_processor == "run_classifier_distillation": processors = { "sst-2": rc.SST2ProcessorDistillation, "mnli": rc.MNLIProcessorDistillation, } else: raise ValueError("Invalid --input_file_processor flag value") tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) task_name = FLAGS.task_name.lower() processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) input_ids_placeholder = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_seq_length]) bert_input_mask_placeholder = tf.placeholder( dtype=tf.int32, shape=[None, FLAGS.max_seq_length]) token_type_ids_placeholder = tf.placeholder( dtype=tf.int32, shape=[None, FLAGS.max_seq_length]) prob_vector_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, num_labels]) one_hot_input_ids = tf.one_hot(input_ids_placeholder, depth=bert_config.vocab_size) input_tensor, _ = em_util.run_one_hot_embeddings( one_hot_input_ids=one_hot_input_ids, config=bert_config) flex_input_obj, per_eg_obj, probs = em_util.model_fn( input_tensor=input_tensor, bert_input_mask=bert_input_mask_placeholder, token_type_ids=token_type_ids_placeholder, bert_config=bert_config, num_labels=num_labels, obj_type=FLAGS.obj_type, prob_vector=prob_vector_placeholder) if FLAGS.obj_type.startswith("min"): final_obj = -1 * flex_input_obj elif FLAGS.obj_type.startswith("max"): final_obj = flex_input_obj # Calculate the gradient of the final loss function with respect to # the one-hot input space grad_obj_one_hot = tf.gradients(ys=final_obj, xs=one_hot_input_ids)[0] # gradients with respect to position in one hot input space with 1s in it # this is one term in the directional derivative of HotFlip, # Eq1 in https://arxiv.org/pdf/1712.06751.pdf # # grad_obj_one_hot.shape = [batch_size, seq_length, vocab_size] # input_ids_placeholder.shape = [batch_size, seq_length] # original_token_gradients.shape = [batch_size, seq_length] original_token_gradients = tf.gather(params=grad_obj_one_hot, indices=tf.expand_dims( input_ids_placeholder, -1), batch_dims=2) original_token_gradients = tf.tile(original_token_gradients, multiples=[1, 1, FLAGS.beam_size]) # These are the gradients / indices whose one-hot position has the largest # gradient magnitude, the performs part of the max calculation in Eq10 of # https://arxiv.org/pdf/1712.06751.pdf biggest_gradients, biggest_indices = tf.nn.top_k(input=grad_obj_one_hot, k=FLAGS.beam_size) # Eq10 of https://arxiv.org/pdf/1712.06751.pdf grad_difference = biggest_gradients - original_token_gradients tvars = tf.trainable_variables() assignment_map, _ = modeling.get_assignment_map_from_checkpoint( tvars, FLAGS.init_checkpoint) tf.logging.info("Variables mapped = %d / %d", len(assignment_map), len(tvars)) tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map) sess = tf.Session() sess.run(tf.global_variables_initializer()) if FLAGS.input_file: custom_examples = processor.get_custom_examples(FLAGS.input_file) custom_templates = [ em_util.input_to_template(x, label_list) for x in custom_examples ] else: prob_vector = [float(x) for x in FLAGS.prob_vector.split(",")] custom_templates = [(FLAGS.input_template, 
prob_vector)] num_input_sequences = custom_templates[0][0].count("[SEP]") if FLAGS.flipping_mode == "beam_search": FLAGS.batch_size = 1 detok_partial = functools.partial(em_util.detokenize, tokenizer=tokenizer) # Since input files will often be quite large, this flag allows processing # only a slice of the input file if FLAGS.input_file_range: start_index, end_index = FLAGS.input_file_range.split("-") if start_index == "start": start_index = 0 if end_index == "end": end_index = len(custom_templates) start_index, end_index = int(start_index), int(end_index) else: start_index = 0 end_index = len(custom_templates) tf.logging.info("Processing examples in range %d, %d", start_index, end_index) all_elements = [] too_long = 0 for ip_num, (ip_template, prob_vector) in enumerate( custom_templates[start_index:end_index]): # Parse the input template into a list of IDs and the corresponding mask. # Different segments in template are separated by " <piece> " # Each segment is associated with a word piece (or [EMPTY] to get flex # inputs) and a frequency. (which is separated by "<freq>"). * can be used # to choose a frequency till the end of the string # # Here is an example 2-sequence template for tasks like MNLI to optimize # 20 vectors, (10 for each sequence) # [CLS]<freq>1 <piece> [EMPTY]<freq>10 <piece> [SEP]<freq>1 <piece> \ # [EMPTY]<freq>10 <piece> [SEP]<freq>1 <piece> [PAD]<freq>* (input_ids, input_mask, bert_input_mask, token_type_ids) = em_util.template_to_ids( template=ip_template, config=bert_config, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length) if len(input_ids) > FLAGS.max_seq_length: # truncate them! input_ids = input_ids[:FLAGS.max_seq_length] input_mask = input_mask[:FLAGS.max_seq_length] bert_input_mask = bert_input_mask[:FLAGS.max_seq_length] token_type_ids = token_type_ids[:FLAGS.max_seq_length] too_long += 1 all_elements.append({ "input_ids": input_ids, "original_input_ids": [ii for ii in input_ids], "ip_num": start_index + ip_num, "score": 0.0, "bert_input_mask": bert_input_mask, "input_mask": input_mask, "token_type_ids": token_type_ids, "prob_vector": prob_vector, "stopped": False, "steps_taken": 0 }) tf.logging.info("%d / %d were too long and hence truncated.", too_long, len(all_elements)) iteration_number = 0 consistent_output_sequences = [] while all_elements and iteration_number < 10: steps_taken = [] output_sequences = [] failures = [] zero_step_instances = 0 iteration_number += 1 tf.logging.info("Starting iteration number %d", iteration_number) tf.logging.info("Pending items = %d / %d", len(all_elements), len(custom_templates[start_index:end_index])) batch_elements = [] for ip_num, input_object in enumerate(all_elements): batch_elements.append(input_object) # wait until the input has populated up to the batch size if (len(batch_elements) < FLAGS.batch_size and ip_num < len(all_elements) - 1): continue # optimize a part of the flex_input (depending on the template) for step_num in range(FLAGS.total_steps): feed_dict = { input_ids_placeholder: np.array([x["input_ids"] for x in batch_elements]), bert_input_mask_placeholder: np.array([x["bert_input_mask"] for x in batch_elements]), token_type_ids_placeholder: np.array([x["token_type_ids"] for x in batch_elements]), prob_vector_placeholder: np.array([x["prob_vector"] for x in batch_elements]) } if FLAGS.flipping_mode == "random": # Avoiding the gradient computation when the flipping mode is random peo, pr = sess.run([per_eg_obj, probs], feed_dict=feed_dict) else: peo, gd, bi, pr = sess.run( [per_eg_obj, 
grad_difference, biggest_indices, probs], feed_dict=feed_dict) if FLAGS.print_flips: output_log = "\n" + "\n".join([ "Objective = %.4f, Score = %.4f, Element %d = %s" % (obj, elem["score"], kk, detok_partial(elem["input_ids"])) for kk, (obj, elem) in enumerate(zip(peo, batch_elements)) ]) tf.logging.info("Step = %d %s\n", step_num, output_log) should_stop = evaluate_stopping( stopping_criteria=FLAGS.stopping_criteria, obj_prob_vector=np.array( [x["prob_vector"] for x in batch_elements]), curr_prob_vector=pr, per_example_objective=peo) for elem, stop_bool in zip(batch_elements, should_stop): if stop_bool and (not elem["stopped"]): if step_num == 0: # don't actually stop the perturbation since we want a new input zero_step_instances += 1 else: elem["stopped"] = True elem["steps_taken"] = step_num if np.all([elem["stopped"] for elem in batch_elements]): steps_taken.extend( [elem["steps_taken"] for elem in batch_elements]) output_sequences.extend([elem for elem in batch_elements]) batch_elements = [] break if step_num == FLAGS.total_steps - 1: failures.extend([ elem for elem in batch_elements if not elem["stopped"] ]) steps_taken.extend([ elem["steps_taken"] for elem in batch_elements if elem["stopped"] ]) output_sequences.extend( [elem for elem in batch_elements if elem["stopped"]]) batch_elements = [] break # Flip a token / word-piece either systematically or randomly # For instances where hotflip was not successful, do some random # perturbations before doing hotflip if (FLAGS.flipping_mode == "random" or (iteration_number > 1 and step_num < iteration_number)): for element in batch_elements: # don't perturb elements which have stopped if element["stopped"]: continue random_seq_index = np.random.choice([ ii for ii, mask_id in enumerate(element["input_mask"]) if mask_id > 0.5 ]) random_token_id = np.random.randint( len(tokenizer.vocab)) while (tokenizer.inv_vocab[random_token_id][0] == "[" and tokenizer.inv_vocab[random_token_id][-1] == "]"): random_token_id = np.random.randint( len(tokenizer.vocab)) element["input_ids"][ random_seq_index] = random_token_id elif FLAGS.flipping_mode == "greedy": batch_elements = greedy_updates( old_elements=batch_elements, grad_difference=gd, biggest_indices=bi, max_seq_length=FLAGS.max_seq_length) elif FLAGS.flipping_mode == "beam_search": # only supported with a batch size of 1! batch_elements = beam_search( old_beams=batch_elements, grad_difference=gd, biggest_indices=bi, beam_size=FLAGS.beam_size, accumulate_scores=FLAGS.accumulate_scores, max_seq_length=FLAGS.max_seq_length) else: raise ValueError("Invalid --flipping_mode flag value") tf.logging.info("steps = %.4f (%d failed, %d non-zero, %d zero)", np.mean([float(x) for x in steps_taken if x > 0]), len(failures), len([x for x in steps_taken if x > 0]), zero_step_instances) # measure consistency of final dataset - run a forward pass through the # entire final dataset and verify it satisfies the original objective. 
This # if the code runs correctly, total_inconsistent = 0 tf.logging.info("Measuring consistency of final dataset") total_inconsistent = 0 total_lossy = 0 for i in range(0, len(output_sequences), FLAGS.batch_size): batch_elements = output_sequences[i:i + FLAGS.batch_size] feed_dict = { input_ids_placeholder: np.array([x["input_ids"] for x in batch_elements]), bert_input_mask_placeholder: np.array([x["bert_input_mask"] for x in batch_elements]), token_type_ids_placeholder: np.array([x["token_type_ids"] for x in batch_elements]), prob_vector_placeholder: np.array([x["prob_vector"] for x in batch_elements]) } peo, pr = sess.run([per_eg_obj, probs], feed_dict=feed_dict) consistency_flags = evaluate_stopping( stopping_criteria=FLAGS.stopping_criteria, obj_prob_vector=np.array( [x["prob_vector"] for x in batch_elements]), curr_prob_vector=pr, per_example_objective=peo) total_inconsistent += len(batch_elements) - np.sum( consistency_flags) # Next, apply a lossy perturbation to the input (conversion to a string) # This is often lossy since it eliminates impossible sequences and # incorrect tokenizations. We check how many consistencies still hold true all_detok_strings = [ em_util.ids_to_strings(elem["input_ids"], tokenizer) for elem in batch_elements ] all_ip_examples = [] if num_input_sequences == 1: for ds, be in zip(all_detok_strings, batch_elements): prob_vector_labels = be["prob_vector"].tolist() all_ip_examples.append( rc.InputExample(text_a=ds[0], text_b=None, label=prob_vector_labels, guid=None)) else: for ds, be in zip(all_detok_strings, batch_elements): prob_vector_labels = be["prob_vector"].tolist() all_ip_examples.append( rc.InputExample(text_a=ds[0], text_b=ds[1], label=prob_vector_labels, guid=None)) all_templates = [ em_util.input_to_template(aie, label_list) for aie in all_ip_examples ] all_new_elements = [] for ip_template, prob_vector in all_templates: (input_ids, input_mask, bert_input_mask, token_type_ids) = em_util.template_to_ids( template=ip_template, config=bert_config, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length) if len(input_ids) > FLAGS.max_seq_length: input_ids = input_ids[:FLAGS.max_seq_length] input_mask = input_mask[:FLAGS.max_seq_length] bert_input_mask = bert_input_mask[:FLAGS.max_seq_length] token_type_ids = token_type_ids[:FLAGS.max_seq_length] all_new_elements.append({ "input_ids": input_ids, "input_mask": input_mask, "bert_input_mask": bert_input_mask, "token_type_ids": token_type_ids, "prob_vector": prob_vector }) feed_dict = { input_ids_placeholder: np.array([x["input_ids"] for x in all_new_elements]), bert_input_mask_placeholder: np.array([x["bert_input_mask"] for x in all_new_elements]), token_type_ids_placeholder: np.array([x["token_type_ids"] for x in all_new_elements]), prob_vector_placeholder: np.array([x["prob_vector"] for x in all_new_elements]) } peo, pr = sess.run([per_eg_obj, probs], feed_dict=feed_dict) lossy_consistency_flags = evaluate_stopping( stopping_criteria=FLAGS.stopping_criteria, obj_prob_vector=np.array( [x["prob_vector"] for x in all_new_elements]), curr_prob_vector=pr, per_example_objective=peo) total_lossy += len(all_new_elements) - np.sum( lossy_consistency_flags) net_consistency_flags = np.logical_and(consistency_flags, lossy_consistency_flags) for elem, ncf in zip(batch_elements, net_consistency_flags): if ncf: consistent_output_sequences.append(elem) else: failures.append(elem) tf.logging.info("Total inconsistent found = %d / %d", total_inconsistent, len(output_sequences)) tf.logging.info("Total lossy inconsistent 
found = %d / %d", total_lossy, len(output_sequences)) tf.logging.info("Total consistent outputs so far = %d / %d", len(consistent_output_sequences), len(custom_templates[start_index:end_index])) # Getting ready for next iteration of processing if iteration_number < 10: for elem in failures: elem["input_ids"] = [x for x in elem["original_input_ids"]] elem["stopped"] = False elem["steps_taken"] = 0 elem["score"] = 0.0 all_elements = failures tf.logging.info("Giving up on %d instances!", len(failures)) for elem in failures: consistent_output_sequences.append(elem) if FLAGS.output_file: final_output = [] for op_num, elem in enumerate(consistent_output_sequences): detok_strings = em_util.ids_to_strings(elem["input_ids"], tokenizer) if num_input_sequences == 1: final_output.append("%d\t%d\t%s" % (op_num, elem["ip_num"], detok_strings[0])) elif num_input_sequences == 2: final_output.append("%d\t%d\t%s\t%s" % (op_num, elem["ip_num"], detok_strings[0], detok_strings[1])) if num_input_sequences == 1: header = "index\toriginal_index\tsentence" elif num_input_sequences == 2: header = "index\toriginal_index\tsentence1\tsentence2" final_output = [header] + final_output with tf.gfile.Open(FLAGS.output_file, "w") as f: f.write("\n".join(final_output) + "\n") return
def main(_): tf.logging.set_verbosity(tf.logging.INFO) tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError("At least one of `do_train`, `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) num_train_steps = None num_warmup_steps = None if FLAGS.do_train: num_train_steps = int( FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) if not tf.gfile.Exists(FLAGS.train_file): tf.logging.info( "DANITER:File doesn't exist, creating tfrecord data") examples = model_builder.load_hellaswag(FLAGS.train_raw_data) tf.logging.info("DANITER:Read raw data as json") model_builder.file_based_convert_examples_for_bilinear( examples, 512, tokenizer, FLAGS.train_file, do_copa=True) train_input_fn = file_based_input_fn_builder( input_file=FLAGS.train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, steps=num_train_steps) if FLAGS.do_eval: # This tells the estimator to run through the entire set. 
if FLAGS.eval_data_size < 0: eval_steps = None else: eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False if not tf.gfile.Exists(FLAGS.eval_file): examples = model_builder.load_hellaswag(FLAGS.eval_raw_data) model_builder.file_based_convert_examples_for_bilinear( examples, 512, tokenizer, FLAGS.eval_file, do_copa=True) eval_input_fn = file_based_input_fn_builder( input_file=FLAGS.eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates tf.logging.info("Evaling all models in output dir") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "eval_accuracy" tf.logging.info("Checkpoint path " + checkpoint_path) if tf.gfile.Exists(checkpoint_path + ".index"): tf.logging.info("Found a best model... not good") result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: tf.logging.info("Setting global step to -1") global_step = -1 best_perf = -1 checkpoint_path = None tf.logging.info("Openning writer " + output_eval_file) writer = tf.gfile.GFile(output_eval_file, "w") steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) tf.logging.info("Models found " + "\n".join(filenames)) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) # steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. Sleeping.".format(global_step)) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) # Why should we remove checkpoints? # tf.gfile.Remove(src_ckpt) tf.logging.info("Skipping candidate for some reason") continue result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) # tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close()
def main(_): os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # Load emotion categories with open(FLAGS.emotion_file, "r") as f: all_emotions = f.read().splitlines() if FLAGS.add_neutral: all_emotions = all_emotions + ["neutral"] idx2emotion = {i: e for i, e in enumerate(all_emotions)} num_labels = len(all_emotions) print("%d labels" % num_labels) print("Multilabel: %r" % FLAGS.multilabel) sentiment = FLAGS.sentiment entailment = FLAGS.entailment correlation = FLAGS.correlation # Create emotion distance matrix # If the regularization parameter is set to 0, don't load matrix. print("Getting distance matrix...") empty_rels = [[0] * num_labels] * num_labels if sentiment == 0: sent_rels = empty_rels else: sent_rels = get_sent_rels(all_emotions) sent_groups = get_sentiment_groups(all_emotions) print(sent_rels) if entailment == 0: entailment_rels = empty_rels intensity_groups = empty_rels else: entailment_rels = get_entailment_rels(all_emotions) intensity_groups = get_intensity_groups(all_emotions) print(entailment_rels) if correlation == 0: corr_rels = empty_rels else: corr_rels = get_correlations(all_emotions) print(corr_rels) tf.logging.set_verbosity(tf.logging.INFO) tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_predict: raise ValueError("At least one of `do_train` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) processor = DataProcessor(num_labels, FLAGS.data_dir) # set up preprocessor tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, save_summary_steps=FLAGS.save_summary_steps, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=FLAGS.keep_checkpoint_max) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_examples("train", FLAGS.train_fname) eval_examples = processor.get_examples("dev", FLAGS.dev_fname) num_eval_examples = len(eval_examples) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) params = { "num_labels": num_labels, "learning_rate": FLAGS.learning_rate, "num_train_epochs": FLAGS.num_train_epochs, "warmup_proportion": FLAGS.warmup_proportion, "sentiment": FLAGS.sentiment, "entailment": FLAGS.entailment, "correlations": FLAGS.correlation, "batch_size": FLAGS.train_batch_size, "num_train_examples": len(train_examples), "num_eval_examples": num_eval_examples, "data_dir": FLAGS.data_dir, "output_dir": FLAGS.output_dir, "train_fname": FLAGS.train_fname, "dev_fname": FLAGS.dev_fname, "test_fname": FLAGS.test_fname } with open(os.path.join(FLAGS.output_dir, "config.json"), "w") as f: json.dump(params, f) model_fn = model_fn_builder( bert_config=bert_config, num_labels=num_labels, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, multilabel=FLAGS.multilabel, sent_rels=sent_rels, sentiment=sentiment, entailment_rels=entailment_rels, entailment=entailment, corr_rels=corr_rels, 
correlation=correlation, idx2emotion=idx2emotion, sentiment_groups=sent_groups, intensity_groups=intensity_groups) estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config, params={"batch_size": FLAGS.train_batch_size}) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, FLAGS.max_seq_length, tokenizer, train_file) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running training and evaluation *****") tf.logging.info(" Num train examples = %d", len(train_examples)) tf.logging.info(" Num eval examples = %d", num_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num training steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, num_labels=num_labels) train_spec = tf.estimator.TrainSpec( input_fn=train_input_fn, max_steps=num_train_steps) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, num_labels=num_labels) eval_spec = tf.estimator.EvalSpec( input_fn=eval_input_fn, steps=FLAGS.eval_steps, start_delay_secs=0, throttle_secs=1000) tf.estimator.train_and_evaluate( estimator, train_spec=train_spec, eval_spec=eval_spec) if FLAGS.calculate_metrics: # Setting the parameter to "dev" ensures that we get labels for the examples eval_examples = processor.get_examples("dev", FLAGS.test_fname) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num eval examples = %d", len(eval_examples)) eval_file = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".tf_record") file_based_convert_examples_to_features(eval_examples, FLAGS.max_seq_length, tokenizer, eval_file) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, num_labels=num_labels) result = estimator.evaluate(input_fn=eval_input_fn, steps=None) output_eval_file = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: # Have to change my dataset in this format predict_examples = processor.get_examples("test", FLAGS.test_fname) num_actual_predict_examples = len(predict_examples) predict_file = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".tf_record") file_based_convert_examples_to_features(predict_examples, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, num_labels=num_labels) # looks like predict_input_fn will contain the data for loading result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".predictions.tsv") 
output_labels = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".label_predictions.tsv") logits_file = open('logits_file_vanilla.txt', 'w') #prediction_file = open(FLAGS.test_fname+'_predictions.csv', 'w') preds_file = open(FLAGS.test_fname+'_k_bal_numb_predictions_bin.csv', 'w') with tf.gfile.GFile(output_predict_file, "w") as writer: with tf.gfile.GFile(output_labels, "w") as writer2: writer.write("\t".join(all_emotions) + "\n") writer2.write("\t".join([ "text", "emotion_1", "prob_1", "emotion_2", "prob_2", "emotion_3", "prob_3" ]) + "\n") tf.logging.info("***** Predict results *****") num_written_lines = 0 # Do something here df_file = pd.read_csv(os.path.join('data', FLAGS.test_fname), sep='\t', header=None) dict_store = dict() ctr=0 for (i, prediction) in enumerate(result): ctr+=1 if i<5: print(i, prediction["output_layer"], type(prediction["output_layer"]), file=logits_file) #dict_store[df_file.iloc[i, 2]] = prediction["output_layer"] dict_store[i] = prediction["output_layer"] probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" sorted_idx = np.argsort(-probabilities) top_3_emotion = [idx2emotion[idx] for idx in sorted_idx[:3]] top_3_prob = [probabilities[idx] for idx in sorted_idx[:3]] pred_line = [] for emotion, prob in zip(top_3_emotion, top_3_prob): if prob >= FLAGS.pred_cutoff: pred_line.extend([emotion, "%.4f" % prob]) else: pred_line.extend(["", ""]) writer.write(output_line) writer2.write(predict_examples[i].text + "\t" + "\t".join(pred_line) + "\n") num_written_lines += 1 #print(str(df_file.iloc[i,2])+","+str(top_3_emotion), file=prediction_file) print(str(df_file.iloc[i,2])+","+str(sorted_idx[0]), file=preds_file) assert num_written_lines == num_actual_predict_examples # Dump the dictionary into pickle print(len(dict_store)) print(ctr) with open(FLAGS.test_fname + '_k_bal_bin.pickle', 'wb') as handle: pickle.dump(dict_store, handle, protocol=pickle.HIGHEST_PROTOCOL)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError("At least one of `do_train`, `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) num_train_steps = None num_warmup_steps = None if FLAGS.do_train: num_train_steps = int(FLAGS.train_data_size / FLAGS.train_batch_size) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, num_choices=FLAGS.num_choices, add_masking=FLAGS.include_mlm) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=FLAGS.train_file, is_training=True, drop_remainder=True, add_masking=FLAGS.include_mlm) estimator.train(input_fn=train_input_fn, steps=num_train_steps) if FLAGS.do_eval: # This tells the estimator to run through the entire set. if FLAGS.eval_data_size < 0: eval_steps = None else: eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False # Note that we are masking inputs for eval as well as training and this will # decrease eval performance eval_input_fn = file_based_input_fn_builder( input_file=FLAGS.eval_file, is_training=False, drop_remainder=eval_drop_remainder, add_masking=FLAGS.include_mlm) # checkpoints_iterator blocks until a new checkpoint appears. for ckpt in contrib_training.checkpoints_iterator(estimator.model_dir): try: result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) tf.logging.info("********** Eval results:*******\n") for key in sorted(result.keys()): tf.logging.info("%s = %s" % (key, str(result[key]))) except tf.errors.NotFoundError: tf.logging.error("Checkpoint path '%s' no longer exists.", ckpt)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "ske_2019": SKE_2019_Sequence_labeling_Processor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() token_label_list = processor.get_token_labels() predicate_label_list = processor.get_predicate_labels() num_token_labels = len(token_label_list) num_predicate_labels = len(predicate_label_list) token_label_id2label = {} for (i, label) in enumerate(token_label_list): token_label_id2label[i] = label predicate_label_id2label = {} for (i, label) in enumerate(predicate_label_list): predicate_label_id2label[i] = label tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_token_labels=num_token_labels, num_predicate_labels=num_predicate_labels, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, token_label_list, predicate_label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, token_label_list, predicate_label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, token_label_list, predicate_label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) token_label_output_predict_file = os.path.join( FLAGS.output_dir, "token_label_predictions.txt") predicate_output_predict_file = os.path.join(FLAGS.output_dir, "predicate_predict.txt") predicate_output_probabilities_file = os.path.join( FLAGS.output_dir, "predicate_probabilities.txt") with open(token_label_output_predict_file, "w", encoding='utf-8') as token_label_writer: with open(predicate_output_predict_file, "w", encoding='utf-8') as predicate_predict_writer: with open(predicate_output_probabilities_file, "w", encoding='utf-8') as predicate_probabilities_writer: num_written_lines = 0 tf.logging.info( "***** token_label predict and predicate labeling results *****" ) for (i, prediction) in enumerate(result): token_label_prediction = prediction[ "token_label_predictions"] predicate_probabilities = prediction[ "predicate_probabilities"] predicate_prediction = prediction[ "predicate_prediction"] if i >= num_actual_predict_examples: break token_label_output_line = " ".join( token_label_id2label[id] for id in token_label_prediction) + "\n" token_label_writer.write(token_label_output_line) predicate_predict_line = predicate_label_id2label[ predicate_prediction] predicate_predict_writer.write(predicate_predict_line + "\n") predicate_probabilities_line = " ".join( str(sigmoid_logit) for sigmoid_logit in predicate_probabilities) + "\n" predicate_probabilities_writer.write( predicate_probabilities_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
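# A small, hedged helper for consuming the three prediction files written by the
# predict branch above. The layout assumed here mirrors the writers: one example per
# line, token labels space-separated, one predicate label per line, and space-separated
# sigmoid probabilities per line.
import os


def read_predicate_predictions(output_dir):
    token_path = os.path.join(output_dir, "token_label_predictions.txt")
    predicate_path = os.path.join(output_dir, "predicate_predict.txt")
    prob_path = os.path.join(output_dir, "predicate_probabilities.txt")
    with open(token_path, encoding="utf-8") as f_tok, \
            open(predicate_path, encoding="utf-8") as f_pred, \
            open(prob_path, encoding="utf-8") as f_prob:
        for tok_line, pred_line, prob_line in zip(f_tok, f_pred, f_prob):
            token_labels = tok_line.strip().split(" ")
            predicate = pred_line.strip()
            probabilities = [float(p) for p in prob_line.strip().split(" ")]
            yield token_labels, predicate, probabilities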
def finetune(_): tf.logging.set_verbosity(tf.logging.INFO) # set parameters temp_dir = os.path.join(FLAGS.output_dir, 'temp') output_model_dir = os.path.join(FLAGS.output_dir, 'model') pretrained_models_dir = os.path.join(FLAGS.output_dir, 'pretrained_models', FLAGS.pretrained_model_folder) assert not ( FLAGS.bert_config_file is None and FLAGS.pretrained_model_checkpoint is None and FLAGS.vocab_file is None and FLAGS.pretrained_model_folder is None), \ "Either the `pretrained_model_folder` has to be specified, or all three of the following parameters: " \ "`bert_config_file`, `output_dir`, and `pretrained_model_checkpoint`." if FLAGS.vocab_file is None: FLAGS.vocab_file = os.path.join(pretrained_models_dir, 'vocab.txt') if FLAGS.bert_config_file is None: FLAGS.bert_config_file = os.path.join(pretrained_models_dir, 'bert_config.json') if FLAGS.pretrained_model_checkpoint is None: FLAGS.pretrained_model_checkpoint = os.path.join( pretrained_models_dir, 'bert_model.ckpt') # Validate the pre-trained model tokenization.validate_case_matches_checkpoint( FLAGS.do_lower_case, FLAGS.pretrained_model_checkpoint) # Load the BERT config bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # Validate the max_seq_length parameter if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) # Create the temp output directory (if required) tf.gfile.MakeDirs(temp_dir) # Decode the label list json and initialize the CustomDataProcessor processor = CustomDataProcessor(str(FLAGS.data_type)) label_list = processor.get_labels(FLAGS.train_data) # save the labels.txt file with open(os.path.join(output_model_dir, 'labels.txt'), 'w') as f: for label in label_list: f.write(str(label) + '\n') # Initialize the tokenizer tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # Specify the `run_config` parameters: # (We don't support TPU in this version of the training script, but let's leave the original code in place.) tpu_cluster_resolver = None use_tpu = False master = None num_tpu_cores = 8 # if use_tpu and tpu_name: # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( # tpu_name, zone=tpu_zone, project=gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=master, model_dir=temp_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=1, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=num_tpu_cores, per_host_input_for_training=is_per_host)) # Finetuning section. train_examples = processor.get_train_examples(FLAGS.train_data) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.pretrained_model_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=use_tpu, use_one_hot_embeddings=use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size) train_file = os.path.join(temp_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") params = f" Num examples = {len(train_examples)}\n" \ f" Batch size = {FLAGS.train_batch_size}\n" \ f" Num steps = {num_train_steps}\n" \ f" Epochs = {FLAGS.num_train_epochs}\n" \ f" Learning rate = {FLAGS.learning_rate}\n" \ f" warmup_proportion = {FLAGS.warmup_proportion}\n" \ f" max_seq_length = {FLAGS.max_seq_length}\n" \ f" do_lower_case = {FLAGS.do_lower_case}" tf.logging.info(params) print(params) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) latest_model = estimator.latest_checkpoint() # export to savedmodel tf.logging.info('exporting the model to savedmodel') model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=latest_model, learning_rate=FLAGS.learning_rate, num_train_steps=None, num_warmup_steps=None, use_tpu=False, use_one_hot_embeddings=False) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. predict_batch_size = FLAGS.train_batch_size estimator = tf.contrib.tpu.TPUEstimator( use_tpu=False, model_fn=model_fn, config=run_config, predict_batch_size=predict_batch_size) estimator._export_to_tpu = False tf.logging.info(f'LATEST MODEL: {latest_model}') saved_model_path = estimator.export_savedmodel( output_model_dir, serving_input_fn, checkpoint_path=latest_model).decode("utf-8") # add the vocab.txt file as well shutil.move(os.path.join(pretrained_models_dir, 'vocab.txt'), os.path.join(os.path.dirname(saved_model_path), 'vocab.txt')) # clean up the temp folder shutil.rmtree(temp_dir, ignore_errors=True) # move the model files to the parent directory (to meet the WML convention) for filename in os.listdir(saved_model_path): shutil.move(os.path.join(saved_model_path, filename), os.path.join(os.path.dirname(saved_model_path), filename)) shutil.rmtree(saved_model_path, ignore_errors=True) # update the saved model path as well saved_model_path = os.path.dirname(saved_model_path) tf.logging.info( f'the saved model can be found in this directory: {saved_model_path}')
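# `serving_input_fn` is referenced by `export_savedmodel` above but is not shown in
# this excerpt. A minimal sketch of the usual BERT-classifier version, assuming the
# model_fn reads `input_ids`, `input_mask`, `segment_ids` and `label_ids` features of
# length FLAGS.max_seq_length; the placeholder names are an assumption and must match
# whatever the real model_fn expects. `tf` and `FLAGS` come from the surrounding module.
def serving_input_fn():
    seq_length = FLAGS.max_seq_length
    label_ids = tf.placeholder(tf.int32, [None], name="label_ids")
    input_ids = tf.placeholder(tf.int32, [None, seq_length], name="input_ids")
    input_mask = tf.placeholder(tf.int32, [None, seq_length], name="input_mask")
    segment_ids = tf.placeholder(tf.int32, [None, seq_length], name="segment_ids")
    receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
        "label_ids": label_ids,
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
    })
    return receiver_fn()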
def main(_): tf.logging.set_verbosity(tf.logging.INFO) np.random.seed(FLAGS.random_seed) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) data_dir = FLAGS.data_dir task_name = FLAGS.task_name.lower() processor = NluProcessor(data_dir, task_name) token_label_list = processor.get_token_labels() sent_label_list = processor.get_sent_labels() train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples() num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) model_fn = model_fn_builder(bert_config=bert_config, token_label_list=token_label_list, sent_label_list=sent_label_list, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, export_to_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: tf.logging.info("***** Run training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_features = convert_examples_to_features( examples=train_examples, token_label_list=token_label_list, sent_label_list=sent_label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) train_input_fn = input_fn_builder(features=train_features, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples() tf.logging.info("***** Run evaluation *****") tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_features = convert_examples_to_features( examples=eval_examples, token_label_list=token_label_list, sent_label_list=sent_label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) eval_input_fn = input_fn_builder(features=eval_features, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) result = estimator.evaluate(input_fn=eval_input_fn) token_precision = result["token_precision"] token_recall = result["token_recall"] token_f1_score = 2.0 * token_precision * token_recall / ( token_precision + token_recall) sent_accuracy = result["sent_accuracy"] tf.logging.info("***** Evaluation result *****") tf.logging.info(" Precision (token-level) = %s", str(token_precision)) tf.logging.info(" Recall (token-level) = %s", str(token_recall)) tf.logging.info(" F1 score (token-level) = %s", str(token_f1_score)) tf.logging.info(" Accuracy (sent-level) = %s", str(sent_accuracy)) if FLAGS.do_predict: predict_examples = processor.get_test_examples() tf.logging.info("***** Run prediction *****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_features = convert_examples_to_features( examples=predict_examples, token_label_list=token_label_list, sent_label_list=sent_label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) predict_input_fn = input_fn_builder(features=predict_features, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) result = estimator.predict(input_fn=predict_input_fn) predicts = [{ "input_ids": feature.input_ids, "input_masks": feature.input_masks, "token_label_ids": feature.token_label_ids, "sent_label_id": feature.sent_label_id, "token_predict_ids": predict["token_predict"].tolist(), "sent_predict_id": predict["sent_predict"].tolist() } for feature, predict in zip(predict_features, result)] decoded_predicts = decode_predicts(predicts=predicts, token_label_list=token_label_list, sent_label_list=sent_label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) predict_tag = FLAGS.predict_tag if FLAGS.predict_tag else str( time.time()) output_path = os.path.join(FLAGS.output_dir, "predict.{0}.json".format(predict_tag)) write_to_json(decoded_predicts, output_path) if FLAGS.do_export: tf.logging.info("***** Running exporting *****") tf.gfile.MakeDirs(FLAGS.export_dir) estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn, 
as_text=False)
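# A hedged usage sketch for the SavedModel exported above: in TF 1.x it can be loaded
# for ad-hoc inference with tf.contrib.predictor. The returned callable takes a dict
# keyed by the serving feature names, which are an assumption here and must match the
# serving_input_fn used at export time.
import os
import tensorflow as tf
from tensorflow.contrib import predictor


def load_latest_saved_model(export_dir):
    # export_savedmodel writes a timestamped sub-directory; pick the newest one.
    subdirs = [os.path.join(export_dir, d)
               for d in tf.gfile.ListDirectory(export_dir)
               if tf.gfile.IsDirectory(os.path.join(export_dir, d))]
    return predictor.from_saved_model(sorted(subdirs)[-1])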
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "mrpc": data_cls_helper.MrpcProcessor, "snli": data_cls_helper.SnliProcessor, "sick": data_cls_helper.SickProcessor, "cola": data_cls_helper.ColaProcessor, "cr": data_cls_helper.CrProcessor, "mr": data_cls_helper.MrProcessor, "subj": data_cls_helper.SubjProcessor, "sst5": data_cls_helper.Sst5Processor, "sst2": data_cls_helper.Sst2Processor, "trec": data_cls_helper.TrecProcessor } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model was only trained up to sequence " "length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) if not os.path.exists(FLAGS.output_dir): tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) run_config = tf.contrib.tpu.RunConfig( cluster=None, master=None, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=8, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig. PER_HOST_V2)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False) # If TPU is not available, this will fall back to normal Estimator on CPU or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=False, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.batch_size, eval_batch_size=FLAGS.batch_size, predict_batch_size=FLAGS.batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.batch_size) # This tells the estimator to run through the entire set. eval_steps = None eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") batch_tokens, batch_labels = file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.batch_size) predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") total_examples, correct_predicts = 0, 0 with tf.gfile.GFile(output_predict_file, mode="w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for i, (tokens, label, prediction) in enumerate( zip(batch_tokens, batch_labels, result)): probabilities = prediction["probabilities"] predict_label = prediction["predictions"] if i >= num_actual_predict_examples: break total_examples += 1 if predict_label == label: correct_predicts += 1 sentence = " ".join(tokens) class_probabilities = "\t".join( str(class_probability) for class_probability in probabilities) output_line = "\t".join([ sentence, class_probabilities, str(label), str(predict_label) ]) + "\n" 
writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples acc = float(correct_predicts) / float(total_examples) print("Test accuracy: {}".format(acc))
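# A hedged reader for the `test_results.tsv` written above. The layout assumed here
# mirrors the writer: sentence, one probability column per class, the gold label and
# the predicted label, all tab-separated; `num_classes` is supplied by the caller
# because it is not stored in the file.
def read_test_results(path, num_classes):
    rows = []
    with tf.gfile.GFile(path) as reader:
        for line in reader:
            fields = line.rstrip("\n").split("\t")
            sentence = fields[0]
            probabilities = [float(p) for p in fields[1:1 + num_classes]]
            label, predict_label = fields[1 + num_classes], fields[2 + num_classes]
            rows.append((sentence, probabilities, label, predict_label))
    return rows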
def generate_embeddings(args): """Generates a set of word embeddings from the final four BERT layers. Parameters ---------- args : Namespace Parsed arguments from argparse, containing all of the input arguments """ time_start = time() print(tf.__version__) print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices("GPU"))) # data df = pd.read_pickle(args.dataframe_path) topics = list(df)[5:] xtrain, xtest, ytrain, ytest = train_test_split(df["clean_text"], df.iloc[:, 5:], test_size=0.2, random_state=42) xtrain, xdev, ytrain, ydev = train_test_split(xtrain, ytrain, test_size=0.25, random_state=42) print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape) print(xtrain.shape, xdev.shape, ytrain.shape, ydev.shape) df_train = pd.concat([xtrain, ytrain], axis=1, ignore_index=True) df_dev = pd.concat([xdev, ydev], axis=1, ignore_index=True) df_test = pd.concat([xtest, ytest], axis=1, ignore_index=True) print(f"train shape: {df_train.shape}") print(f"val shape: {df_dev.shape}") print(f"test shape: {df_test.shape}") if args.stage == "train": examples = create_examples(df_train) if args.stage == "dev": examples = create_examples(df_dev) if args.stage == "test": examples = create_examples(df_test) input_fn = create_input_fn_from_examples(examples, args.stage, args.base_working_path, args.max_seq_length, len(topics)) # model init bert_vocab = args.base_path + "/bert_vocab.txt" bert_init_chckpnt = args.base_path + "/bert_model.ckpt" bert_config = args.base_path + "/config.json" tokenization.validate_case_matches_checkpoint(True, bert_init_chckpnt) tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab, do_lower_case=True) output_dir = args.base_working_path + "/output" run_config = tf.estimator.RunConfig( model_dir=output_dir, save_summary_steps=args.save_summary_steps, keep_checkpoint_max=1, save_checkpoints_steps=args.save_checkpoint_steps, ) bert_config = modeling.BertConfig.from_json_file(bert_config) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(topics), init_checkpoint=bert_init_chckpnt, learning_rate=args.learning_rate, num_train_steps=-1, num_warmup_steps=-1, use_tpu=False, use_one_hot_embeddings=False, layer_indexes=args.layer_indices, ) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config, params={"batch_size": args.batch_size}) print("=" * 50) print(f"Beginning predict") print("=" * 50) # inference embeddings_list = np.empty( [len(examples), args.max_seq_length, args.embedding_size]) label_ids_list = np.empty([len(examples), examples[0].labels.shape[0]]) generate_bert_embeddings( args.mode, input_fn, embeddings_list, label_ids_list, estimator, tokenizer, args.layer_indices, ) print("=" * 50) print(f"Embedding list size: {len(embeddings_list)}") print(f"Labels list size: {len(label_ids_list)}") print( f"Embedding size: {len(embeddings_list[0][0])}, {len(embeddings_list[0])}" ) print(f"Saving...") dump_path = f"bert_{args.stage}_{args.mode}.npy" np.save(dump_path, embeddings_list) dump_path_labels = f"bert_{args.stage}_{args.mode}_labels.npy" np.save(dump_path_labels, label_ids_list) print("DONE") print("=" * 50) print( f"Finished generating {args.stage} BERT token level embeddings", f"in {time()-time_start} seconds.\nPath: {dump_path}", )
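# A hedged follow-up for the arrays saved by generate_embeddings above
# (embeddings: [num_examples, max_seq_length, embedding_size], labels:
# [num_examples, num_topics]): mean-pool over the token axis to get one fixed-size
# vector per document for a downstream classifier. The file-name pattern repeats the
# one used in generate_embeddings; padded positions are included in the mean because
# the input mask is not saved alongside the embeddings.
import numpy as np


def load_pooled_embeddings(stage, mode):
    embeddings = np.load("bert_{}_{}.npy".format(stage, mode))
    labels = np.load("bert_{}_{}_labels.npy".format(stage, mode))
    pooled = embeddings.mean(axis=1)  # [num_examples, embedding_size]
    return pooled, labels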
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processor = ccfKeyProcessor() tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) tf.gfile.MakeDirs(FLAGS.model_dir) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.model_dir, keep_checkpoint_max=10000, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.train_data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.dev_data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). 
while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) steps_and_files = [] filenames = tf.gfile.ListDirectory(FLAGS.model_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.model_dir, ckpt_name) global_step = int(cur_filename.split("-")[-1]) tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files.append([global_step, cur_filename]) steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) result_list = list() for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename) result_list.append([global_step, result]) for step, result in result_list: tf.logging.info("\n\n------ step ------" + str(step)) pre, rec, f1 = get_metrics(result["cf"], 3) tf.logging.info("eval_precision: {}".format(pre)) tf.logging.info("eval_recall: {}".format(rec)) tf.logging.info("eval_f1: {}".format(f1)) tf.logging.info("eval_accuracy: {}".format( result["eval_accuracy"])) tf.logging.info("eval_loss: {}".format(result["eval_loss"])) tf.logging.info("-------------------------\n\n") if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.test_data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) """ change """ # Filter out all checkpoints in the directory steps_and_files = [] filenames = tf.gfile.ListDirectory(FLAGS.model_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.model_dir, ckpt_name) global_step = int(cur_filename.split("-")[-1]) tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files.append([global_step, cur_filename]) steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) # Decide whether to evaluate all ckpts if not FLAGS.eval_all_ckpt: steps_and_files = steps_and_files[-1:] for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): tf.logging.info("------ global_step ------" + str(global_step)) # ret = estimator.evaluate( # input_fn=eval_input_fn, # steps=eval_steps, # checkpoint_path=filename) result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=filename) output_predict_file = os.path.join( FLAGS.output_dir, str(global_step) + "_test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
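# `get_metrics(result["cf"], 3)` is called in the eval branch above, but the helper is
# not part of this excerpt. A minimal sketch under the assumption that `cf` is a
# [num_classes, num_classes] confusion matrix with rows as true labels and columns as
# predictions, returning macro-averaged precision, recall and F1; the original
# implementation may average differently.
import numpy as np


def get_metrics(cf, num_classes):
    cf = np.asarray(cf, dtype=np.float64)
    precisions, recalls, f1s = [], [], []
    for c in range(num_classes):
        tp = cf[c, c]
        predicted = cf[:, c].sum()
        actual = cf[c, :].sum()
        precision = tp / predicted if predicted > 0 else 0.0
        recall = tp / actual if actual > 0 else 0.0
        f1 = (2.0 * precision * recall / (precision + recall)
              if precision + recall > 0 else 0.0)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    return np.mean(precisions), np.mean(recalls), np.mean(f1s)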
def main(_): record_dir = os.path.join(FLAGS.data_dir, "trainrecords" + str(FLAGS.max_seq_length)) tf.logging.set_verbosity(tf.logging.INFO) processors = { "gap": GAProcessor, } tf.estimator.RunConfig(model_dir=FLAGS.output_dir) tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict and not FLAGS.pre_train: raise ValueError( "One of `pre_train`, `do_train`, `do_eval` or `do_predict' must be True." ) if FLAGS.do_train and FLAGS.pre_train: raise ValueError( "Cannot `pre_train` and `do_train` in a single pass. First do `pre_train`" ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: num_train_steps = int(FLAGS.epoch_size / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) ############################################################################## # PRE TRAIN # ############################################################################## if FLAGS.pre_train: tsv_dir = os.path.join(FLAGS.data_dir, "trainQ") tf.gfile.MakeDirs(record_dir) in_file = FLAGS.train_data_path if "/" in in_file: in_file = in_file[in_file.rfind("/") + 1:] train_examples = processor.get_train_examples(FLAGS.train_data_path) train_file = in_file + '.tf_record' train_path = os.path.join(record_dir, train_file) file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_path) ''' files = tf.gfile.ListDirectory(tsv_dir) for in_file in files: in_path = os.path.join(tsv_dir, in_file) train_examples = processor.get_train_examples(in_path) train_file = in_file + '.tf_record' train_path = os.path.join(record_dir, train_file) 
file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_path) ''' ############################################################################## # DO TRAIN # ############################################################################## if FLAGS.do_train: tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=record_dir, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) ############################################################################## # DO EVAL # ############################################################################## if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") if not tf.gfile.Exists(eval_file): file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ############################################################################## # DO PREDICT # ############################################################################## if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) print("***************************************" + str(num_actual_predict_examples)) if FLAGS.use_tpu: while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") if not tf.gfile.Exists(predict_file): file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False # True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) print("**************** MAX_SEQ_LENGTH " + str(FLAGS.max_seq_length)) result = estimator.predict(input_fn=predict_input_fn) print("PREDICT_EXAMPLES", len(predict_examples)) print("INPUT_FN", predict_input_fn) print("RESULT", result) print("LABEL LIST", label_list) # My own algorithm for keeping probs away from extremes 0 and 1 # Standard clipping might be better, but this is what I used for kaggle comp. def smooth(prob): return (1.0 - FLAGS.smoothing) * prob + FLAGS.smoothing / 3.0 guids = [] # This is a hack. If order gets shuffled ids will not match predictions. # So predict must NOT be parrarelised! # Tried to do this with feature_forwarding, but so far a fail. for example in predict_examples: guids.append(example.guid) output_predict_file = os.path.join(FLAGS.output_dir, FLAGS.output_file) print("***** PREDICT FILE " + output_predict_file) with tf.gfile.GFile(output_predict_file, "w") as writer: tf.logging.info("***** Predict results *****") writer.write("ID,A,B,NEITHER\n") for i, prediction in enumerate(result): print("***** Predict results ***** " + str(i), end="\r") guid = guids[i] out = prediction['probabilities'].tolist() output_line = guid + ',' + ",".join( str(smooth(result)) for result in out) + "\n" writer.write(output_line) print()
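# The comment above notes that standard clipping might work as well as the linear
# smoothing used for the Kaggle submission. A hedged sketch of that alternative,
# reusing FLAGS.smoothing as the clip margin and renormalising so the three class
# probabilities (A, B, NEITHER) still sum to 1; whether this scores better than
# smooth() is untested here.
import numpy as np


def clip_probabilities(probs, margin=None):
    margin = FLAGS.smoothing if margin is None else margin
    clipped = np.clip(np.asarray(probs, dtype=np.float64), margin, 1.0 - margin)
    return (clipped / clipped.sum()).tolist()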