def enhance_dataset(bert_path, path, table_path, out_path, max_n=10, threshold=1.9):
    """Augment every example of a JSON-lines dataset and write the result.

    Each non-blank line of ``path`` is parsed as a JSON object. The table stored
    under the example's ``"table_id"`` is looked up in the tables returned by
    ``load_table(table_path)`` and ``enhance_example`` is called with the
    example's ``"question_tok"`` and the table's ``"header"`` and ``"rows"``.
    The two values it returns are saved on the example as ``"header_aug"`` and
    ``"question_feature"``; the example is then written to ``out_path`` as one
    JSON object per line.

    Args:
        bert_path: Directory that contains the BERT ``vocab.txt``.
        path: Input JSON-lines file (read as UTF-8).
        table_path: Forwarded to ``load_table``.
        out_path: Output JSON-lines file (overwritten, written as UTF-8).
        max_n: Forwarded unchanged to ``enhance_example``.
        threshold: Forwarded unchanged to ``enhance_example``.
    """
    vocab_file = os.path.join(bert_path, 'vocab.txt')
    bert_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    tables = load_table(table_path)

    # Both handles are managed by `with` so they are closed even when a line
    # fails to parse. The input is read as UTF-8 explicitly because the output
    # is written with ensure_ascii=False in UTF-8; relying on the locale
    # default would break on platforms whose default is not UTF-8.
    with open(path, encoding="utf-8") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        for cnt, line in enumerate(fin):
            # Progress indicator: prints 0, 1000, 2000, ...
            if cnt % 1000 == 0:
                print(cnt)

            line = line.strip()
            if not line:
                # A blank (typically trailing) line is not an example.
                continue

            example = json.loads(line)
            table = tables[example["table_id"]]
            h_aug, q_feature = enhance_example(bert_tokenizer,
                                               example["question_tok"],
                                               table["header"],
                                               table["rows"],
                                               max_n,
                                               threshold)
            example["header_aug"] = h_aug
            example["question_feature"] = q_feature

            json_str = json.dumps(example,
                                  ensure_ascii=False,
                                  default=json_default_type_checker)
            fout.write(json_str + "\n")
def __init__(self, logger, train_path, eval_path, bert_path, max_length, batch_size, rate,
             epoch, loss, tf_config, model_path, summary_path, tag2label=None,
             encoder_layer=11):
    """Store the training configuration and build the tokenizer and tag maps.

    All arguments are kept on the instance under the same name. A
    ``FullTokenizer`` is created from ``<bert_path>/vocab.txt`` and
    ``self.predictor`` starts out as ``None``.

    Args:
        tag2label: Mapping from tag string to integer label. When ``None`` a
            five-entry O / B-com / I-com / B-pos / I-pos mapping is used.
            ``self.label2tag`` is always the inverse of the mapping in effect.
        encoder_layer: Stored as ``self.encoder_layer`` (default 11).
    """
    # Logging and data / output locations.
    self.logger = logger
    self.train_path = train_path
    self.eval_path = eval_path
    self.bert_path = bert_path
    self.model_path = model_path
    self.summary_path = summary_path

    # Hyper-parameters and runtime configuration.
    self.max_length = max_length
    self.batch_size = batch_size
    self.rate = rate
    self.epoch = epoch
    self.loss = loss
    self.encoder_layer = encoder_layer
    self.tf_config = tf_config

    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(self.bert_path, 'vocab.txt'))
    self.predictor = None

    # A fresh default dict is created per instance so instances never share it.
    self.tag2label = tag2label if tag2label is not None else {
        'O': 0,
        'B-com': 1,
        'I-com': 2,
        'B-pos': 3,
        'I-pos': 4,
    }
    self.label2tag = {label: tag for tag, label in self.tag2label.items()}
def __init__(self, attribute_name, init_checkpoint=FLAGS.init_checkpoint, is_training=False):
    """Set up the tokenizer, label set and TensorFlow graph for one attribute.

    Args:
        attribute_name: Key into ``attr_ner_label.attribute_infos``; also
            passed to ``NerProcessor``.
        init_checkpoint: Initial checkpoint (usually from a pre-trained BERT
            model).
        is_training: bool. True for a training model, False for an eval
            model. Controls whether dropout will be applied.

    NOTE(review): the ``init_checkpoint`` default is read from ``FLAGS`` when
    the ``def`` statement is executed, not when the constructor is called.
    """
    self.attribute_name = attribute_name
    self.init_checkpoint = init_checkpoint
    self.is_training = is_training
    self.learning_rate = FLAGS.learning_rate

    self.bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=False)
    self.data_processor = NerProcessor(self.attribute_name)

    # Per-attribute metadata (the redundant second assignment of
    # self.attribute_name that used to sit here has been dropped).
    self.attribute_info = attr_ner_label.attribute_infos[attribute_name]
    self.attribute_dict = self.attribute_info['dict']
    self.attribute_label = self.attribute_info['label']
    self.file_prefix = self.attribute_info['file_prefix']

    self.labels = self.data_processor.get_labels()
    self.num_labels = len(self.labels)

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Presumably _model_builder() creates self.sess - confirm; it is not
        # assigned anywhere in this constructor.
        self._model_builder()
        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and creates the same init op.
        self.sess.run(tf.global_variables_initializer())
def load(self, dir):
    """Restore the pickled config, the tokenizer and the latest exported model.

    Reads ``<dir>/config`` (a pickle) into ``self.config``, builds a
    ``FullTokenizer`` from ``config['bert_path']/vocab.txt`` using
    ``config['do_lower_case']``, and loads the entry of ``<dir>/exported/``
    that sorts last by name into ``self.predictor``.

    Args:
        dir: Directory previously written by training. (The name shadows the
            builtin but is kept so keyword callers continue to work.)

    Raises:
        FileNotFoundError: if ``dir`` does not exist or ``<dir>/exported/``
            contains no entries. Both are checked before ``self`` is touched.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(dir):
        raise FileNotFoundError("model directory does not exist: {}".format(dir))

    export_root = os.path.join(dir, "exported")
    exported = sorted(glob.glob(os.path.join(export_root, "*")))
    if not exported:
        # Previously this surfaced as a bare IndexError from `[...][-1]`.
        raise FileNotFoundError(
            "no exported model found under: {}".format(export_root))

    # NOTE: pickle.load can execute arbitrary code; only load config files
    # that come from a trusted training run.
    with open(os.path.join(dir, "config"), "rb") as fin:
        self.config = pickle.load(fin)

    vocab_file = os.path.join(self.config['bert_path'], "vocab.txt")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=self.config['do_lower_case'])

    # Entries are ordered by name; the last one is used.
    self.predictor = tf.contrib.predictor.from_saved_model(exported[-1])
def load_bert(BERT_PATH):
    """Load a PyTorch BERT model together with its tokenizer and config.

    Expects ``config.json``, ``vocab.txt`` and ``pytorch_model.bin`` inside
    ``BERT_PATH``. The weights are loaded onto the CPU.

    Returns:
        A ``(bert_model, bert_tokenizer, bert_config)`` tuple.
    """
    def in_bert_dir(filename):
        # Resolve a file name relative to the BERT directory.
        return os.path.join(BERT_PATH, filename)

    bert_config = BertConfig.from_json_file(in_bert_dir('config.json'))
    bert_config.print_status()

    bert_tokenizer = tokenization.FullTokenizer(
        vocab_file=in_bert_dir('vocab.txt'), do_lower_case=True)

    bert_model = BertModel(bert_config)
    state_dict = torch.load(in_bert_dir('pytorch_model.bin'), map_location='cpu')
    bert_model.load_state_dict(state_dict)

    return bert_model, bert_tokenizer, bert_config
def __init__(self, do_train=False):
    """Create the tokenizer, label maps and TPUEstimator from FLAGS.

    Args:
        do_train: When true, the training examples are loaded from
            ``FLAGS.data_dir`` and the number of training / warm-up steps is
            derived from them; otherwise those three attributes stay ``None``.
    """
    self.bert_config = modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    self.data_processor = NerProcessor()
    self.label_list = self.data_processor.get_labels()
    self.id2label = dict(enumerate(self.label_list))

    # Training-only state; populated below when do_train is set.
    self.train_examples = None
    self.num_train_steps = None
    self.num_warmup_steps = None
    self.do_train = do_train
    if do_train:
        self.train_examples = self.data_processor.get_train_examples(
            FLAGS.data_dir)
        batches_per_epoch = len(self.train_examples) / FLAGS.train_batch_size
        self.num_train_steps = int(batches_per_epoch * FLAGS.num_train_epochs)
        self.num_warmup_steps = int(
            self.num_train_steps * FLAGS.warmup_proportion)

    self.model_fn = model_fn_builder(
        bert_config=self.bert_config,
        num_labels=len(self.label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=self.num_train_steps,
        num_warmup_steps=self.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tpu_config)

    self.estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=self.model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
# fit(self): train the estimator on the examples found at self.train_path.
#
# Steps visible in the code below:
#   * creates self.save_path with tf.gfile.MakeDirs;
#   * loads (train_examples, labels) via Processor().get_train_examples;
#   * num_train_steps = int(len(train_examples) / self.batch_size * self.epoch);
#   * num_warmup_steps is 0 when self.model is already truthy, otherwise
#     int(num_train_steps * self.warmup_ratio);
#   * builds model_fn from <bert_path>/bert_model.ckpt and bert_config.json,
#     wraps it in a tf.estimator.Estimator stored as self.model, and builds
#     self.tokenizer from <bert_path>/vocab.txt;
#   * converts the examples to <save_path>/train.tf_record, trains up to
#     num_train_steps, then pickles self.config to <save_path>/config.
#
# NOTE(review): the original indentation of this block has been lost, so the
# extent of the `if not self.model:` body cannot be determined from this text
# (it may cover only the two label assignments, or everything up to the
# tokenizer construction). Confirm against the upstream file before editing.
def fit(self): tf.gfile.MakeDirs(self.save_path) processor = Processor() train_examples, labels = processor.get_train_examples(self.train_path) num_train_steps = int( len(train_examples) / self.batch_size * self.epoch) num_warmup_steps = 0 if self.model else int(num_train_steps * self.warmup_ratio) if not self.model: self.labels = labels self.config['labels'] = self.labels init_checkpoint = os.path.join(self.bert_path, "bert_model.ckpt") bert_config_file = os.path.join(self.bert_path, "bert_config.json") bert_config = modeling.BertConfig.from_json_file(bert_config_file) model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(labels), init_checkpoint=init_checkpoint, filters=self.filters, kernel_size=self.kernel_size, strides=self.strides, pool_size=self.pool_size, learning_rate=self.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps) run_config = tf.estimator.RunConfig( model_dir=self.save_path, save_checkpoints_steps=self.save_checkpoints_steps, session_config=self.tf_config) self.model = tf.estimator.Estimator(model_fn=model_fn, config=run_config) vocab_file = os.path.join(self.bert_path, "vocab.txt") self.tokenizer = tokenization.FullTokenizer( vocab_file=vocab_file, do_lower_case=self.do_lower_case) train_file = os.path.join(self.save_path, "train.tf_record") file_based_convert_examples_to_features(train_examples, labels, self.max_length, self.tokenizer, train_file) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=self.max_length, is_training=True, drop_remainder=True, batch_size=self.batch_size) self.model.train(input_fn=train_input_fn, max_steps=num_train_steps) with open(os.path.join(self.save_path, "config"), "wb") as out: pickle.dump(self.config, out)
def __init__(self, init_checkpoint=FLAGS.init_checkpoint, is_training=False):
    """Set up the tokenizer, label set, summary writer and TensorFlow graph.

    Args:
        init_checkpoint: Initial checkpoint (usually from a pre-trained BERT
            model).
        is_training: bool. True for a training model, False for an eval
            model. Controls whether dropout will be applied.

    NOTE(review): the ``init_checkpoint`` default is read from ``FLAGS`` when
    the ``def`` statement is executed, not when the constructor is called.
    """
    self.init_checkpoint = init_checkpoint
    self.is_training = is_training
    self.learning_rate = FLAGS.learning_rate

    self.bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    self.tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)

    self.data_processor = DataProcessor()
    self.labels = self.data_processor.get_labels()
    self.num_labels = len(self.labels)

    self.graph = tf.Graph()
    # NOTE(review): the writer is handed the process-wide default graph, not
    # self.graph, which is the graph the model is built in below. Left
    # unchanged; confirm whether self.graph was intended.
    self.summary_writer = tf.summary.FileWriter(f'{DIR}/logs', tf.get_default_graph())

    with self.graph.as_default():
        # Presumably _model_builder() creates self.sess - confirm; it is not
        # assigned anywhere in this constructor.
        self._model_builder()
        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and creates the same init op.
        self.sess.run(tf.global_variables_initializer())
def __init__(self, init_checkpoint, is_training=False):
    """Set up the tokenizer, label set and TensorFlow graph.

    The BERT config and vocabulary are read from
    ``{DIR}/model/pretrained_model/``; ``self.max_seq_length`` is fixed at 256.

    Args:
        init_checkpoint: Initial checkpoint (usually from a pre-trained BERT
            model).
        is_training: bool. True for a training model, False for an eval
            model. Controls whether dropout will be applied.
    """
    self.init_checkpoint = init_checkpoint
    self.is_training = is_training

    self.bert_config = modeling.BertConfig.from_json_file(
        f"{DIR}/model/pretrained_model/bert_config.json")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=f"{DIR}/model/pretrained_model/vocab.txt")

    self.data_processor = DataProcessor()
    self.labels = self.data_processor.get_labels()
    self.num_labels = len(self.labels)
    self.max_seq_length = 256

    self.graph = tf.Graph()
    with self.graph.as_default():
        # Presumably _model_builder() creates self.sess - confirm; it is not
        # assigned anywhere in this constructor.
        self._model_builder()
        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and creates the same init op.
        self.sess.run(tf.global_variables_initializer())
def main(_):
    """Train, evaluate and/or predict with the BERT NER model, driven by FLAGS.

    The phases are independent and selected by ``FLAGS.do_train``,
    ``FLAGS.do_eval`` and ``FLAGS.do_predict``:

    * train: converts the training examples to ``train.tf_record`` in
      ``FLAGS.output_dir`` and trains for the computed number of steps.
    * eval: converts the dev examples to ``eval.tf_record``, evaluates, logs
      precision / recall / F computed from the confusion matrix and writes
      them to ``eval_results.txt``.
    * predict: converts the test examples to ``predict.tf_record``, runs
      prediction and hands the results to ``Writer`` together with the
      id-to-label map loaded from ``<data_dir>/label2id.pkl``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: if ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings`` or ``FLAGS.task_name`` is unknown.
    """
    logging.set_verbosity(logging.INFO)
    processors = {"ner": NerProcessor}

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # Step counts are only known when training; model_fn_builder receives
    # None for them otherwise.
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        _, _ = filed_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        logging.info("***** Running training *****")
        logging.info(" Num examples = %d", len(train_examples))
        logging.info(" Batch size = %d", FLAGS.train_batch_size)
        logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        # The returned token/label batches are not needed for evaluation.
        _, _ = filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
        logging.info("***** Running evaluation *****")
        logging.info(" Num examples = %d", len(eval_examples))
        logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.evaluate(input_fn=eval_input_fn)

        logging.info("***** Eval results *****")
        confusion_matrix = result["confusion_matrix"]
        p, r, f = metrics.calculate(confusion_matrix, len(label_list) - 1)
        logging.info("***********************************************")
        logging.info("********************P = %s*********************", str(p))
        logging.info("********************R = %s*********************", str(r))
        logging.info("********************F = %s*********************", str(f))
        logging.info("***********************************************")

        # The results file used to be opened for writing but left empty;
        # persist the metrics that are logged above.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as wf:
            wf.write("P = %s\n" % str(p))
            wf.write("R = %s\n" % str(r))
            wf.write("F = %s\n" % str(f))

    if FLAGS.do_predict:
        # NOTE: pickle.load can execute arbitrary code; label2id.pkl must come
        # from a trusted preprocessing run.
        with open(os.path.join(FLAGS.data_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        batch_tokens, batch_labels = filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file)

        logging.info("***** Running prediction*****")
        logging.info(" Num examples = %d", len(predict_examples))
        logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        # Original author's note: a tag of "X" means the piece belongs to the
        # preceding token; such entries are discarded so the output can be
        # scored with conlleval.pl. (Behaviour of Writer not visible here.)
        Writer(output_predict_file, result, batch_tokens, batch_labels, id2label)