Example #1
def enhance_dataset(bert_path,
                    path,
                    table_path,
                    out_path,
                    max_n=10,
                    threshold=1.9):
    vocab_file = os.path.join(bert_path, 'vocab.txt')
    bert_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)
    tables = load_table(table_path)
    fout = open(out_path, "w", encoding="utf-8")
    cnt = 0
    with open(path) as fin:
        for line in fin:
            if cnt % 1000 == 0:
                print(cnt)
            cnt += 1
            example = json.loads(line.strip())
            table = tables[example["table_id"]]
            h_aug, q_feature = enhance_example(bert_tokenizer,
                                               example["question_tok"],
                                               table["header"], table["rows"],
                                               max_n, threshold)
            example["header_aug"] = h_aug
            example["question_feature"] = q_feature
            json_str = json.dumps(example,
                                  ensure_ascii=False,
                                  default=json_default_type_checker)
            json_str += "\n"
            fout.write(json_str)
    fout.close()
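A minimal usage sketch for enhance_dataset, assuming the WikiSQL-style question/table JSONL layout the function expects; every path below is hypothetical.

# Hypothetical paths -- point these at your own BERT vocab and data files.
enhance_dataset(bert_path="model/uncased_L-12_H-768_A-12",
                path="data/train.jsonl",
                table_path="data/train.tables.jsonl",
                out_path="data/train_augmented.jsonl",
                max_n=10,
                threshold=1.9)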
Example #2
 def __init__(self, logger, train_path, eval_path, bert_path, max_length, batch_size, rate, epoch,
              loss, tf_config, model_path, summary_path, tag2label=None, encoder_layer=11):
     self.logger = logger
     self.train_path = train_path
     self.eval_path = eval_path
     self.bert_path = bert_path
     self.max_length = max_length
     self.batch_size = batch_size
     self.rate = rate
     self.epoch = epoch
     self.loss = loss
     self.encoder_layer = encoder_layer
     self.tf_config = tf_config
     self.model_path = model_path
     self.summary_path = summary_path
     vocab_file = os.path.join(self.bert_path, 'vocab.txt')
     self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)
     self.predictor = None
     if tag2label is None:
         tag2label = {
             'O': 0,
             'B-com': 1,
             'I-com': 2,
             'B-pos': 3,
             'I-pos': 4
         }
     self.tag2label = tag2label
     self.label2tag = {}
     for key in self.tag2label:
         self.label2tag[self.tag2label[key]] = key
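The label2tag loop above simply inverts tag2label; a self-contained sketch of the same inversion with a dict comprehension (using the default tag set from the snippet, without the class context):

tag2label = {'O': 0, 'B-com': 1, 'I-com': 2, 'B-pos': 3, 'I-pos': 4}
# Invert the mapping: label id -> tag string.
label2tag = {label: tag for tag, label in tag2label.items()}
assert label2tag[3] == 'B-pos'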
Example #3
 def __init__(self,
              attribute_name,
              init_checkpoint=FLAGS.init_checkpoint,
              is_training=False):
     """
     checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
      is_training: bool. True for a training model, False for an eval model.
                   Controls whether dropout will be applied.
     """
     self.attribute_name = attribute_name
     self.init_checkpoint = init_checkpoint
     self.is_training = is_training
     self.learning_rate = FLAGS.learning_rate
     self.bert_config = modeling.BertConfig.from_json_file(
         FLAGS.bert_config_file)
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=FLAGS.vocab_file, do_lower_case=False)
     self.data_processor = NerProcessor(self.attribute_name)
     self.attribute_info = attr_ner_label.attribute_infos[attribute_name]
     self.attribute_dict = self.attribute_info['dict']
     self.attribute_label = self.attribute_info['label']
     self.file_prefix = self.attribute_info['file_prefix']
     self.labels = self.data_processor.get_labels()
     self.num_labels = len(self.labels)
     self.graph = tf.Graph()
     with self.graph.as_default():
         self._model_builder()
         self.sess.run(tf.global_variables_initializer())
Example #4
 def load(self, model_dir):
     assert os.path.exists(model_dir)
     with open(os.path.join(model_dir, "config"), "rb") as fin:
         self.config = pickle.load(fin)
     vocab_file = os.path.join(self.config['bert_path'], "vocab.txt")
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=self.config['do_lower_case'])
     saved_model = sorted(glob.glob(os.path.join(model_dir, "exported", "*")))[-1]
     self.predictor = tf.contrib.predictor.from_saved_model(saved_model)
Example #5
def load_bert(BERT_PATH):
    config_file = os.path.join(BERT_PATH, 'config.json')
    vocab_file = os.path.join(BERT_PATH, 'vocab.txt')
    init_checkpoint = os.path.join(BERT_PATH, 'pytorch_model.bin')

    bert_config = BertConfig.from_json_file(config_file)
    bert_config.print_status()

    bert_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                do_lower_case=True)

    bert_model = BertModel(bert_config)
    bert_model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))

    return bert_model, bert_tokenizer, bert_config
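A usage sketch for load_bert, assuming BERT_PATH points at a directory containing config.json, vocab.txt, and pytorch_model.bin; the directory name and the test sentence are made up.

# Hypothetical checkpoint directory.
bert_model, bert_tokenizer, bert_config = load_bert("model/bert_base_uncased")
tokens = bert_tokenizer.tokenize("which company had the highest revenue")
print(tokens)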
Example #6
 def __init__(self, do_train=False):
     self.bert_config = modeling.BertConfig.from_json_file(
         FLAGS.bert_config_file)
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
     self.data_processor = NerProcessor()
     self.label_list = self.data_processor.get_labels()
     self.id2label = {i: label for i, label in enumerate(self.label_list)}
     self.train_examples = None
     self.num_train_steps = None
     self.num_warmup_steps = None
     self.do_train = do_train
     if do_train:
         self.train_examples = self.data_processor.get_train_examples(
             FLAGS.data_dir)
         self.num_train_steps = int(
             len(self.train_examples) / FLAGS.train_batch_size *
             FLAGS.num_train_epochs)
         self.num_warmup_steps = int(self.num_train_steps *
                                     FLAGS.warmup_proportion)
     self.model_fn = model_fn_builder(
         bert_config=self.bert_config,
         num_labels=len(self.label_list),
         init_checkpoint=FLAGS.init_checkpoint,
         learning_rate=FLAGS.learning_rate,
         num_train_steps=self.num_train_steps,
         num_warmup_steps=self.num_warmup_steps,
         use_tpu=FLAGS.use_tpu,
         use_one_hot_embeddings=FLAGS.use_tpu)
     is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
     run_config = tf.contrib.tpu.RunConfig(
         cluster=None,
         master=FLAGS.master,
         model_dir=FLAGS.output_dir,
         save_checkpoints_steps=FLAGS.save_checkpoints_steps,
         tpu_config=tf.contrib.tpu.TPUConfig(
             iterations_per_loop=FLAGS.iterations_per_loop,
             num_shards=FLAGS.num_tpu_cores,
             per_host_input_for_training=is_per_host))
     self.estimator = tf.contrib.tpu.TPUEstimator(
         use_tpu=FLAGS.use_tpu,
         model_fn=self.model_fn,
         config=run_config,
         train_batch_size=FLAGS.train_batch_size,
         eval_batch_size=FLAGS.eval_batch_size,
         predict_batch_size=FLAGS.predict_batch_size)
Example #7
 def fit(self):
     tf.gfile.MakeDirs(self.save_path)
     processor = Processor()
     train_examples, labels = processor.get_train_examples(self.train_path)
     num_train_steps = int(
         len(train_examples) / self.batch_size * self.epoch)
     num_warmup_steps = 0 if self.model else int(num_train_steps *
                                                 self.warmup_ratio)
     if not self.model:
         self.labels = labels
         self.config['labels'] = self.labels
         init_checkpoint = os.path.join(self.bert_path, "bert_model.ckpt")
         bert_config_file = os.path.join(self.bert_path, "bert_config.json")
         bert_config = modeling.BertConfig.from_json_file(bert_config_file)
         model_fn = model_fn_builder(bert_config=bert_config,
                                     num_labels=len(labels),
                                     init_checkpoint=init_checkpoint,
                                     filters=self.filters,
                                     kernel_size=self.kernel_size,
                                     strides=self.strides,
                                     pool_size=self.pool_size,
                                     learning_rate=self.learning_rate,
                                     num_train_steps=num_train_steps,
                                     num_warmup_steps=num_warmup_steps)
         run_config = tf.estimator.RunConfig(
             model_dir=self.save_path,
             save_checkpoints_steps=self.save_checkpoints_steps,
             session_config=self.tf_config)
         self.model = tf.estimator.Estimator(model_fn=model_fn,
                                             config=run_config)
     vocab_file = os.path.join(self.bert_path, "vocab.txt")
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=self.do_lower_case)
     train_file = os.path.join(self.save_path, "train.tf_record")
     file_based_convert_examples_to_features(train_examples, labels,
                                             self.max_length,
                                             self.tokenizer, train_file)
     train_input_fn = file_based_input_fn_builder(
         input_file=train_file,
         seq_length=self.max_length,
         is_training=True,
         drop_remainder=True,
         batch_size=self.batch_size)
     self.model.train(input_fn=train_input_fn, max_steps=num_train_steps)
     with open(os.path.join(self.save_path, "config"), "wb") as out:
         pickle.dump(self.config, out)
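fit() persists self.config with pickle so that a later load() can rebuild the tokenizer and label set; a self-contained sketch of that round-trip (the keys mirror the ones these snippets read and write, the values are made up):

import os
import pickle
import tempfile

config = {'labels': ['O', 'B-com', 'I-com'],
          'bert_path': 'model/chinese_L-12_H-768_A-12',
          'do_lower_case': True}
save_path = tempfile.mkdtemp()
with open(os.path.join(save_path, "config"), "wb") as out:
    pickle.dump(config, out)
with open(os.path.join(save_path, "config"), "rb") as fin:
    restored = pickle.load(fin)
assert restored == config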
Example #8
 def __init__(self, init_checkpoint=FLAGS.init_checkpoint, is_training=False):
     """
     checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
     is_training: bool. True for a training model, False for an eval model.
     Controls whether dropout will be applied.
     """
     self.init_checkpoint = init_checkpoint
     self.is_training = is_training
     self.learning_rate = FLAGS.learning_rate
     self.bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
     self.tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)
     self.data_processor = DataProcessor()
     self.labels = self.data_processor.get_labels()
     self.num_labels = len(self.labels)
     self.graph = tf.Graph()
     with self.graph.as_default():
         self._model_builder()
         self.sess.run(tf.global_variables_initializer())
     # Create the summary writer after the graph is built so the logged
     # graph definition actually contains the model ops.
     self.summary_writer = tf.summary.FileWriter(f'{DIR}/logs', self.graph)
Example #9
 def __init__(self, init_checkpoint, is_training=False):
     """
     checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
     is_training: bool. True for a training model, False for an eval model.
     Controls whether dropout will be applied.
     """
     self.init_checkpoint = init_checkpoint
     self.is_training = is_training
     self.bert_config = modeling.BertConfig.from_json_file(
         f"{DIR}/model/pretrained_model/bert_config.json")
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=f"{DIR}/model/pretrained_model/vocab.txt")
     self.data_processor = DataProcessor()
     self.labels = self.data_processor.get_labels()
     self.num_labels = len(self.labels)
     self.max_seq_length = 256
     self.graph = tf.Graph()
     with self.graph.as_default():
         self._model_builder()
         self.sess.run(tf.global_variables_initializer())
Example #10
def main(_):
    logging.set_verbosity(logging.INFO)
    processors = {"ner": NerProcessor}
    # if not FLAGS.do_train and not FLAGS.do_eval:
    #     raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        _, _ = filed_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer,
            train_file)
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len(train_examples))
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        batch_tokens, batch_labels = filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len(eval_examples))
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # if FLAGS.use_tpu:
        #     eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
        # eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)
        result = estimator.evaluate(input_fn=eval_input_fn)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as wf:
            logging.info("***** Eval results *****")
            confusion_matrix = result["confusion_matrix"]
            p, r, f = metrics.calculate(confusion_matrix, len(label_list) - 1)
            logging.info("***********************************************")
            logging.info("********************P = %s*********************",
                         str(p))
            logging.info("********************R = %s*********************",
                         str(r))
            logging.info("********************F = %s*********************",
                         str(f))
            logging.info("***********************************************")

    if FLAGS.do_predict:
        with open(FLAGS.data_dir + '/label2id.pkl', 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        batch_tokens, batch_labels = filed_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)
        logging.info("***** Running prediction*****")
        logging.info("  Num examples = %d", len(predict_examples))
        logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        # here if the tag is "X" means it belong to its before token, here for convenient evaluate use
        # conlleval.pl we discarding it directly
        Writer(output_predict_file, result, batch_tokens, batch_labels,
               id2label)