Example #1
def initsetting():
    print("DOING INITIALSETTING!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(
        BERT_MODEL_HUB)

    processors = {
        "cola": run_classifier.ColaProcessor,
        "mnli": run_classifier.MnliProcessor,
        "mrpc": run_classifier.MrpcProcessor,
        "korean_sa": run_classifier.KsaProcessor,
    }
    processor = processors[TASK.lower()]()
    label_list = processor.get_labels()

    num_train_steps = 1
    num_warmup_steps = None

    bert_config = modeling.BertConfig.from_json_file(CONFIG_DIR)
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=CKPT_DIR,
        learning_rate=3e-4,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,
        use_one_hot_embeddings=False)

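    # With use_tpu=False, TPUEstimator falls back to a normal Estimator on CPU or GPU.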
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                            model_fn=model_fn,
                                            config=get_run_config(),
                                            train_batch_size=32,
                                            eval_batch_size=8,
                                            predict_batch_size=1)
    return tokenizer, estimator, label_list
Example #2
    def __init__(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        self.config = BertPredictionConfig()

        bert_config = modeling.BertConfig.from_json_file(
            self.config.bert_config_file)

        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=self.config.vocab_file,
            do_lower_case=self.config.do_lower_case)

        self.processor = PlwiProcessor()
        self.label_list = self.processor.get_labels()

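        # Resolve the TPU cluster only when TPU use is requested; otherwise run locally.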
        tpu_cluster_resolver = None
        if self.config.use_tpu and self.config.tpu_name:
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                self.config.tpu_name,
                zone=self.config.tpu_zone,
                project=self.config.gcp_project)

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=None,
            model_dir="out",
            save_checkpoints_steps=None,  # prediction only; no checkpoints are written
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=is_per_host))

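        # Prediction-only setup: the training hyperparameters below are deliberately zeroed out.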
        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(self.label_list),
            init_checkpoint=self.config.init_checkpoint,
            learning_rate=0,
            num_train_steps=0,
            num_warmup_steps=0,
            use_tpu=self.config.use_tpu,
            use_one_hot_embeddings=self.config.use_tpu)

        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=self.config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=self.config.batch_size,
            eval_batch_size=self.config.batch_size,
            predict_batch_size=self.config.batch_size)
Example #3
def load_estimator(config, FLAGS):
    
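    # Materialize the config section as a temporary JSON file, since BertConfig.from_json_file only reads from a path.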
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8', suffix='.json')
    bert_config_file.write(json.dumps({k: str_to_value(v) for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)
    
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                    iterations_per_loop=FLAGS.iterations_per_loop,
                    num_shards=FLAGS.num_tpu_cores,
                    per_host_input_for_training=is_per_host
                )
        )

    model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(FLAGS.task_proc.get_labels()),
            init_checkpoint=FLAGS.init_checkpoint,
            learning_rate=FLAGS.learning_rate,
            num_train_steps=FLAGS.num_train_steps,
            num_warmup_steps=FLAGS.num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu
        )

    estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size
        )
    
    return estimator
Example #4
    def __init__(self, path):
        self.init_checkpoint = path + "/anshaj.ckpt"
        self.tokenization = run_classifier.tokenization
        processor = run_classifier.ColaProcessor()
        BATCH_SIZE = 32
        self.MAX_SEQ_LENGTH = 50
        self.tokenization.validate_case_matches_checkpoint(
            False, self.init_checkpoint)
        bert_config = run_classifier.modeling.BertConfig.from_json_file(
            path + "/bert_config.json")
        self.tokenizer = self.tokenization.FullTokenizer(
            vocab_file=path + "/vocab.txt", do_lower_case=False)
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            model_dir=path,
            cluster=None,
            master=None,
            save_checkpoints_steps=500,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=is_per_host))
        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=3,
            init_checkpoint=self.init_checkpoint,
            learning_rate=1e-05,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=False,
            use_one_hot_embeddings=False)
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=BATCH_SIZE,
            eval_batch_size=BATCH_SIZE,
            predict_batch_size=BATCH_SIZE)
Example #5
tpu_cluster_resolver = None
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=True)

estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=TRAIN_BATCH_SIZE,
                                        eval_batch_size=EVAL_BATCH_SIZE)

print('\n__________\nStarted training at {}'.format(datetime.datetime.now()))
print('\nNum examples = {}'.format(len(train_examples)))
print('\nBatch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("Num steps = %d", num_train_steps)
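The snippet stops right after logging the setup. A minimal sketch of actually launching training with this estimator, following the run_classifier helpers the later examples use (tokenizer and MAX_SEQ_LENGTH are assumed to be defined alongside the constants above):

train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
# Serialize the examples to a TFRecord file once...
run_classifier.file_based_convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
# ...then build the input_fn that the TPUEstimator expects.
train_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=train_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)  # fixed-size batches, as TPUEstimator requires
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)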
Example #6
def main():
    # bert
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t',
                                                   encoding='utf-8',
                                                   suffix='.json')
    bert_config_file.write(
        json.dumps(
            {k: str_to_value(v)
             for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)  # [Note] rewind, since it is read from the beginning
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)
    latest_ckpt = latest_ckpt_model()
    # prefix shared by the model.ckpt-11052.index / model.ckpt-11052.meta files
    finetuned_model_path = latest_ckpt.split('.data-00000-of-00001')[0]
    flags = FLAGS(finetuned_model_path)
    processor = LivedoorProcessor()
    label_list = processor.get_labels()

    # sentencepiece
    tokenizer = tokenization.FullTokenizer(model_file=flags.model_file,
                                           vocab_file=flags.vocab_file,
                                           do_lower_case=flags.do_lower_case)

    # not using a TPU
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    # config
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=flags.master,
        model_dir=flags.output_dir,
        save_checkpoints_steps=flags.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=flags.iterations_per_loop,
            num_shards=flags.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=flags.init_checkpoint,
                                learning_rate=flags.learning_rate,
                                num_train_steps=flags.num_train_steps,
                                num_warmup_steps=flags.num_warmup_steps,
                                use_tpu=flags.use_tpu,
                                use_one_hot_embeddings=flags.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=flags.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=flags.train_batch_size,
        eval_batch_size=flags.eval_batch_size,
        predict_batch_size=flags.predict_batch_size)

    # fetch the test data collection
    predict_examples = processor.get_test_examples(flags.data_dir)
    predict_file = tempfile.NamedTemporaryFile(mode='w+t',
                                               encoding='utf-8',
                                               suffix='.tf_record')
    """Convert a set of `InputExample`s to a TFRecord file."""
    """出力: predict_file.name """
    # https://github.com/yoheikikuta/bert-japanese/blob/master/src/run_classifier.py#L371-L380
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            flags.max_seq_length, tokenizer,
                                            predict_file.name)
    predict_drop_remainder = bool(flags.use_tpu)

    # build the input closure passed to TPUEstimator
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file.name,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)
    # inference
    result = estimator.predict(input_fn=predict_input_fn)
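    # predict() returns a generator; list() below drains it to run the full pass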
    result = list(result)

    # compute accuracy
    accracy(result, label_list)
Example #7
def construct_bert_predictor(
        init_checkpoint='/home/ubuntu/bert/models/imdb_350_16_output/model.ckpt-7812',
        dataset_name='sst'):
  model_dir = './bert/models/uncased_L-12_H-768_A-12'
  bert_config_file = os.path.join(model_dir, 'bert_config.json')
  vocab_file = os.path.join(model_dir, 'vocab.txt')

  output_dir = '.'
  save_checkpoints_steps = 1000
  iterations_per_loop = 1000
  num_tpu_cores = 8
  if dataset_name == 'sst':
    if init_checkpoint is None:
      init_checkpoint = './bert/models/sst_output/model.ckpt-6313'
    max_seq_length = 128
    task_name = 'sst-2'
    batch_size = 32
  else:
    # only 'sst' is configured; any other name would leave task_name and
    # batch_size undefined below
    raise ValueError('Unknown dataset_name: %s' % dataset_name)

  bert_config = modeling.BertConfig.from_json_file(bert_config_file)

  # `processors` is assumed to be the task-name -> processor mapping defined elsewhere
  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=True)

  tpu_cluster_resolver = None

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=None,
      model_dir=output_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=init_checkpoint,
      learning_rate=2e-5,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=False,
      use_one_hot_embeddings=False)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=False,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=16,
      eval_batch_size=8,
      predict_batch_size=batch_size)

  return processor, estimator, tokenizer
Example #8
tpu_config = tf.contrib.tpu.TPUConfig(iterations_per_loop=ITERATIONS_PER_LOOP,
                                      num_shards=NUM_TPU_CORES,
                                      per_host_input_for_training=IS_PER_HOST)

run_config = tf.contrib.tpu.RunConfig(
    cluster=TPU_CLUSTER_RESOLVER,
    master=MASTER,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tpu_config)

model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=2,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=NUM_TRAIN_STEPS,
    num_warmup_steps=NUM_WARMUP_STEPS,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=USE_ONE_HOT_EMBEDDING)

estimator = tf.contrib.tpu.TPUEstimator(use_tpu=USE_TPU,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=TRAIN_BATCH_SIZE,
                                        eval_batch_size=EVAL_BATCH_SIZE,
                                        predict_batch_size=PREDICT_BATCH_SIZE)

predictions = estimator.predict(test_input_fn)

probs0 = []
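The fragment ends just as it begins collecting probabilities. A plausible continuation, sketched after the way the other examples here consume the prediction generator (taking column 0 of "probabilities" is an assumption about which class is wanted):

for prediction in predictions:
    # each element is a dict holding a per-class probability vector
    probs0.append(prediction["probabilities"][0])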
Example #9
    def setup_estimator(self, num_train_examples, label_list):
        """
        setup tensorflow estimator to use
        and set as self.estimator
        """
        ## clean output
        if num_train_examples > 0 and tf.gfile.Exists(self.config.output_dir):
            tf.gfile.DeleteRecursively(self.config.output_dir)
        ## make output
        tf.gfile.MakeDirs(self.config.output_dir)

        bert_config = modeling.BertConfig.from_json_file(
            self.config.bert_config_file)

        if self.config.max_seq_length > bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the BERT model "
                "was only trained up to sequence length %d" %
                (self.config.max_seq_length,
                 bert_config.max_position_embeddings))

        tpu_cluster_resolver = None
        if self.config.use_tpu and self.config.tpu_name:
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                self.config.tpu_name,
                zone=self.config.tpu_zone,
                project=self.config.gcp_project)

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=self.config.master,
            model_dir=self.config.output_dir,
            save_checkpoints_steps=self.config.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=self.config.iterations_per_loop,
                num_shards=self.config.num_tpu_cores,
                per_host_input_for_training=is_per_host))

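        # Derive the step budget from dataset size and epochs; warmup is a fixed proportion of it.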
        num_train_steps = int(num_train_examples /
                              self.config.train_batch_size *
                              self.config.num_train_epochs)
        num_warmup_steps = int(num_train_steps * self.config.warmup_proportion)

        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=self.config.init_checkpoint,
            learning_rate=self.config.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=self.config.use_tpu,
            use_one_hot_embeddings=self.config.use_tpu)

        # If TPU is not available, this will fall back to normal Estimator on CPU
        # or GPU.
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=self.config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=self.config.train_batch_size,
            eval_batch_size=self.config.eval_batch_size,
            predict_batch_size=self.config.predict_batch_size)
        self.estimator = estimator
        self.num_train_steps = num_train_steps
Example #10
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    processors = {
        "mana169": ManaProcessor169,
    }

    tokenization.validate_case_matches_checkpoint(DO_LOWER_CASE, INIT_CKPT)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)

    if MAX_SEQ_LENGTH > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (MAX_SEQ_LENGTH, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(OUTPUT_DIR)

    task_name = 'mana169'

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)

    tpu_cluster_resolver = None

    hooks = []
    # create a logging tensor hook because this takes forever on cpu
    logger = tf.train.LoggingTensorHook({"Input": "IteratorGetNext:0"},
                                        every_n_iter=1)
    hooks.append(logger)
    # debug_hook = tfdbg.LocalCLIDebugHook()
    # hooks.append(debug_hook)

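    # A bare RunConfig (no TPUConfig) is enough for this CPU/GPU prediction run.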
    run_config = tf.contrib.tpu.RunConfig(cluster=tpu_cluster_resolver,
                                          model_dir=OUTPUT_DIR,
                                          save_checkpoints_steps=1)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=INIT_CKPT,
                                learning_rate=5e-5,
                                num_train_steps=None,
                                num_warmup_steps=None,
                                use_tpu=False,
                                use_one_hot_embeddings=False)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=PREDICT_BATCH_SIZE)

    input_file = sys.argv[1]

    predict_examples = read_input_examples(input_file)
    num_actual_predict_examples = len(predict_examples)
    # if FLAGS.use_tpu:
    #     # TPU requires a fixed batch size for all batches, therefore the number
    #     # of examples must be a multiple of the batch size, or else examples
    #     # will get dropped. So we pad with fake examples which are ignored
    #     # later on.
    #     while len(predict_examples) % FLAGS.predict_batch_size != 0:
    #         predict_examples.append(PaddingInputExample())

    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            MAX_SEQ_LENGTH, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=predict_drop_remainder)

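    # Run inference with the logging hook attached so progress stays visible on CPU.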
    result = estimator.predict(input_fn=predict_input_fn, hooks=hooks)

    output_predict_file = os.path.join(OUTPUT_DIR, sys.argv[2])

    scores_list = []

    num_written_lines = 0
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
            break
        # writer.write(output_line)
        scores_list.append(probabilities)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples

    scores_array = np.array(scores_list)
    # write the scores in a useful form
    top3_scores = []
    all_topics = processor.get_labels()
    for i, row in enumerate(scores_array):
        top3_indices = row.argsort()[::-1][:3]
        # index 1, score 1, index 2, score 2, etc
        l = []
        # all but the original input column
        l += [input_df.values[i][j] for j in range(input_df.shape[1] - 1)]
        # the original input itself, with newlines removed
        l.append(str(input_df.values[i][-1]).replace('\n', ''))
        for v in top3_indices:
            l.append(all_topics[v])
            l.append(row[v])
        top3_scores.append(l)

    score_df = pd.DataFrame(
        top3_scores,
        columns=list(input_df.columns.values) +
        ["Class 1", "Score 1", "Class 2", "Score 2", "Class 3", "Score 3"])
    score_df.to_csv(output_predict_file, index=False)
Example #11
    def traineval_model(self, local_dir, nb_epoch, batch_size):
        """
        Use the BERT Uncased language model to train on
        new data
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*: BERT MODEL PATH: %s", BERT_MODEL_PATH)
        logging.info("*: Local Dir: %s", local_dir)


        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # if you wish to fine-tune a model on a larger dataset, use a larger interval
        SAVE_CHECKPOINTS_STEPS = 1000
        # each checkpoint weighs about 1.5 GB
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
        INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Since training will happen on GPU, we won't need a cluster resolver
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        train_examples = processor.get_train_examples(DATA_DIR)
        num_train_steps = int(
            len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
            use_one_hot_embeddings=False)  # try with True

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE)

        # Train the model.
        logging.info('Starting Training...')
        train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
        tf.logging.info('***** Started training at {} *****'.format(datetime.datetime.now()))
        tf.logging.info('  Num examples = {}'.format(len(train_examples)))
        tf.logging.info('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=True,
            drop_remainder=True)
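        # drop_remainder=True keeps batch shapes fixed, matching the TPUEstimator input contract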
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        final_ckpt = estimator.latest_checkpoint()
        print('***** Finished training at {} *****'.format(datetime.datetime.now()))
        logging.info("*****Final Checkpoint*****%s",final_ckpt)
        final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
        with tf.gfile.GFile(final_ckpt_file, "w") as writer:
            writer.write("%s" % final_ckpt)


        # Do Eval
        logging.info('Starting Eval...')
        eval_examples = processor.get_dev_examples(DATA_DIR)
        num_actual_eval_examples = len(eval_examples)
        eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", EVAL_BATCH_SIZE)
        eval_steps = None

        eval_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        
        return result
Example #12
    def test_model(self, local_dir, nb_epoch, batch_size, bucket_name):
        """
        Use the fine-tuned BERT Uncased model to run
        prediction on the test data
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*: BERT MODEL PATH: %s", BERT_MODEL_PATH)
        logging.info("*: Local Dir: %s", local_dir)


        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        PREDICT_BATCH_SIZE = 32
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # if you wish to fine-tune a model on a larger dataset, use a larger interval
        SAVE_CHECKPOINTS_STEPS = 1000
        # each checkpoint weighs about 1.5 GB
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
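        # recover the fine-tuned checkpoint that the training run recorded in final_ckpt.txt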
        with open(os.path.join(OUTPUT_DIR,'final_ckpt.txt')) as f:
            content = f.readlines()
            logging.info("*** Final_ckpt -> %s\n", content)
        test_ckpt = content[0].split('/')[-1]
        INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Prediction will happen on CPU/GPU, so no cluster resolver is needed
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        train_examples = None
        num_train_steps = None
        num_warmup_steps = None
        

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # if False, prediction falls back to CPU or GPU, depending on what is available
            use_one_hot_embeddings=False)  # try with True

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # if False, prediction falls back to CPU or GPU, depending on what is available
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE,
            predict_batch_size=PREDICT_BATCH_SIZE)
        
        predict_examples = processor.get_test_examples(DATA_DIR)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(predict_examples, label_list,
                                                MAX_SEQ_LENGTH, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)

        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
        s3 = boto3.resource('s3')
        tf.logging.info("Done with prediction uploading results to S3")
        try:
            s3.Bucket(bucket_name).upload_file(output_predict_file, output_predict_file)
        except Exception as err:
            logging.info("Unable to upload to S3")
            logging.info(err)


        return 1
Example #13
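# Note: this fragment assumes earlier definitions of bert_config, label_list,
# init_checkpoint, BATCH_SIZE, and is_per_host, as in the other examples:
# is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2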
run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    cluster=None,
    master=None,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=is_per_host))

# model
model_fn = run_classifier.model_fn_builder(bert_config=bert_config,
                                           num_labels=len(label_list),
                                           init_checkpoint=init_checkpoint,
                                           learning_rate=5e-5,
                                           num_train_steps=None,
                                           num_warmup_steps=None,
                                           use_tpu=False,
                                           use_one_hot_embeddings=False)

# estimator
estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=BATCH_SIZE,
                                        eval_batch_size=BATCH_SIZE,
                                        predict_batch_size=BATCH_SIZE)

# emotions
emotions = pd.read_csv(emotions_file, header=None)
Example #14
def main(argv):

    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    VOCAB_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/vocab.txt'
    CONFIG_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_config.json'
    INIT_CHECKPOINT = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_model.ckpt'
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    model_dir = "{}/{}".format("/opt/tftuner", mltunerUtil.get_job_id())

    # fixed model parameters
    TRAIN_BATCH_SIZE = mltunerUtil.get_batch_size()
    NUM_TRAIN_EPOCHS = 3
    LEARNING_RATE = mltunerUtil.get_learning_rate()
    WARMUP_PROPORTION = 0.05
    EVAL_BATCH_SIZE = 8
    MAX_SEQ_LENGTH = 128

    # data loading
    train_df = pd.read_csv(
        '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/train.csv')
    train_df = train_df.sample(1000)
    train, test = train_test_split(train_df, test_size=0.1, random_state=42)
    train_lines, train_labels = train.question_text.values, train.target.values
    test_lines, test_labels = test.question_text.values, test.target.values
    label_list = ['0', '1']
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    train_examples = create_examples(train_lines, 'train', labels=train_labels)

    num_train_steps = int(
        len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

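    # parameter-server distribution; the tf.compat.v1 TPU RunConfig/TPUEstimator still run on CPU/GPU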
    strategy = tf.distribute.experimental.ParameterServerStrategy()
    session_config = mltunerUtil.get_tf_session_config()
    config = tf.compat.v1.estimator.tpu.RunConfig(
        train_distribute=strategy,
        model_dir=model_dir,
        save_checkpoints_steps=None,
        save_checkpoints_secs=None,
        session_config=session_config)

    model_fn = run_classifier.model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
        num_labels=len(label_list),
        init_checkpoint=None,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        use_one_hot_embeddings=True)

    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)

    class LoggerHook(tf.estimator.SessionRunHook):
        """Logs loss and runtime."""
        def __init__(self):
            self.last_run_timestamp = time.time()

        def after_run(self, run_context, run_values):
            session: tf.Session = run_context.session
            loss, step = session.run([
                tf.compat.v1.get_collection("losses")[0],
                tf.compat.v1.get_collection("global_step_read_op_cache")[0]
            ])
            logging.debug("step:{} loss:{}".format(step, loss))
            mltunerUtil.report_iter_loss(step, loss,
                                         time.time() - self.last_run_timestamp)
            self.last_run_timestamp = time.time()

    # prepare for train
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    train_input_fn = input_fn_builder(features=train_features,
                                      seq_length=MAX_SEQ_LENGTH,
                                      is_training=True,
                                      drop_remainder=True)

    predict_examples = create_examples(test_lines, 'test')
    predict_features = run_classifier.convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = input_fn_builder(features=predict_features,
                                        seq_length=MAX_SEQ_LENGTH,
                                        is_training=False,
                                        drop_remainder=False)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[LoggerHook()])
    eval_spec = tf.estimator.EvalSpec(input_fn=predict_input_fn)

    # wait for the chief to be ready
    if not (mltunerUtil.is_chief() or mltunerUtil.is_ps()):
        time.sleep(1)
        if not tf.io.gfile.exists(model_dir):
            logging.debug("wait for chief init")
            time.sleep(1)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #15
def mrpc_classifier(sent_list1, sent_list2, args):
    TUNED_MODEL_DIR = args.tuned_model_dir

    config = {
        "task_name": 'MRPC',
        "do_predict": True,
        "vocab_file": f"{TUNED_MODEL_DIR}/vocab.txt",
        "bert_config_file": f"{TUNED_MODEL_DIR}/bert_config.json",
        "init_checkpoint": f"{TUNED_MODEL_DIR}",
        "max_seq_length": 128,
        "output_dir": f"{TUNED_MODEL_DIR}",
        "do_lower_case": True,
        "predict_batch_size": 8
    }

    bert_config = modeling.BertConfig.from_json_file(
        config["bert_config_file"])
    processor = run_classifier.MrpcProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config["vocab_file"], do_lower_case=config["do_lower_case"])

    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=config["output_dir"],
        save_checkpoints_steps=1000,
    )

    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=config["init_checkpoint"],
        learning_rate=5e-5,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=config["predict_batch_size"])

    predict_examples = get_predict_examples(sent_list1, sent_list2)
    num_actual_predict_examples = len(predict_examples)

    predict_file = os.path.join(config["output_dir"], "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, config["max_seq_length"], tokenizer,
        predict_file)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", config["predict_batch_size"])

    predict_drop_remainder = False
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=config["max_seq_length"],
        is_training=False,
        drop_remainder=predict_drop_remainder)

    result = estimator.predict(input_fn=predict_input_fn)

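    # keep the probability of label "1" (the positive/paraphrase class for MRPC) for each pair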
    probabilities = [
        prediction["probabilities"][1] for (i, prediction) in enumerate(result)
    ]
    return probabilities