Example #1
import codecs
import os


def write_fake_predictions(output_path, task="MRPC"):
    """Write constant ("fake") predictions for a GLUE task in submission format.

    Supported tasks: MNLI, QQP, WNLI, CoLA, STSB, diagnostic.
    :param output_path: path of the submission-format TSV file to write
    :param task: GLUE task name
    :return: None
    >>> write_fake_predictions("/work/anlausch/replant/bert/predictions/base_32_5e-05_3.0/copy_for_submission/fakes/STS-B.tsv", task="STSB")
    """
    if task != "STSB":
        import run_classifier
    else:
        import run_regression
    if task == "MNLI":
        test_examples = run_classifier.MnliProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task, False)
        labels = run_classifier.MnliProcessor().get_labels()
    elif task == "QQP":
        test_examples = run_classifier.QQPProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task)
        labels = run_classifier.QQPProcessor().get_labels()
    elif task == "WNLI":
        test_examples = run_classifier.WNLIProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task)
        labels = run_classifier.WNLIProcessor().get_labels()
    elif task == "CoLA":
        test_examples = run_classifier.ColaProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task)
        labels = run_classifier.ColaProcessor().get_labels()
    elif task == "STSB":
        test_examples = run_regression.STSBProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task)
    elif task == "diagnostic":
        test_examples = run_classifier.DiagnosticProcessor().get_test_examples(
            os.environ['GLUE_DIR'] + "/" + task)
        labels = run_classifier.DiagnosticProcessor().get_labels()
    with codecs.open(output_path, "w", "utf8") as f_out:
        f_out.write("index\tprediction\n")
        if task != "STSB":
            for i, data in enumerate(test_examples):
                f_out.write(str(i) + "\t" + str(labels[0]) + "\n")
        else:
            for i, data in enumerate(test_examples):
                f_out.write(str(i) + "\t" + str(2.5) + "\n")
        f_out.close()
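
# Usage sketch (assumptions flagged): GLUE_DIR must point at the standard GLUE
# data download with one subdirectory per task; the paths below are
# illustrative, not from the original code.
if __name__ == "__main__":
    os.environ.setdefault("GLUE_DIR", "/path/to/glue_data")  # hypothetical path
    write_fake_predictions("/tmp/WNLI.tsv", task="WNLI")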
Example #2
import codecs

import numpy as np


def parse_predictions(input_path, output_path, task="STSB"):
    """Convert raw estimator output into a GLUE submission-format TSV.

    :param input_path: path to the raw test_results.tsv written by the estimator
    :param output_path: path of the submission-format TSV file to write
    :param task: GLUE task name
    :return: None
    >>> parse_predictions("/work/anlausch/replant/bert/predictions/wn_binary/mnli_neu_32_5e-05_3.0/test_results.tsv", "/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/MNLI-mm-neu.tsv", task="MNLI")
    """
    if task != "STSB":
        import run_classifier
    else:
        import run_regression
    predicted_labels = []
    if task == "MRPC":
        #ids = MrpcProcessor().get_test_examples(os.environ['GLUE_DIR'] + "/MRPC")
        labels = run_classifier.MrpcProcessor().get_labels()
    if task == "RTE":
        labels = run_classifier.RTEProcessor().get_labels()
    if task == "QNLI":
        labels = run_classifier.QNLIProcessor().get_labels()
    if task == "QNLIV2":
        labels = run_classifier.QNLIProcessor().get_labels()
    if task == "MNLI":
        labels = run_classifier.MnliProcessor().get_labels()
    if task == "SST2":
        labels = run_classifier.SST2Processor().get_labels()
    if task == "CoLA":
        labels = run_classifier.ColaProcessor().get_labels()
    if task == "QQP":
        labels = run_classifier.QQPProcessor().get_labels()
    if task == "diagnostic":
        labels = run_classifier.DiagnosticProcessor().get_labels()
    with codecs.open(input_path, "r", "utf8") as f_in:
        for line in f_in:
            # each line holds tab-separated class probabilities
            # (or a single score for the STS-B regression task)
            predictions = np.array(line.split("\t"), dtype=np.float32)
            if task != "STSB":
                predicted_index = np.argmax(predictions)
                predicted_labels.append(labels[predicted_index])
            else:
                predicted_labels.append(predictions[0])
    with codecs.open(output_path, "w", "utf8") as f_out:
        f_out.write("index\tprediction\n")
        for i, prediction in enumerate(predicted_labels):
            f_out.write(str(i) + "\t" + str(prediction) + "\n")
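
# Usage sketch (assumptions flagged): the input is a raw test_results.tsv as
# written by estimator.predict in the later examples; both paths are
# illustrative.
if __name__ == "__main__":
    parse_predictions("/tmp/test_results.tsv", "/tmp/RTE.tsv", task="RTE")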
Example #3
    def __init__(self, path):
        """Set up a BERT TPUEstimator for prediction from a saved checkpoint."""
        self.init_checkpoint = path + "/anshaj.ckpt"
        self.tokenization = run_classifier.tokenization
        processor = run_classifier.ColaProcessor()
        BATCH_SIZE = 32
        self.MAX_SEQ_LENGTH = 50
        self.tokenization.validate_case_matches_checkpoint(
            False, self.init_checkpoint)
        bert_config = run_classifier.modeling.BertConfig.from_json_file(
            path + "/bert_config.json")
        self.tokenizer = self.tokenization.FullTokenizer(
            vocab_file=path + "/vocab.txt", do_lower_case=False)
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            model_dir=path,
            cluster=None,
            master=None,
            save_checkpoints_steps=500,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=is_per_host))
        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=3,
            init_checkpoint=self.init_checkpoint,
            learning_rate=1e-05,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=False,
            use_one_hot_embeddings=False)
        # use_tpu=False, so the estimator falls back to CPU or GPU
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=BATCH_SIZE,
            eval_batch_size=BATCH_SIZE,
            predict_batch_size=BATCH_SIZE)
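
    # Usage sketch (assumptions flagged): this __init__ is a method, so it
    # presumably belongs to a predictor wrapper class; "BertPredictor" and the
    # model directory below are hypothetical names for illustration only.
    #
    #   predictor = BertPredictor("/path/to/model_dir")  # holds anshaj.ckpt,
    #   predictor.estimator.predict(...)                 # bert_config.json, vocab.txt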
Example #4
    def traineval_model(self, local_dir,
                        nb_epoch,
                        batch_size):
        """
        Fine-tune the uncased BERT base language model on new data,
        then evaluate it on the dev set.
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*: BERT model path: %s", BERT_MODEL_PATH)
        logging.info("*: Local dir: %s", local_dir)

        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # If you fine-tune on a larger dataset, use a larger checkpoint
        # interval; each checkpoint weighs about 1.5 GB
        SAVE_CHECKPOINTS_STEPS = 1000
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
        INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Since training will happen on GPU, we won't need a cluster resolver
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        train_examples = processor.get_train_examples(DATA_DIR)
        num_train_steps = int(
            len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
        num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
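        # Worked example of the step math (hedged): CoLA has roughly 8,551
        # training sentences, so 8551 / 32 * 3.0 ≈ 801 train steps, and
        # 801 * 0.1 ≈ 80 warmup steps over which the learning rate ramps up.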

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # with use_tpu=False, training falls back to CPU or GPU
            use_one_hot_embeddings=False)  # try True when running on an actual TPU

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # with use_tpu=False, training falls back to CPU or GPU
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE)

        # Train the model.
        logging.info('Starting Training...')
        train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
        tf.logging.info('***** Started training at %s *****', datetime.datetime.now())
        tf.logging.info('  Num examples = %d', len(train_examples))
        tf.logging.info('  Batch size = %d', TRAIN_BATCH_SIZE)
        tf.logging.info('  Num steps = %d', num_train_steps)
        train_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=train_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        final_ckpt = estimator.latest_checkpoint()
        tf.logging.info('***** Finished training at %s *****', datetime.datetime.now())
        logging.info("***** Final checkpoint: %s *****", final_ckpt)
        final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
        with tf.gfile.GFile(final_ckpt_file, "w") as writer:
            writer.write("%s" % final_ckpt)


        # Do eval
        logging.info('Starting eval...')
        eval_examples = processor.get_dev_examples(DATA_DIR)
        num_actual_eval_examples = len(eval_examples)
        eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", TRAIN_BATCH_SIZE)
        eval_steps = None

        eval_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        return result
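
# Usage sketch (assumptions flagged): the enclosing class is not shown, so
# "trainer" is a hypothetical instance; local_dir must contain a data/ folder
# in CoLA TSV format, and BERT_MODEL_PATH must point at uncased_L-12_H-768_A-12.
#
#   result = trainer.traineval_model("/tmp/run", nb_epoch=3, batch_size=32)
#   print(result["eval_accuracy"])  # metric key produced by run_classifier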
Example #5
    def test_model(self, local_dir,
                   nb_epoch,
                   batch_size,
                   bucket_name):
        """
        Run prediction on the test set with the fine-tuned BERT Uncased
        checkpoint and upload the results to S3.
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("*: BERT model path: %s", BERT_MODEL_PATH)
        logging.info("*: Local dir: %s", local_dir)

        mod_name = self.model_name
        BERT_MODEL = 'uncased_L-12_H-768_A-12'
        BERT_PRETRAINED_DIR = BERT_MODEL_PATH
        OUTPUT_DIR = os.path.join(local_dir,'output_bert')
        DATA_DIR = os.path.join(local_dir,'data')
        logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
        logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
        logging.info('***** DATA directory: %s *****',DATA_DIR)
        TRAIN_BATCH_SIZE = 32
        EVAL_BATCH_SIZE = 8
        PREDICT_BATCH_SIZE = 32
        LEARNING_RATE = 2e-5
        NUM_TRAIN_EPOCHS = 3.0
        WARMUP_PROPORTION = 0.1
        MAX_SEQ_LENGTH = 128
        # Model configs
        # If you fine-tune on a larger dataset, use a larger checkpoint
        # interval; each checkpoint weighs about 1.5 GB
        SAVE_CHECKPOINTS_STEPS = 1000
        ITERATIONS_PER_LOOP = 1000
        NUM_TPU_CORES = 8

        VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
        BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
        with open(os.path.join(OUTPUT_DIR, 'final_ckpt.txt')) as f:
            content = f.readlines()
            logging.info("*** final_ckpt file contents: %s", content)
        # final_ckpt.txt holds the full checkpoint path; keep only the basename
        test_ckpt = content[0].split('/')[-1]
        INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
        DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

        logging.info("Found VOCAB File:%s",VOCAB_FILE)
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
        tf.gfile.MakeDirs(OUTPUT_DIR)
        processor = run_classifier.ColaProcessor()
        label_list = processor.get_labels()
        tokenizer = tokenization.FullTokenizer(
            vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

        # Prediction runs on CPU/GPU here, so no cluster resolver is needed
        tpu_cluster_resolver = None
        # TPUEstimator also supports training on CPU and GPU. You don't need to define a separate tf.estimator.Estimator.
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=OUTPUT_DIR,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=ITERATIONS_PER_LOOP,
                num_shards=NUM_TPU_CORES,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        # No training happens here, but model_fn_builder still expects these
        train_examples = None
        num_train_steps = None
        num_warmup_steps = None

        model_fn = run_classifier.model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=INIT_CHECKPOINT,
            learning_rate=LEARNING_RATE,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,  # with use_tpu=False, the model runs on CPU or GPU
            use_one_hot_embeddings=False)  # try True when running on an actual TPU

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,  # with use_tpu=False, the model runs on CPU or GPU
            model_fn=model_fn,
            config=run_config,
            train_batch_size=TRAIN_BATCH_SIZE,
            eval_batch_size=EVAL_BATCH_SIZE,
            predict_batch_size=PREDICT_BATCH_SIZE)
        
        predict_examples = processor.get_test_examples(DATA_DIR)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(predict_examples, label_list,
                                                MAX_SEQ_LENGTH, tokenizer,
                                                predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", batch_size)

        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
        s3 = boto3.resource('s3')
        tf.logging.info("Done with prediction; uploading results to S3")
        try:
            # the S3 object key mirrors the local file path
            s3.Bucket(bucket_name).upload_file(output_predict_file, output_predict_file)
        except Exception as err:
            logging.info("Unable to upload to S3: %s", err)

        return 1
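
# Usage sketch (assumptions flagged): "tester" is a hypothetical instance of
# the enclosing class; traineval_model must have run first so that
# output_bert/final_ckpt.txt exists, and boto3 needs valid AWS credentials.
# The bucket name is illustrative.
#
#   tester.test_model("/tmp/run", nb_epoch=3, batch_size=32,
#                     bucket_name="my-predictions-bucket")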
Example #6
import json
import os

import pandas as pd

import run_classifier

tokenization = run_classifier.tokenization
model_base_path = 'model'  # modify accordingly
init_checkpoint = os.path.sep.join([
    os.path.dirname(os.path.realpath(__file__)), model_base_path, 'model.ckpt'
])
bert_config_file = os.path.sep.join([
    os.path.dirname(os.path.realpath(__file__)), model_base_path,
    'bert_config.json'
])
vocab_file = os.path.sep.join([
    os.path.dirname(os.path.realpath(__file__)), model_base_path, 'vocab.txt'
])
processor = run_classifier.ColaProcessor()
label_list = processor.get_labels()
emotions_file = os.path.sep.join(
    [os.path.dirname(os.path.realpath(__file__)), 'emotions.txt'])

# The original BERT source code combines train, eval and predict in one single
# configuration, so we must supply these values at initialization; they can be
# anything, as they are only needed for the run configuration.
BATCH_SIZE = 8
SAVE_SUMMARY_STEPS = 100
SAVE_CHECKPOINTS_STEPS = 500
OUTPUT_DIR = "./output"

# variables that need to be modified
labels = [str(i) for i in range(28)]  # modify based on the labels that you have
MAX_SEQ_LENGTH = 50  # modify based on the sequence length
is_lower_case = True  # modify based on cased vs. uncased model
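
# A hedged completion of this fragment, wired the same way as the earlier
# examples and the upstream run_classifier API. The predict_text helper, the
# dummy label, and the TF 1.x import are illustrative additions, not part of
# the original snippet.
import tensorflow as tf  # TF 1.x, for tf.contrib.tpu

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=is_lower_case)
bert_config = run_classifier.modeling.BertConfig.from_json_file(bert_config_file)
run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig())
model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=len(labels),
    init_checkpoint=init_checkpoint,
    learning_rate=2e-5,  # unused at predict time, but required by the builder
    num_train_steps=None,
    num_warmup_steps=None,
    use_tpu=False,
    use_one_hot_embeddings=False)
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=BATCH_SIZE,
    eval_batch_size=BATCH_SIZE,
    predict_batch_size=BATCH_SIZE)

def predict_text(texts):
    """Return per-class probabilities for a list of raw strings (sketch)."""
    examples = [run_classifier.InputExample(guid=str(i), text_a=t, text_b=None,
                                            label=labels[0])  # dummy label
                for i, t in enumerate(texts)]
    features = run_classifier.convert_examples_to_features(
        examples, labels, MAX_SEQ_LENGTH, tokenizer)
    input_fn = run_classifier.input_fn_builder(
        features, seq_length=MAX_SEQ_LENGTH, is_training=False,
        drop_remainder=False)
    return [p["probabilities"] for p in estimator.predict(input_fn)]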