def write_fake_predictions(output_path, task="MRPC"):
    """
    Write a placeholder GLUE submission file for ``task``.

    Every test example of the task receives the same dummy prediction: the
    first label returned by the task's processor for classification tasks, or
    the constant 2.5 for the regression task "STSB". Test examples are loaded
    from ``os.environ['GLUE_DIR'] + "/" + task``.

    :param output_path: path of the TSV file to write; it gets an
        "index<TAB>prediction" header followed by one row per test example
    :param task: one of "MNLI", "QQP", "WNLI", "CoLA", "STSB" or "diagnostic".
        NOTE: the default "MRPC" is not among the supported tasks, so ``task``
        has to be passed explicitly.
    :return: None
    :raises ValueError: if ``task`` is not supported (raised before any file
        is created)
    :raises KeyError: if the ``GLUE_DIR`` environment variable is not set
    >>> write_fake_predictions("/work/anlausch/replant/bert/predictions/base_32_5e-05_3.0/copy_for_submission/fakes/STS-B.tsv", task="STSB")
    """
    # Classification tasks mapped to the name of their processor class in
    # run_classifier. Names are resolved lazily so that run_classifier is only
    # imported once the task is known to be valid.
    classifier_processors = {
        "MNLI": "MnliProcessor",
        "QQP": "QQPProcessor",
        "WNLI": "WNLIProcessor",
        "CoLA": "ColaProcessor",
        "diagnostic": "DiagnosticProcessor",
    }
    if task != "STSB" and task not in classifier_processors:
        # Fail early: previously an unsupported task created a header-only
        # output file and then crashed on an unbound local variable.
        raise ValueError(
            "Unsupported task {!r}; expected one of {}".format(
                task, sorted(list(classifier_processors) + ["STSB"])))

    if task == "STSB":
        import run_regression
        data_dir = os.environ['GLUE_DIR'] + "/" + task
        test_examples = run_regression.STSBProcessor().get_test_examples(data_dir)
        fake_prediction = 2.5
    else:
        import run_classifier
        data_dir = os.environ['GLUE_DIR'] + "/" + task
        processor = getattr(run_classifier, classifier_processors[task])()
        if task == "MNLI":
            # MnliProcessor.get_test_examples is called with an additional
            # positional False here; its meaning is defined in run_classifier
            # (presumably the matched/mismatched switch -- TODO confirm).
            test_examples = processor.get_test_examples(data_dir, False)
        else:
            test_examples = processor.get_test_examples(data_dir)
        fake_prediction = processor.get_labels()[0]

    with codecs.open(output_path, "w", "utf8") as f_out:
        f_out.write("index\tprediction\n")
        for i, _ in enumerate(test_examples):
            f_out.write(str(i) + "\t" + str(fake_prediction) + "\n")
def parse_predictions(input_path, output_path, task="STSB"):
    """
    Convert raw model outputs into a GLUE submission file.

    Each line of ``input_path`` holds tab-separated numbers for one example.
    For classification tasks the column with the highest value is mapped to
    the label at the same position in the task processor's label list; for
    "STSB" the first column is used as the prediction as-is.

    :param input_path: TSV file with one line of tab-separated floats per example
    :param output_path: TSV file to write; it gets an "index<TAB>prediction"
        header followed by one row per input line
    :param task: "STSB" or one of "MRPC", "RTE", "QNLI", "QNLIV2", "MNLI",
        "SST2", "CoLA", "QQP", "diagnostic"
    :return: None
    :raises ValueError: if ``task`` is not supported (raised before any file
        is opened), or if a line cannot be parsed as floats
    >>> parse_predictions("/work/anlausch/replant/bert/predictions/wn_binary/mnli_neu_32_5e-05_3.0/test_results.tsv", "/work/anlausch/replant/bert/predictions/wn_binary_32_5e-05_3.0/MNLI-mm-neu.tsv", task="MNLI")
    """
    # Classification tasks mapped to the name of their processor class in
    # run_classifier ("QNLIV2" shares the QNLI processor).
    label_processors = {
        "MRPC": "MrpcProcessor",
        "RTE": "RTEProcessor",
        "QNLI": "QNLIProcessor",
        "QNLIV2": "QNLIProcessor",
        "MNLI": "MnliProcessor",
        "SST2": "SST2Processor",
        "CoLA": "ColaProcessor",
        "QQP": "QQPProcessor",
        "diagnostic": "DiagnosticProcessor",
    }
    if task == "STSB":
        # Regression output needs no label list (and no processor module).
        labels = None
    elif task in label_processors:
        import run_classifier
        labels = getattr(run_classifier, label_processors[task])().get_labels()
    else:
        # Fail early instead of crashing on an unbound ``labels`` mid-parse.
        raise ValueError(
            "Unsupported task {!r}; expected one of {}".format(
                task, sorted(list(label_processors) + ["STSB"])))

    predicted_labels = []
    with codecs.open(input_path, "r", "utf8") as f_in:
        for line in f_in:
            # Drop the line terminator so the last column parses cleanly.
            predictions = np.array(line.rstrip("\r\n").split("\t"), dtype=np.float32)
            if labels is None:
                predicted_labels.append(predictions[0])
            else:
                predicted_labels.append(labels[np.argmax(predictions)])

    with codecs.open(output_path, "w", "utf8") as f_out:
        f_out.write("index\tprediction\n")
        for i, prediction in enumerate(predicted_labels):
            f_out.write(str(i) + "\t" + str(prediction) + "\n")
def __init__(self, path):
    """
    Set up the tokenizer and the estimator from the model files under ``path``.

    The directory is expected to contain ``anshaj.ckpt``, ``bert_config.json``
    and ``vocab.txt``. The estimator is created with ``use_tpu=False``, three
    output labels and a batch size of 32 for train, eval and predict.

    :param path: directory holding the checkpoint, BERT config and vocabulary;
        also used as the estimator's ``model_dir``
    """
    batch_size = 32

    self.init_checkpoint = path + "/anshaj.ckpt"
    self.tokenization = run_classifier.tokenization
    # Instantiated as in the existing flow; the instance is not used below.
    processor = run_classifier.ColaProcessor()
    self.MAX_SEQ_LENGTH = 50

    # The tokenizer below is cased (do_lower_case=False), so check the
    # checkpoint against that setting first.
    self.tokenization.validate_case_matches_checkpoint(
        False, self.init_checkpoint)

    config_file = path + "/bert_config.json"
    bert_config = run_classifier.modeling.BertConfig.from_json_file(config_file)

    vocab_path = path + "/vocab.txt"
    self.tokenizer = self.tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=False)

    per_host_mode = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_settings = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=per_host_mode)
    run_config = tf.contrib.tpu.RunConfig(
        model_dir=path,
        cluster=None,
        master=None,
        save_checkpoints_steps=500,
        tpu_config=tpu_settings)

    # No training schedule is supplied (step counts are None).
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=3,
        init_checkpoint=self.init_checkpoint,
        learning_rate=1e-05,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)

    self.estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=batch_size,
        eval_batch_size=batch_size,
        predict_batch_size=batch_size)
def traineval_model(self, local_dir, nb_epoch, batch_size):
    """
    Fine-tune the pretrained uncased BERT model and evaluate it on the dev examples.

    The pretrained files (vocab.txt, bert_config.json, bert_model.ckpt) are
    read from ``BERT_MODEL_PATH`` and the examples from ``<local_dir>/data``.
    TF records, checkpoints, ``final_ckpt.txt`` (path of the last checkpoint)
    and ``eval_results.txt`` are written to ``<local_dir>/output_bert``.

    :param local_dir: working directory that holds ``data`` and receives ``output_bert``
    :param nb_epoch: accepted for interface compatibility but not used; the
        number of epochs is fixed by NUM_TRAIN_EPOCHS below
    :param batch_size: accepted for interface compatibility but not used; the
        batch sizes are fixed by TRAIN_BATCH_SIZE / EVAL_BATCH_SIZE below
    :return: the metrics object returned by ``estimator.evaluate``
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s",BERT_MODEL_PATH)
    logging.info("*:Local Dir%s",local_dir)

    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir,'output_bert')
    DATA_DIR = os.path.join(local_dir,'data')
    logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****',DATA_DIR)

    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    MAX_SEQ_LENGTH = 128
    # Model configs
    # If you wish to fine-tune a model on a larger dataset, use a larger interval;
    # each checkpoint weighs about 1.5 GB.
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8

    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')
    INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB File:%s",VOCAB_FILE)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

    # Since training will happen on GPU, we won't need a cluster resolver.
    tpu_cluster_resolver = None
    # TPUEstimator also supports training on CPU and GPU, so no separate
    # tf.estimator.Estimator has to be defined.
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

    train_examples = processor.get_train_examples(DATA_DIR)
    num_train_steps = int(
        len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
        use_one_hot_embeddings=False)  # Try with True

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # If False training will fall on CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)

    # Train the model.
    logging.info('Starting Training...')
    train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
    tf.logging.info('***** Started training at {} *****'.format(datetime.datetime.now()))
    tf.logging.info(' Num examples = {}'.format(len(train_examples)))
    tf.logging.info(' Batch size = {}'.format(TRAIN_BATCH_SIZE))
    tf.logging.info(" Num steps = %d", num_train_steps)
    train_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=train_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    final_ckpt = estimator.latest_checkpoint()
    print('***** Finished training at {} *****'.format(datetime.datetime.now()))
    logging.info("*****Final Checkpoint*****%s",final_ckpt)
    # Persist the checkpoint path in a small text file next to the checkpoints.
    final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
    with tf.gfile.GFile(final_ckpt_file, "w") as writer:
        writer.write("%s" % final_ckpt)

    # Do Eval
    logging.info('Starting Eval..')
    eval_examples = processor.get_dev_examples(DATA_DIR)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)
    tf.logging.info("***** Running evaluation *****")
    # No padding examples are added here, so the padding count is always 0.
    tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    # Report the batch size the estimator really uses for evaluation
    # (this used to log TRAIN_BATCH_SIZE).
    tf.logging.info(" Batch size = %d", EVAL_BATCH_SIZE)

    # steps=None: evaluate until the input function is exhausted.
    eval_steps = None
    eval_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
def test_model(self, local_dir, nb_epoch, batch_size, bucket_name):
    """
    Run prediction with the fine-tuned BERT checkpoint and upload the results to S3.

    The checkpoint name is taken from ``<local_dir>/output_bert/final_ckpt.txt``,
    the test examples from ``<local_dir>/data``. Class probabilities are written
    tab-separated, one example per line, to
    ``<local_dir>/output_bert/test_results.tsv``; that file is then uploaded to
    ``bucket_name`` using its local path as the object key. The upload is
    best-effort: a failure is logged and does not raise.

    :param local_dir: working directory that holds ``data`` and ``output_bert``
    :param nb_epoch: accepted for interface compatibility but not used
    :param batch_size: accepted for interface compatibility but not used; the
        prediction batch size is fixed by PREDICT_BATCH_SIZE below
    :param bucket_name: name of the S3 bucket receiving ``test_results.tsv``
    :return: always 1, also when the upload failed
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s",BERT_MODEL_PATH)
    logging.info("*:Local Dir%s",local_dir)

    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir,'output_bert')
    DATA_DIR = os.path.join(local_dir,'data')
    logging.info('***** Model output directory: %s*****',OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****',BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****',DATA_DIR)

    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    PREDICT_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    MAX_SEQ_LENGTH = 128
    # Model configs
    # If you wish to fine-tune a model on a larger dataset, use a larger interval;
    # each checkpoint weighs about 1.5 GB.
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8

    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR,'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR,'bert_config.json')

    with open(os.path.join(OUTPUT_DIR,'final_ckpt.txt')) as f:
        content = f.readlines()
    logging.info("***Final_cktp->%s\n",content)
    # Only the checkpoint's file name is kept and re-rooted at OUTPUT_DIR.
    # Strip whitespace so a trailing newline in the file cannot end up in
    # the checkpoint path.
    test_ckpt = content[0].strip().split('/')[-1]
    INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB File:%s",VOCAB_FILE)

    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)

    # The estimator runs with use_tpu=False, so no cluster resolver is needed.
    tpu_cluster_resolver = None
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

    # Prediction only: no training schedule is supplied.
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,  # If False the model runs on CPU or GPU, depending on what is available
        use_one_hot_embeddings=False)  # Try with True

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # If False the model runs on CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=PREDICT_BATCH_SIZE)

    predict_examples = processor.get_test_examples(DATA_DIR)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(predict_examples, label_list,
                                                           MAX_SEQ_LENGTH, tokenizer,
                                                           predict_file)
    tf.logging.info("***** Running prediction*****")
    # No padding examples are added here, so the padding count is always 0.
    tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    # Report the batch size the estimator really uses for prediction
    # (this used to log the unused ``batch_size`` argument).
    tf.logging.info(" Batch size = %d", PREDICT_BATCH_SIZE)

    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
        num_written_lines = 0
        tf.logging.info("***** Predict results *****")
        for (i, prediction) in enumerate(result):
            probabilities = prediction["probabilities"]
            if i >= num_actual_predict_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in probabilities) + "\n"
            writer.write(output_line)
            num_written_lines += 1
    # Internal sanity check: exactly one output line per test example.
    assert num_written_lines == num_actual_predict_examples

    s3 = boto3.resource('s3')
    tf.logging.info("Done with prediction uploading results to S3")
    try:
        s3.Bucket(bucket_name).upload_file(output_predict_file, output_predict_file)
    except Exception:
        # Best-effort upload: keep going, but log at error level with the
        # traceback so the failure is visible in the logs.
        logging.exception("Unable to upload to S3")
    return 1
import json
import pandas as pd

# Module-level configuration for loading the fine-tuned BERT model.
tokenization = run_classifier.tokenization

model_base_path = 'model'  # modify accordingly

# Model artefacts are looked up in <directory of this file>/<model_base_path>/.
init_checkpoint, bert_config_file, vocab_file = (
    os.path.sep.join([
        os.path.dirname(os.path.realpath(__file__)), model_base_path, artefact
    ])
    for artefact in ('model.ckpt', 'bert_config.json', 'vocab.txt')
)

processor = run_classifier.ColaProcessor()
label_list = processor.get_labels()

# emotions.txt sits directly next to this file.
emotions_file = os.path.sep.join(
    [os.path.dirname(os.path.realpath(__file__)), 'emotions.txt'])

# Since the original BERT source code combines train, eval and predict in one
# single configuration, we need to feed such data during initialization; the
# values can be anything, as they are only needed for the run configuration.
BATCH_SIZE = 8
SAVE_SUMMARY_STEPS = 100
SAVE_CHECKPOINTS_STEPS = 500
OUTPUT_DIR = "./output"

# Variables that need to be modified:
labels = list(map(str, range(28)))  # modify based on the labels that you have
MAX_SEQ_LENGTH = 50  # modify based on the seq length
is_lower_case = True  # modify based on uncased or cased