def read_data(dataset_path, from_list_to_examples, class_labels,
              max_seq_length, use_tpu, tokenizer):
    """Read a dataset file and prepare an input_fn acceptable to estimators."""
    df_src = pd.read_csv(dataset_path, sep='\t')
    input_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8',
                                             suffix='.tf_record')
    file_based_convert_examples_to_features(
        from_list_to_examples([list(df_src)] + df_src.values.tolist()),
        class_labels, max_seq_length, tokenizer, input_file.name)
    input_fn = file_based_input_fn_builder(
        input_file=input_file.name,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=use_tpu,
    )
    # The input_file object must be kept alive by the caller for the whole
    # prediction run; otherwise the temporary .tf_record file is deleted.
    return df_src, input_file, input_fn
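# A minimal usage sketch for read_data (not from the original source).
# vocab.txt, data.tsv, make_examples and estimator are placeholders for a
# vocabulary file, a TSV dataset, a user-supplied row-to-InputExample
# converter, and an already-built Estimator.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt",
                                       do_lower_case=True)
df, tfrecord_handle, input_fn = read_data(
    dataset_path="data.tsv",
    from_list_to_examples=make_examples,
    class_labels=["0", "1"],
    max_seq_length=128,
    use_tpu=False,
    tokenizer=tokenizer)
predictions = list(estimator.predict(input_fn=input_fn))
# tfrecord_handle must stay referenced until predict() is fully consumed.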
def create_tfrecord_All(kmer):
    tsv_root = "DatasetAll/asTSV/" + str(kmer) + "kmer_tsv_data/"
    tfrecord_root = "DatasetAll/asTF_Record/" + str(kmer) + "kmer_tfrecord/"
    vocab_file = "vocab/vocab_" + str(kmer) + "kmer.txt"
    processor = ColaProcessor()
    label_list = processor.get_labels()
    examples = processor.fatma_get_train_examples_All(tsv_root)
    train_file = tfrecord_root + "train_All.tf_record"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    # max_seq_length is hard-coded to 128 to match the fine-tuning setup.
    file_based_convert_examples_to_features(examples, label_list, 128,
                                            tokenizer, train_file)
def predict(self, dir_in, filename, dir_out):
    predict_examples = self.processor.get_examples_from(
        os.path.join(dir_in, filename))
    num_actual_predict_examples = len(predict_examples)
    if self.config.use_tpu:
        # TPU requires a fixed batch size, so pad with fake examples that
        # are dropped again when the results are written out.
        while len(predict_examples) % self.config.batch_size != 0:
            predict_examples.append(PaddingInputExample())
    predict_file = os.path.join(
        dir_out, "{0}.tf_record".format(
            self.get_filename_without_extension(filename)))
    file_based_convert_examples_to_features(predict_examples,
                                            self.label_list,
                                            self.config.max_seq_length,
                                            self.tokenizer, predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", self.config.batch_size)
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=self.config.max_seq_length,
        is_training=False,
        drop_remainder=self.config.use_tpu)
    result = self.estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(
        dir_out, "{0}_result.tsv".format(
            self.get_filename_without_extension(filename)))
    with tf.gfile.GFile(output_predict_file, "w") as writer:
        num_written_lines = 0
        tf.logging.info("***** Predict results *****")
        for (i, prediction) in enumerate(result):
            probabilities = prediction["probabilities"]
            if i >= num_actual_predict_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in probabilities) + "\n"
            writer.write(output_line)
            num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
def predict(self, X, y=None):
    predict_examples = X
    num_actual_predict_examples = len(predict_examples)
    if self.config.use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the
        # number of examples must be a multiple of the batch size, or else
        # examples will get dropped. So we pad with fake examples which are
        # ignored later on.
        while len(predict_examples) % self.config.predict_batch_size != 0:
            predict_examples.append(PaddingInputExample())
    predict_file = os.path.join(self.config.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, y,
                                            self.config.max_seq_length,
                                            self.tokenizer, predict_file)
    print("***** Running prediction *****")
    print("  Num examples = %d (%d actual, %d padding)" %
          (len(predict_examples), num_actual_predict_examples,
           len(predict_examples) - num_actual_predict_examples))
    print("  Batch size = %d" % self.config.predict_batch_size)
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=self.config.max_seq_length,
        is_training=False,
        drop_remainder=self.config.use_tpu)
    result = self.estimator.predict(input_fn=predict_input_fn)
    fulldata = []
    for (i, prediction) in enumerate(result):
        if i >= num_actual_predict_examples:
            break
        probs = list(prediction["probabilities"])
        data = [X[i].label,
                y[numpy.argmax(probs)],  # label with the highest probability
                X[i].text_a]
        data.extend(probs)
        fulldata.append(data)
    cols = ["true", "pred", "text"] + list(y)
    df = pandas.DataFrame(data=fulldata, columns=cols)
    return df
def train(self, X, y):
    # X is the list of training examples, y is the label list.
    train_file = os.path.join(self.config.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(X, y,
                                            self.config.max_seq_length,
                                            self.tokenizer, train_file)
    print("***** Running training *****")
    print("  Num examples = %d" % len(X))
    print("  Batch size = %d" % self.config.train_batch_size)
    print("  Num steps = %d" % self.num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=self.config.max_seq_length,
        is_training=True,
        drop_remainder=True)
    self.estimator.train(input_fn=train_input_fn,
                         max_steps=self.num_train_steps)
def fasta2record(input_file, output_file, vocab_file, step=1):
    """Convert a .fasta input_file into a TFRecord of k-mer features.

    Validates that the file alternates header (">") and sequence lines,
    then returns the number of sequences found.
    """
    with open(input_file) as f:
        lines = f.readlines()
    for index, line in enumerate(lines):
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
    seq_num = int(len(lines) / 2)
    with open("temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                # Split the sequence into chunks of `step` characters,
                # separated by spaces, so the tokenizer sees k-mers.
                seq = ""
                line = line.strip()
                length = len(line)
                for i in range(0, length, step):
                    if length - i >= step:
                        seq += line[i:i + step] + " "
                    else:
                        seq += line[i:] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)
    processor = ColaProcessor()
    label_list = processor.get_labels()
    examples = processor.ljy_get_dev_examples("temp.tsv")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    # Write to the output_file given by the caller.
    file_based_convert_examples_to_features(examples, label_list, 128,
                                            tokenizer, output_file)
    return seq_num
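# A hedged usage sketch for fasta2record (not from the original source).
# sequences.fasta, out.tf_record and vocab/vocab_3kmer.txt are placeholders.
n = fasta2record(input_file="sequences.fasta",
                 output_file="out.tf_record",
                 vocab_file="vocab/vocab_3kmer.txt",
                 step=3)  # 3-mer tokenization
print("converted %d sequences" % n)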
def evaluate(self, X, y):
    eval_examples = X
    num_actual_eval_examples = len(eval_examples)
    if self.config.use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the
        # number of examples must be a multiple of the batch size, or else
        # examples will get dropped. So we pad with fake examples which are
        # ignored later on. These do NOT count towards the metric (all
        # tf.metrics support a per-instance weight, and these get a weight
        # of 0.0).
        while len(eval_examples) % self.config.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())
    eval_file = os.path.join(self.config.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples, y,
                                            self.config.max_seq_length,
                                            self.tokenizer, eval_file)
    print("***** Running evaluation *****")
    print("  Num examples = %d (%d actual, %d padding)" %
          (len(eval_examples), num_actual_eval_examples,
           len(eval_examples) - num_actual_eval_examples))
    print("  Batch size = %d" % self.config.eval_batch_size)
    # eval_steps = None tells the estimator to run through the entire set;
    # on TPU, however, the number of steps must be given explicitly.
    eval_steps = None
    if self.config.use_tpu:
        assert len(eval_examples) % self.config.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // self.config.eval_batch_size)
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=self.config.max_seq_length,
        is_training=False,
        drop_remainder=self.config.use_tpu)
    result = self.estimator.evaluate(input_fn=eval_input_fn,
                                     steps=eval_steps)
    return result
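# The train/evaluate/predict methods above appear to form a scikit-learn
# style wrapper. A hedged end-to-end sketch; BertClassifier, make_config
# and load_examples are assumed names not shown in the snippets.
clf = BertClassifier(config=make_config(output_dir="./out", use_tpu=False))
label_list = ["0", "1"]
clf.train(load_examples("train.tsv"), label_list)   # list of InputExample
metrics = clf.evaluate(load_examples("dev.tsv"), label_list)
df = clf.predict(load_examples("test.tsv"), label_list)
print(metrics, df.head())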
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=DO_LOWER_CASE)

# In[6]:

# Converting training examples to features
print("################ Processing Training Data #####################")
TRAIN_TF_RECORD = os.path.join(OUTPUT_DIR, "train.tf_record")
train_examples = processor.get_train_examples(TASK_DATA_DIR)
num_train_examples = len(train_examples)
num_train_steps = int(num_train_examples / TRAIN_BATCH_SIZE *
                      NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
run_classifier.file_based_convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TRAIN_TF_RECORD)

# ## Creating Classification Model

# In[7]:

def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    # Instantiate the BERT model.
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
def main():
    # BERT config, written to a temporary JSON file and read back.
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t',
                                                   encoding='utf-8',
                                                   suffix='.json')
    bert_config_file.write(
        json.dumps(
            {k: str_to_value(v) for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)  # [Note] rewind so the file is read from the start
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)
    latest_ckpt = latest_ckpt_model()
    # Prefix of the model.ckpt-11052.index / model.ckpt-11052.meta files.
    finetuned_model_path = latest_ckpt.split('.data-00000-of-00001')[0]
    flags = FLAGS(finetuned_model_path)
    processor = LivedoorProcessor()
    label_list = processor.get_labels()
    # SentencePiece tokenizer.
    tokenizer = tokenization.FullTokenizer(model_file=flags.model_file,
                                           vocab_file=flags.vocab_file,
                                           do_lower_case=flags.do_lower_case)
    # Not using a TPU.
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    # Run config.
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=flags.master,
        model_dir=flags.output_dir,
        save_checkpoints_steps=flags.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=flags.iterations_per_loop,
            num_shards=flags.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=flags.init_checkpoint,
                                learning_rate=flags.learning_rate,
                                num_train_steps=flags.num_train_steps,
                                num_warmup_steps=flags.num_warmup_steps,
                                use_tpu=flags.use_tpu,
                                use_one_hot_embeddings=flags.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=flags.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=flags.train_batch_size,
        eval_batch_size=flags.eval_batch_size,
        predict_batch_size=flags.predict_batch_size)
    # Fetch the test example collection.
    predict_examples = processor.get_test_examples(flags.data_dir)
    predict_file = tempfile.NamedTemporaryFile(mode='w+t',
                                               encoding='utf-8',
                                               suffix='.tf_record')
    # Convert a set of `InputExample`s to a TFRecord file at predict_file.name.
    # https://github.com/yoheikikuta/bert-japanese/blob/master/src/run_classifier.py#L371-L380
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            flags.max_seq_length, tokenizer,
                                            predict_file.name)
    # Build the input closure handed to TPUEstimator.
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file.name,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=flags.use_tpu)
    # Inference.
    result = estimator.predict(input_fn=predict_input_fn)
    result = list(result)
    # Compute the accuracy.
    accracy(result, label_list)
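# The snippet above delegates scoring to an external accracy() helper that
# is not shown. A minimal sketch in the same spirit, assuming each element
# of `result` is a dict with a "probabilities" vector; predicted_labels is
# a hypothetical name, and true labels would have to come from the processor.
import numpy as np

def predicted_labels(result, label_list):
    # Map each probability vector to the label with the highest score.
    return [label_list[int(np.argmax(p["probabilities"]))] for p in result]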
def main():
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = MyProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=8,  # effectively unused when no TPU is involved
            per_host_input_for_training=is_per_host))
    # train_examples was already prepared by the processor above.
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)
    eval_steps = None  # run through the entire eval set
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)
    for i in range(FLAGS.watch_times):
        num_train_steps = int((i + 1) * len(train_examples) /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        model_fn = model_fn_builder(bert_config=bert_config,
                                    num_labels=len(label_list),
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=FLAGS.learning_rate,
                                    num_train_steps=num_train_steps,
                                    num_warmup_steps=num_warmup_steps,
                                    use_one_hot_embeddings=False)
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config,
                                           params={"batch_size": 8})
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # eval_steps = None tells the estimator to run through the entire set.
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "a") as writer:
            tf.logging.info("***** Eval results *****")
            writer.write("The %d-th eval\n" % i)
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("\n")
def test_model(self, local_dir, nb_epoch, batch_size, bucket_name):
    """Use the fine-tuned BERT Uncased language model to predict on new data."""
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s", BERT_MODEL_PATH)
    logging.info("*:Local Dir:%s", local_dir)
    mod_name = self.model_name
    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir, 'output_bert')
    DATA_DIR = os.path.join(local_dir, 'data')
    logging.info('***** Model output directory: %s *****', OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****',
                 BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****', DATA_DIR)
    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    PREDICT_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    MAX_SEQ_LENGTH = 128
    # Model configs: if you wish to fine-tune a model on a larger dataset,
    # use a larger checkpoint interval (each checkpoint weighs about 1.5 GB).
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8
    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
    # Read the final checkpoint name written at the end of training.
    with open(os.path.join(OUTPUT_DIR, 'final_ckpt.txt')) as f:
        content = f.readlines()
    logging.info("***Final_ckpt->%s\n", content)
    test_ckpt = content[0].split('/')[-1]
    INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB File:%s", VOCAB_FILE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    # Since inference will happen on GPU, we won't need a cluster resolver.
    # TPUEstimator also supports running on CPU and GPU, so there is no need
    # to define a separate tf.estimator.Estimator.
    tpu_cluster_resolver = None
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, falls back to CPU or GPU, whichever is available
        use_one_hot_embeddings=False)  # try with True
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # if False, falls back to CPU or GPU, whichever is available
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=PREDICT_BATCH_SIZE)
    predict_examples = processor.get_test_examples(DATA_DIR)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer, predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    # PREDICT_BATCH_SIZE is what the estimator actually uses, not the
    # batch_size argument of this method.
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
        num_written_lines = 0
        tf.logging.info("***** Predict results *****")
        for (i, prediction) in enumerate(result):
            probabilities = prediction["probabilities"]
            if i >= num_actual_predict_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in probabilities) + "\n"
            writer.write(output_line)
            num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
    s3 = boto3.resource('s3')
    tf.logging.info("Done with prediction; uploading results to S3")
    try:
        s3.Bucket(bucket_name).upload_file(output_predict_file,
                                           output_predict_file)
    except Exception as err:
        logging.info("Unable to upload to S3")
        logging.info(err)
    return 1
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "cola": run_classifier.ColaProcessor,
        "mnli": run_classifier.MnliProcessor,
        "mrpc": run_classifier.MrpcProcessor,
    }
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    tf.gfile.MakeDirs(FLAGS.output_dir)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(len(train_examples) / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    model_fn = model_fn_builder(
        num_labels=len(label_list),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        bert_hub_module_handle=FLAGS.bert_hub_module_handle)
    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    if FLAGS.do_train:
        train_features = run_classifier.convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = run_classifier.input_fn_builder(
            features=train_features,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_features = run_classifier.convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # eval_steps = None tells the estimator to run through the entire
        # set; on TPU, however, the number of steps must be given explicitly.
        eval_steps = None
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will
            # truncate the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
        eval_input_fn = run_classifier.input_fn_builder(
            features=eval_features,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=FLAGS.use_tpu)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        if FLAGS.use_tpu:
            # Discard the batch remainder if running on TPU.
            n = len(predict_examples)
            predict_examples = predict_examples[:(n - n %
                                                  FLAGS.predict_batch_size)]
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        run_classifier.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = run_classifier.file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=FLAGS.use_tpu)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for prediction in result:
                probabilities = prediction["probabilities"]
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    processors = {
        "mana169": ManaProcessor169,
    }
    tokenization.validate_case_matches_checkpoint(DO_LOWER_CASE, INIT_CKPT)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    if MAX_SEQ_LENGTH > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (MAX_SEQ_LENGTH, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(OUTPUT_DIR)
    task_name = 'mana169'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    tpu_cluster_resolver = None
    hooks = []
    # Create a logging tensor hook because this takes forever on CPU.
    logger = tf.train.LoggingTensorHook({"Input": "IteratorGetNext:0"},
                                        every_n_iter=1)
    hooks.append(logger)
    # debug_hook = tfdbg.LocalCLIDebugHook()
    # hooks.append(debug_hook)
    run_config = tf.contrib.tpu.RunConfig(cluster=tpu_cluster_resolver,
                                          model_dir=OUTPUT_DIR,
                                          save_checkpoints_steps=1)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=INIT_CKPT,
                                learning_rate=5e-5,
                                num_train_steps=None,
                                num_warmup_steps=None,
                                use_tpu=False,
                                use_one_hot_embeddings=False)
    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=PREDICT_BATCH_SIZE)
    input_file = sys.argv[1]
    predict_examples = read_input_examples(input_file)
    num_actual_predict_examples = len(predict_examples)
    # if FLAGS.use_tpu:
    #     # TPU requires a fixed batch size for all batches, therefore the
    #     # number of examples must be a multiple of the batch size, or else
    #     # examples will get dropped. So we pad with fake examples which
    #     # are ignored later on.
    #     while len(predict_examples) % FLAGS.predict_batch_size != 0:
    #         predict_examples.append(PaddingInputExample())
    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            MAX_SEQ_LENGTH, tokenizer,
                                            predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.predict(input_fn=predict_input_fn, hooks=hooks)
    output_predict_file = os.path.join(OUTPUT_DIR, sys.argv[2])
    scores_list = []
    num_written_lines = 0
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
            break
        scores_list.append(probabilities)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
    scores_array = np.array(scores_list)
    # Write the scores in a useful form: the original input columns plus
    # the top-3 classes and their scores.
    top3_scores = []
    all_topics = processor.get_labels()
    for i, row in enumerate(scores_array):
        top3_indices = row.argsort()[::-1][:3]
        l = []
        # All input columns but the last (the original text).
        l += [input_df.values[i][j] for j in range(input_df.shape[1] - 1)]
        # The original text, with newlines removed.
        l.append(str(input_df.values[i][-1]).replace('\n', ''))
        # Interleave: class 1, score 1, class 2, score 2, class 3, score 3.
        for v in top3_indices:
            l.append(all_topics[v])
            l.append(row[v])
        top3_scores.append(l)
    score_df = pd.DataFrame(
        top3_scores,
        columns=list(input_df.columns.values) +
        ["Class 1", "Score 1", "Class 2", "Score 2", "Class 3", "Score 3"])
    score_df.to_csv(output_predict_file, index=None)
def fasta2record(en_file, pr_file, output_train_file, vocab_file):
    """Convert two paired .fasta files into one TFRecord.

    Validates that each file alternates header (">") and sequence lines,
    then returns the number of sequence pairs.
    """
    with open(en_file) as f:
        lines = f.readlines()
    for index, line in enumerate(lines):
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
    seq_num = int(len(lines) / 2)
    with open("en_temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                # Space-separate the residues so the tokenizer sees them
                # as individual tokens.
                seq = ""
                length = len(line.strip())
                for i in range(length):
                    seq += line[i] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)
    with open(pr_file) as f:
        lines = f.readlines()
    for index, line in enumerate(lines):
        if index % 2 == 0:
            if line[0] != ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
        else:
            if line[0] == ">":
                print("Row " + str(index + 1) + " is wrong!")
                exit()
    with open("pr_temp.tsv", "w") as f:
        for line in lines:
            if line[0] != ">":
                seq = ""
                length = len(line.strip())
                for i in range(length):
                    seq += line[i] + " "
                seq += "\n"
                f.write("train\t1\t\t" + seq)
    processor = ColaProcessor()
    label_list = processor.get_labels()
    examples = processor.fatma_get_dev_examples_predict("en_temp.tsv",
                                                        "pr_temp.tsv")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    file_based_convert_examples_to_features(examples, label_list, 128,
                                            tokenizer, output_train_file)
    return seq_num
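# A hedged usage sketch for the paired-file variant (not from the original
# source). en.fasta, pr.fasta, pairs.tf_record and vocab.txt are placeholders.
n_pairs = fasta2record(en_file="en.fasta",
                       pr_file="pr.fasta",
                       output_train_file="pairs.tf_record",
                       vocab_file="vocab.txt")
print("converted %d sequence pairs" % n_pairs)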
def traineval_model(self, local_dir, nb_epoch, batch_size):
    """Use the BERT Uncased language model to train on new data."""
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s", BERT_MODEL_PATH)
    logging.info("*:Local Dir:%s", local_dir)
    mod_name = self.model_name
    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir, 'output_bert')
    DATA_DIR = os.path.join(local_dir, 'data')
    logging.info('***** Model output directory: %s *****', OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****',
                 BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****', DATA_DIR)
    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    MAX_SEQ_LENGTH = 128
    # Model configs: if you wish to fine-tune a model on a larger dataset,
    # use a larger checkpoint interval (each checkpoint weighs about 1.5 GB).
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8
    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
    INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB File:%s", VOCAB_FILE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    # Since training will happen on GPU, we won't need a cluster resolver.
    # TPUEstimator also supports training on CPU and GPU, so there is no
    # need to define a separate tf.estimator.Estimator.
    tpu_cluster_resolver = None
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    train_examples = processor.get_train_examples(DATA_DIR)
    num_train_steps = int(
        len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, training falls back to CPU or GPU
        use_one_hot_embeddings=False)  # try with True
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # if False, training falls back to CPU or GPU
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)
    # Train the model.
    logging.info('Starting Training...')
    train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
    tf.logging.info('***** Started training at {} *****'.format(
        datetime.datetime.now()))
    tf.logging.info('  Num examples = {}'.format(len(train_examples)))
    tf.logging.info('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=train_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    final_ckpt = estimator.latest_checkpoint()
    print('***** Finished training at {} *****'.format(
        datetime.datetime.now()))
    logging.info("*****Final Checkpoint*****%s", final_ckpt)
    # Persist the final checkpoint name so test_model can pick it up.
    final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
    with tf.gfile.GFile(final_ckpt_file, "w") as writer:
        writer.write("%s" % final_ckpt)
    # Do eval.
    logging.info('Starting Eval..')
    eval_examples = processor.get_dev_examples(DATA_DIR)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", EVAL_BATCH_SIZE)
    eval_steps = None  # run through the entire eval set
    eval_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    num_iter = 1
    jit_xla = tf.OptimizerOptions.ON_1 if FLAGS.xla else 0
    processors = {
        "cola": rc.ColaProcessor,
        "mnli": rc.MnliProcessor,
        "mrpc": rc.MrpcProcessor,
        "xnli": rc.XnliProcessor,
    }
    # Sanity check.
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = my_modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    # Prepare the data.
    processor = processors[task_name]()
    label_list = processor.get_labels()
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    rc.file_based_convert_examples_to_features(predict_examples, label_list,
                                               FLAGS.max_seq_length,
                                               tokenizer, predict_file)
    # Get the model function and input function. The drop_remainder option
    # should be turned on for fast transformer inference.
    drop_remainder = True
    predict_input_fn = rc.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=drop_remainder)

    def graph_fn():
        model_fn = model_fn_builder(bert_config=bert_config)
        dataset = predict_input_fn({'batch_size': FLAGS.predict_batch_size})
        next_item = dataset.make_one_shot_iterator().get_next()
        output_var = model_fn(next_item)
        return output_var, next_item

    if FLAGS.tf_profile:
        tf.logging.info("***** Running tensorflow transformer *****")
        p1 = profile_util.Profiler(os.path.join(FLAGS.output_dir,
                                                'prof/bert_origin'))
        t1, r1 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, p1,
            init_checkpoint=FLAGS.init_checkpoint)
        tf.reset_default_graph()
        my_modeling.transformer_model = fiu.fast_transformer_model_trans
        tf.logging.info("***** Running fast transformer *****")
        p2 = profile_util.Profiler(os.path.join(FLAGS.output_dir,
                                                'prof/bert_fastinfer'))
        t2, r2 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, p2,
            init_checkpoint=FLAGS.init_checkpoint)
    else:
        tf.logging.info("***** Running tensorflow transformer *****")
        t1, r1 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, check_result=False,
            init_checkpoint=FLAGS.init_checkpoint,
            export_path='./export_default_{}/{}/model.savedmodel/'.format(
                FLAGS.max_seq_length, FLAGS.predict_batch_size))
        tf.reset_default_graph()
        my_modeling.transformer_model = fiu.fast_transformer_model_trans
        tf.logging.info("***** Running fast transformer *****")
        t2, r2 = profile_util.run_profile(
            graph_fn, jit_xla, num_iter, check_result=False,
            init_checkpoint=FLAGS.init_checkpoint,
            export_path='./export_ft_{}/{}/model.savedmodel/'.format(
                FLAGS.max_seq_length, FLAGS.predict_batch_size))
    print('average time (seconds) elapsed, original tensorflow:', t1)
    print('average time (seconds) elapsed, fast transformer:', t2)
    if len(r1) + len(r2) > 0:
        # Compare the two implementations' outputs element-wise.
        check_res = np.asarray([
            np.allclose(r1[i], r2[i], atol=1e-4, rtol=0)
            for i in range(num_iter)
        ])
        if check_res.all():
            print('Pass')
            print(np.mean(r1))
            print(np.mean(r2))
        else:
            for i in np.where(np.logical_not(check_res))[0]:
                diff = np.fabs(r1[i] - r2[i])
                idx = np.unravel_index(diff.argmax(), diff.shape)
                print('Failed iter:', i, "max diff:", diff[idx], idx,
                      r1[i][idx], r2[i][idx])
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                       do_lower_case=DO_LOWER_CASE)

def convert_input(x):
    # Wrap one dataframe row as an InputExample; the label is a dummy 0
    # since this is inference-only.
    return run_classifier.InputExample(guid=x["id"],
                                       text_a=x["comment_text"],
                                       text_b=None,
                                       label=0)

test_InputExamples = data.apply(convert_input, axis=1)
run_classifier.file_based_convert_examples_to_features(
    test_InputExamples, [0, 1], MAX_SEQ_LENGTH, tokenizer, TEST_FILE)
NUM_TRAIN_STEPS = 0  # int(len(test_InputExamples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
NUM_WARMUP_STEPS = 0  # int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)
test_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=TEST_FILE,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
tpu_config = tf.contrib.tpu.TPUConfig(
    iterations_per_loop=ITERATIONS_PER_LOOP,
    num_shards=NUM_TPU_CORES,
    per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
# coding:utf-8
from run_classifier import ColaProcessor
from run_classifier import file_based_convert_examples_to_features
import tokenization

data_name = "Legionellapneumophilatmp"
input_root = "./dataset/1kmer_tsv_data/"
output_root = "./dataset/1kmer_tfrecord/"
vocab_file = "./vocab/vocab_1kmer.txt"

processor = ColaProcessor()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=True)

# Convert the dev split.
examples = processor.get_dev_examples(input_root + data_name + "/")
dev_file = output_root + data_name + "/dev.tf_record"
file_based_convert_examples_to_features(examples, label_list, 128,
                                        tokenizer, dev_file)

# Convert the train split.
examples = processor.get_train_examples(input_root + data_name + "/")
train_file = output_root + data_name + "/train.tf_record"
file_based_convert_examples_to_features(examples, label_list, 128,
                                        tokenizer, train_file)
def mrpc_classifier(sent_list1, sent_list2, args):
    TUNED_MODEL_DIR = args.tuned_model_dir
    config = {
        "task_name": 'MRPC',
        "do_predict": True,
        "vocab_file": f"{TUNED_MODEL_DIR}/vocab.txt",
        "bert_config_file": f"{TUNED_MODEL_DIR}/bert_config.json",
        "init_checkpoint": f"{TUNED_MODEL_DIR}",
        "max_seq_length": 128,
        "output_dir": f"{TUNED_MODEL_DIR}",
        "do_lower_case": True,
        "predict_batch_size": 8
    }
    bert_config = modeling.BertConfig.from_json_file(
        config["bert_config_file"])
    processor = run_classifier.MrpcProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config["vocab_file"],
        do_lower_case=config["do_lower_case"])
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=config["output_dir"],
        save_checkpoints_steps=1000,
    )
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=config["init_checkpoint"],
        learning_rate=5e-5,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=config["predict_batch_size"])
    predict_examples = get_predict_examples(sent_list1, sent_list2)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(config["output_dir"], "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, config["max_seq_length"], tokenizer,
        predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", config["predict_batch_size"])
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=config["max_seq_length"],
        is_training=False,
        drop_remainder=False)
    result = estimator.predict(input_fn=predict_input_fn)
    # Probability of the positive class (label index 1) for each pair.
    probabilities = [prediction["probabilities"][1] for prediction in result]
    return probabilities
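# A hedged usage sketch for mrpc_classifier (not from the original source).
# ./mrpc_tuned is a placeholder for a fine-tuned MRPC model directory, and
# Namespace stands in for whatever argument object the caller uses.
from argparse import Namespace

scores = mrpc_classifier(
    sent_list1=["He ate the cake.", "The sky is blue."],
    sent_list2=["The cake was eaten by him.", "Grass is green."],
    args=Namespace(tuned_model_dir="./mrpc_tuned"))
print(scores)  # one paraphrase probability per sentence pair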