def generate_results(model_dir, results_path):
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    vocab = np.load('output/processed-annotations/vocab.npy')
    run_config = RunConfig(model_dir=model_dir)
    hparams = get_hparams(model_dir=model_dir, create=False)
    print(hparams)
    estimator = Estimator(model_fn=model_fn, config=run_config, params=hparams)
    val_path = tf.flags.FLAGS.batch_path
    splits = tf.flags.FLAGS.batch_splits
    batch_size = tf.flags.FLAGS.batch_size
    hook = FeedFnHook(path_fmt=val_path, splits=splits, batch_size=batch_size,
                      predict=True, single_pass=True)
    results = []
    it = tqdm(desc='Generating results')
    for prediction in estimator.predict(input_fn=predict_input_fn, hooks=[hook]):
        caption = calc_caption(prediction=prediction, vocab=vocab)
        results.append({
            'image_id': np.asscalar(prediction['image_ids']),
            'caption': caption
        })
        it.update(1)
    with open(results_path, 'w') as f:
        json.dump(results, f)

def main():
    sequence_schema_path = f'{input_path}/train/sequence_schema'
    context_schema_path = f'{input_path}/train/context_schema'
    context_schema, sequence_schema = read_schemata(context_schema_path,
                                                    sequence_schema_path)
    tf_ctx_schema, tf_seq_schema = build_schema(context_schema, sequence_schema)
    train_parts = glob.glob(input_path + '/train' + '/part-*')
    validation_parts = glob.glob(input_path + '/test' + '/part-*')
    run_config = RunConfig(log_step_count_steps=10,
                           save_checkpoints_steps=100,
                           save_summary_steps=200,
                           keep_checkpoint_max=32)
    shared_input_fn = partial(input_fn, params, tf_seq_schema, tf_ctx_schema)
    train_input_fn = partial(shared_input_fn, train_parts)
    validation_input_fn = partial(shared_input_fn, validation_parts)
    train_spec = TrainSpec(train_input_fn, max_steps=1000000)
    eval_spec = EvalSpec(validation_input_fn, steps=200, name='validation',
                         start_delay_secs=30, throttle_secs=1)
    estimator = Estimator(model_fn=model.model_fn, model_dir=model_dir,
                          params=params, config=run_config)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    logging.getLogger('tensorflow').propagate = False
    train_and_evaluate(estimator=estimator, train_spec=train_spec,
                       eval_spec=eval_spec)
    prediction = list(estimator.predict(
        input_fn=partial(predict_input_fn, {'epochs': 1, 'batch_size': 10}, grid)))
    scores = [p.tolist() for p in prediction]
    pairwise_prob = pairwise_probability(scores)
    zero = pairwise_prob[0]
    A_zero = build_diags(zero)
    print(optimize(A_zero).x)

def process(question, contexts):
    # TODO Replace all abbreviation code
    bert_config = modeling.BertConfig.from_json_file(
        os.path.join(modelDir, 'bert_config.json'))  # Loading bert config
    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(modelDir, 'vocab.txt'),
        do_lower_case=False)  # Loading tokenizer
    candidates = read_QA(question, contexts)
    eval_features = convert_candidates_to_features(candidates=candidates,
                                                   tokenizer=tokenizer,
                                                   max_seq_length=512,
                                                   doc_stride=256,
                                                   max_query_length=128)
    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=os.path.join(
                                    modelDir, 'bert_model.ckpt'),
                                use_one_hot_embeddings=False)
    run_config = RunConfig(model_dir=modelDir, save_checkpoints_steps=1000)
    estimator = Estimator(model_fn=model_fn, config=run_config,
                          params={'batch_size': 14})
    predict_input_fn = input_fn_builder(features=eval_features, seq_length=512,
                                        drop_remainder=True)
    all_results = []
    counter = 0
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    for result in estimator.predict(predict_input_fn, yield_single_examples=True):
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            RawResult(unique_id=unique_id,
                      start_logits=start_logits,
                      end_logits=end_logits))
        counter += 1
        if len(eval_features) == counter:
            break
    all_nbest_json = write_QA(candidates, eval_features, all_results, 2, 128, False)
    return all_nbest_json

def main(_):
    tf.logging.set_verbosity(tf.logging.WARN)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    examples = read_examples(FLAGS.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)
    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
        pooling_layer=FLAGS.feature_pooling_layer)
    # Remove TPU Estimator.
    # estimator = tf.contrib.tpu.TPUEstimator(
    #     use_tpu=FLAGS.use_tpu,
    #     model_fn=model_fn,
    #     config=run_config,
    #     predict_batch_size=FLAGS.batch_size)
    estimator = Estimator(model_fn, params=pack_params())
    input_fn = input_fn_builder(features=features, seq_length=FLAGS.max_seq_length)
    with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            encodes = [round(float(x), 6) for x in result["encodes"].flat]
            feature = unique_id_to_feature[unique_id]
            text = feature.tokens[1:-1]
            # tf.logging.info("str: %s, encodes[%d]=%s" % ("".join(text),
            #     len(encodes), ",".join([str(x) for x in encodes])))
            writer.write("\t".join([
                str(unique_id), "".join(text),
                ",".join([str(x) for x in encodes])
            ]) + "\n")

def main(): parsed_args = get_parser().parse_args() with open(os.path.join("data/challenger.ai", 'word_to_idx.pkl'), 'rb') as f: word_to_idx = pickle.load(f) hparams = HParams(vocab_size=len(word_to_idx), batch_size=parsed_args.batch_size, selector=parsed_args.selector, dropout=parsed_args.dropout, ctx2out=parsed_args.ctx2out, prev2out=parsed_args.prev2out, hard_attention=parsed_args.hard_attention, bin_size=14) run_config = RunConfig(model_dir=parsed_args.model_dir) estimator = Estimator( model_fn=model_fn_inner, params=hparams, config=run_config) dataset = ChallengerAI("data/challenger.ai") input_fn = dataset.get_tfrecords_test_input_fn(bin_size=hparams.bin_size) val_init_hook = IteratorInitializerHook("infer") idx_to_word = {v: k for k, v in word_to_idx.items()} del word_to_idx results = estimator.predict(input_fn, hooks=[val_init_hook]) all_predicions = [] image_ids = [] num_generated = 0 for batch_result in results: image_id, pred = batch_result["image_id"], batch_result["predictions"] result = ''.join([idx_to_word[idx] for idx in pred if idx != 0 and idx != 2]) all_predicions.append(result) image_ids.append(image_id.decode("utf-8").split(".")[0]) num_generated = num_generated + 1 if num_generated % 1000 == 0: print("Generated %d" % num_generated) total_results = [{"image_id": img_id, "caption": pred} for img_id, pred in zip(image_ids, all_predicions)] with open("result.json", "w", encoding="utf-8") as f: json.dump(total_results, f, ensure_ascii=False)
def generate_captions(model_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    vocab = np.load('output/processed-annotations/vocab.npy')
    run_config = RunConfig(model_dir=model_dir)
    hparams = get_hparams(model_dir, create=False)
    print(hparams)
    estimator = Estimator(model_fn=model_fn, config=run_config, params=hparams)
    val_path = tf.flags.FLAGS.batch_path
    use_slot_vocab = hparams.use_slot_vocab
    hook = FeedFnHook(path_fmt=val_path, splits=1,
                      batch_size=tf.flags.FLAGS.batch_size, predict=True)
    with open(os.path.join(output_dir, 'captions.csv'), 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(['Index', 'Caption'])
        for i, prediction in enumerate(
                estimator.predict(input_fn=predict_input_fn, hooks=[hook])):
            caption = write_prediction(
                os.path.join(output_dir, '{:08d}'.format(i)),
                prediction=prediction, vocab=vocab,
                use_slot_vocab=use_slot_vocab)
            w.writerow([i, caption])
            if i > 100:
                break

def main(): parsed_args = get_parser().parse_args() with open(os.path.join("data", 'word_to_idx.pkl'), 'rb') as f: word_to_idx = pickle.load(f) hparams = HParams(vocab_size=len(word_to_idx), batch_size=parsed_args.batch_size, selector=parsed_args.selector, dropout=parsed_args.dropout, ctx2out=parsed_args.ctx2out, prev2out=parsed_args.prev2out) run_config = RunConfig(model_dir=parsed_args.model_dir) estimator = Estimator(model_fn=model_fn, params=hparams, config=run_config) image_ids, input_fn = get_input_fn() val_init_hook = IteratorInitializerHook("infer") idx_to_word = {v: k for k, v in word_to_idx.items()} del word_to_idx pred_results = estimator.predict(input_fn, hooks=[val_init_hook]) all_predicions = [] num_generated = 0 for pred in pred_results: result = ' '.join( [idx_to_word[idx] for idx in pred if idx != 0 and idx != 2]) all_predicions.append(result) num_generated = num_generated + 1 if num_generated % 1000 == 0: print("Generated %d" % num_generated) total_results = [{ "image_id": img_id, "caption": pred } for img_id, pred in zip(image_ids, all_predicions)] with open("result.json", "w") as f: json.dump(total_results, f)
class BertWorker:
    def __init__(self):
        # the pooling layer index of bert-original model
        self.pooling_layer = [-2]
        # the pooling_strategy of bert-original model
        self.pooling_strategy = PoolingStrategy.REDUCE_MEAN
        # The maximum total input sequence length after WordPiece tokenization.
        # Sequences longer than this will be truncated, and sequences shorter
        # than this will be padded.
        self.max_seq_len = 128
        self.bert_model_dir = sys_conf["bert_dir"]
        self.config_fp = os.path.join(self.bert_model_dir, "bert_config.json")
        self.ckpt_fp = os.path.join(self.bert_model_dir, "bert_model.ckpt")
        self.vocab_fp = os.path.join(self.bert_model_dir, "vocab.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
        self.model_fn = model_fn_builder(
            bert_config=modeling.BertConfig.from_json_file(self.config_fp),
            init_checkpoint=self.ckpt_fp,
            pooling_strategy=self.pooling_strategy,
            pooling_layer=self.pooling_layer)
        self.estimator = Estimator(self.model_fn)

    def input_fn_builder_file_path(self, file_path):
        def gen_asap_article():
            dataset = pd.read_csv(file_path)
            articles = dataset["essay"]
            articles_set = dataset["essay_set"]
            domain1_score = dataset["domain1_score"]
            articles_id = dataset["essay_id"]
            for i in range(len(articles)):
                doc = articles[i]
                sentences = sentence_tokenize(doc)
                tmp_f = list(
                    convert_lst_to_features(sentences, self.max_seq_len,
                                            self.tokenizer))
                yield {
                    "input_ids": [f.input_ids for f in tmp_f],
                    "input_mask": [f.input_mask for f in tmp_f],
                    "input_type_ids": [f.input_type_ids for f in tmp_f],
                    "article_set": articles_set[i],
                    "domain1_score": float(domain1_score[i]),
                    "article_id": articles_id[i]
                }

        def input_fn():
            return (tf.data.Dataset.from_generator(
                gen_asap_article,
                output_types={
                    "input_ids": tf.int32,
                    "input_mask": tf.int32,
                    "input_type_ids": tf.int32,
                    "article_set": tf.int32,
                    "domain1_score": tf.float32,
                    "article_id": tf.int32
                },
                output_shapes={
                    "input_ids": (None, self.max_seq_len),
                    "input_mask": (None, self.max_seq_len),
                    "input_type_ids": (None, self.max_seq_len),
                    "article_set": [],
                    "domain1_score": [],
                    "article_id": []
                }))

        return input_fn

    def inference_from_path_with_permfile(self, file_path):
        input_fn = self.input_fn_builder_file_path(file_path)
        for r in self.estimator.predict(input_fn, yield_single_examples=False):
            temp_sample = {
                "doc_encodes": r["encodes"],
                "article_set": r["article_set"],
                "domain1_score": r["domain1_score"],
                "article_id": r["article_id"]
            }
            yield temp_sample

    def input_fn_builder_eilts_path(self, essay_path, score_path):
        def gen_eilts_article():
            score = dict()
            with open(score_path, "r", encoding="utf-8") as sr:
                for line in sr:
                    score[line.split()[0]] = float(line.split()[1])
            for dirpath, dirnames, filenames in os.walk(essay_path):
                if filenames:
                    for filename in filenames:
                        filepath = os.path.join(dirpath, filename)
                        with open(filepath, "r") as dr:
                            lines = []
                            for line in dr:
                                if line.strip():
                                    lines.append(line.strip())
                            title_and_doc = " ".join(lines)
                            title = title_and_doc.split("\t", 1)[0].strip()
                            doc = title_and_doc.split("\t", 1)[1].strip()
                            sentences = sentence_tokenize(doc)
                            tmp_f = list(
                                convert_lst_to_features(
                                    sentences, self.max_seq_len,
                                    self.tokenizer))
                            yield {
                                "input_ids": [f.input_ids for f in tmp_f],
                                "input_mask": [f.input_mask for f in tmp_f],
                                "input_type_ids":
                                    [f.input_type_ids for f in tmp_f],
                                "article_set": 9,
                                "domain1_score": float(score[filename]),
                                "article_id": int(filename)
                            }

        def input_fn():
            return (tf.data.Dataset.from_generator(
                gen_eilts_article,
                output_types={
                    "input_ids": tf.int32,
                    "input_mask": tf.int32,
                    "input_type_ids": tf.int32,
                    "article_set": tf.int32,
                    "domain1_score": tf.float32,
                    "article_id": tf.int32
                },
                output_shapes={
                    "input_ids": (None, self.max_seq_len),
                    "input_mask": (None, self.max_seq_len),
                    "input_type_ids": (None, self.max_seq_len),
                    "article_set": [],
                    "domain1_score": [],
                    "article_id": []
                }))

        return input_fn

    def inference_from_eitls_path(self, essay_path, score_path):
        input_fn = self.input_fn_builder_eilts_path(essay_path, score_path)
        for r in self.estimator.predict(input_fn, yield_single_examples=False):
            temp_sample = {
                "doc_encodes": r["encodes"],
                "article_set": r["article_set"],
                "domain1_score": r["domain1_score"],
                "article_id": r["article_id"]
            }
            yield temp_sample

    def input_fn_builder_client(self):
        pass

    def inference_from_client(self):
        pass

class BertWorker(Process):
    def __init__(self, id, args):
        super().__init__()
        self.model_dir = args.model_dir
        self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
        self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
        self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
        self.max_len = args.max_len
        self.worker_id = id
        self.daemon = True
        self.model_fn = model_fn_builder(
            bert_config=modeling.BertConfig.from_json_file(self.config_fp),
            init_checkpoint=self.checkpoint_fp)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
        self.estimator = Estimator(self.model_fn)
        self.result = []

    def run(self):
        socket = zmq.Context().socket(zmq.REQ)
        socket.identity = u'Worker-{}'.format(self.worker_id).encode('ascii')
        socket.connect('ipc:///tmp/bert.service')
        input_fn = self.input_fn_builder(socket)
        socket.send(b'READY')
        logger.info('worker %d is ready and listening' % self.worker_id)
        for r in self.estimator.predict(input_fn):
            self.result.append([round(float(x), 6) for x in r.flat])
        socket.close()
        logger.info('worker is terminated!')

    @staticmethod
    def is_valid_input(texts):
        return isinstance(texts, list) and all(
            isinstance(s, str) for s in texts)

    def input_fn_builder(self, worker):
        def gen():
            while True:
                if self.result:
                    num_result = len(self.result)
                    worker.send_multipart(
                        [ident, b'', pickle.dumps(self.result)])
                    self.result = []
                    time_used = time.clock() - start
                    logger.info('encoded %d strs from %s in %.2fs @ %d/s' %
                                (num_result, ident, time_used,
                                 int(num_result / time_used)))
                ident, empty, msg = worker.recv_multipart()
                start = time.clock()
                msg = pickle.loads(msg)
                if self.is_valid_input(msg):
                    tmp_f = list(
                        convert_lst_to_features(msg, self.max_len,
                                                self.tokenizer))
                    yield {
                        'input_ids': [f.input_ids for f in tmp_f],
                        'input_mask': [f.input_mask for f in tmp_f],
                        'input_type_ids': [f.input_type_ids for f in tmp_f]
                    }
                else:
                    logger.warning(
                        'worker %d: received unsupported type! sending back None'
                        % self.worker_id)
                    worker.send_multipart([ident, b'', pickle.dumps(None)])

        def input_fn():
            return (tf.data.Dataset.from_generator(
                gen,
                output_types={
                    k: tf.int32
                    for k in ['input_ids', 'input_mask', 'input_type_ids']
                },
                output_shapes={
                    'input_ids': (None, self.max_len),
                    'input_mask': (None, self.max_len),
                    'input_type_ids': (None, self.max_len)
                }))

        return input_fn

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    token_word2id, token_vocab_size = read_vocab(FLAGS.token_vocab_file)
    # input_files = [FLAGS.input_file]
    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)
    # tf.logging.info("*** Input Files ***")
    # tf.logging.info("  %s" % input_files[0])
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    dist_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=FLAGS.n_gpus,
        cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.n_gpus),
        # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'),
    )
    log_every_n_steps = 8
    run_config = RunConfig(train_distribute=dist_strategy,
                           eval_distribute=dist_strategy,
                           log_step_count_steps=log_every_n_steps,
                           model_dir=FLAGS.output_dir,
                           save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                word2id=token_word2id)
    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    # estimator = Estimator(
    #     model_fn=model_fn,
    #     params={},
    #     config=run_config)
    estimator = Estimator(
        model_fn=model_fn,
        config=run_config,
    )
    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_input_files = [FLAGS.eval_input_file]
        eval_input_fn = input_fn_builder(
            input_files=eval_input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        tf.logging.info("***** Running test *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        test_input_files = [FLAGS.test_input_file]
        eval_input_fn = input_fn_builder(
            input_files=test_input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "test_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Test results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        tf.logging.info("***** Running Small test *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        small_test_input_files = [FLAGS.small_test_input_file]
        eval_input_fn = input_fn_builder(
            input_files=small_test_input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "small_test_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Small Test results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_test:
        tf.logging.info("***** Running test *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        test_input_files = [FLAGS.small_eval_input_file]
        test_input_fn = input_fn_builder(
            input_files=test_input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)
        result = estimator.predict(input_fn=test_input_fn)
        tf.logging.info("***** Test results *****")
        output_eval_file = os.path.join(FLAGS.output_dir,
                                        "small_id_eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for i, p in tqdm(enumerate(result)):
                writer.write(
                    str(p['masked_pre']) + ' ' + str(p['masked_tar']) + '\n')

class BertWorker(Process):
    def __init__(self, id, args, worker_address, sink_address):
        super().__init__()
        self.model_dir = args.model_dir
        self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
        self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
        self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
        self.max_seq_len = args.max_seq_len
        self.worker_id = id
        self.daemon = True
        self.model_fn = model_fn_builder(
            bert_config=modeling.BertConfig.from_json_file(self.config_fp),
            init_checkpoint=self.checkpoint_fp,
            pooling_strategy=args.pooling_strategy,
            pooling_layer=args.pooling_layer)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
        self.estimator = Estimator(self.model_fn)
        self.exit_flag = multiprocessing.Event()
        self.logger = set_logger('WORKER-%d' % self.worker_id)
        self.worker_address = worker_address
        self.sink_address = sink_address

    def close(self):
        self.logger.info('shutting down...')
        self.exit_flag.set()
        self.terminate()
        self.join()
        self.logger.info('terminated!')

    def run(self):
        context = zmq.Context()
        receiver = context.socket(zmq.PULL)
        receiver.connect(self.worker_address)
        sink = context.socket(zmq.PUSH)
        sink.connect(self.sink_address)
        input_fn = self.input_fn_builder(receiver)
        self.logger.info('ready and listening')
        start_t = time.perf_counter()
        for r in self.estimator.predict(input_fn, yield_single_examples=False):
            # logger.info('new result!')
            send_ndarray(sink, r['client_id'], r['encodes'])
            time_used = time.perf_counter() - start_t
            start_t = time.perf_counter()
            self.logger.info('job %s\tsamples: %4d\tdone: %.2fs' %
                             (r['client_id'], r['encodes'].shape[0], time_used))
        receiver.close()
        sink.close()
        context.term()
        self.logger.info('terminated!')

    def input_fn_builder(self, worker):
        def gen():
            while not self.exit_flag.is_set():
                client_id, msg = worker.recv_multipart()
                msg = jsonapi.loads(msg)
                self.logger.info('new job %s, size: %d' % (client_id, len(msg)))
                if BertClient.is_valid_input(msg):
                    tmp_f = list(
                        convert_lst_to_features(msg, self.max_seq_len,
                                                self.tokenizer))
                    yield {
                        'client_id': client_id,
                        'input_ids': [f.input_ids for f in tmp_f],
                        'input_mask': [f.input_mask for f in tmp_f],
                        'input_type_ids': [f.input_type_ids for f in tmp_f]
                    }
                else:
                    self.logger.error(
                        'unsupported type of job %s! sending back None'
                        % client_id)

        def input_fn():
            return (tf.data.Dataset.from_generator(
                gen,
                output_types={'input_ids': tf.int32,
                              'input_mask': tf.int32,
                              'input_type_ids': tf.int32,
                              'client_id': tf.string},
                output_shapes={
                    'client_id': (),
                    'input_ids': (None, self.max_seq_len),
                    'input_mask': (None, self.max_seq_len),
                    'input_type_ids': (None, self.max_seq_len)}))

        return input_fn

def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": classifier_utils.ColaProcessor, "mnli": classifier_utils.MnliProcessor, "mrpc": classifier_utils.MrpcProcessor, # "xnli": XnliProcessor, "sts-b": classifier_utils.StsbProcessor, "qqp": classifier_utils.QqpProcessor, "sst-2": classifier_utils.Sst2Processor, "qnli": classifier_utils.QnliProcessor, "rte": classifier_utils.RteProcessor, "wnli": classifier_utils.WnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) albert_config.hidden_dropout_prob = FLAGS.albert_dropout_prob albert_config.attention_probs_dropout_prob = FLAGS.albert_dropout_prob if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None total_time = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = classifier_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, task_name=task_name, customized=using_customized_optimizer, optimizer=FLAGS.optimizer, 
discard_classifier_weights=FLAGS.discard_classifier_weights) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, task_name + "_train.tf_record") if not tf.gfile.Exists(train_file): classifier_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, task_name) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info( f" Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}") tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step, hooks=[time_hist]) total_time = sum(time_hist.times) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_features = classifier_utils.convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, task_name) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record") if not tf.gfile.Exists(eval_file): classifier_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, task_name) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") if task_name == "sts-b": key_name = "pearson" elif task_name == "cola": key_name = "matthew_corr" else: key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if FLAGS.train_step and FLAGS.warmup_step: writer.write("Training steps: {}\n".format(FLAGS.train_step)) writer.write("Warmup steps: {}\n".format(FLAGS.warmup_step)) while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. 
Sleeping.".format( global_step)) time.sleep(1) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") tf.logging.info(f"num_gpu_cores = {NUM_GPUS}") writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") classifier_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, task_name) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) if task_name != "sts-b": actual_label = label_list[int(prediction["predictions"])] else: actual_label = str(prediction["predictions"]) sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
class Sentence2Vec(object):
    def __init__(self, sen2vec_conf_file):
        tf.logging.set_verbosity(tf.logging.WARN)
        # Sentence2Vec configuration
        self.sen2vec_conf = Sentence2VecConfig.from_json_file(sen2vec_conf_file)
        # BERT model + tokenizer
        self.bert_config = modeling.BertConfig.from_json_file(
            self.sen2vec_conf.bert_config_file)
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=self.sen2vec_conf.vocab_file,
            do_lower_case=self.sen2vec_conf.do_lower_case)
        self.model_fn = self.model_fn_builder(
            bert_config=self.bert_config,
            init_checkpoint=self.sen2vec_conf.init_checkpoint,
            use_one_hot_embeddings=self.sen2vec_conf.use_one_hot_embeddings,
            pooling_layer=self.sen2vec_conf.feature_pooling_layer)
        # Remove TPU Estimator.
        # estimator = tf.contrib.tpu.TPUEstimator(
        #     use_tpu=self.sen2vec_conf.use_tpu,
        #     model_fn=model_fn,
        #     config=run_config,
        #     predict_batch_size=self.sen2vec_conf.batch_size)
        self.params = {}
        self.params["batch_size"] = self.sen2vec_conf.batch_size
        self.estimator = Estimator(self.model_fn, params=self.params)
        self.seq_length = self.sen2vec_conf.max_seq_length

    def run(self, sentence_list):
        """Computes sen2vec for a batch of sentences.

        Returns a list of SentenceVecTuple in input order.
        """
        rsl = []
        features = [
            self.sentence2feature(sentence_list[i], i, self.seq_length)
            for i in range(len(sentence_list))
        ]
        input_fn = self.input_fn_builder(features, self.seq_length)
        for result in self.estimator.predict(input_fn,
                                             yield_single_examples=True):
            unique_id = int(result["unique_id"])
            encodes = [round(float(x), 6) for x in result["encodes"].flat]
            rsl.append(
                SentenceVecTuple(unique_id=unique_id,
                                 sentence=sentence_list[unique_id],
                                 vector=encodes))
        return rsl

    def model_fn_builder(self, bert_config, init_checkpoint,
                         use_one_hot_embeddings=False,
                         pooling_strategy=PoolingStrategy.REDUCE_MEAN,
                         pooling_layer=-2):
        """Returns `model_fn` closure for Estimator."""

        def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
            """The `model_fn` for TPUEstimator."""
            unique_ids = features["unique_ids"]
            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            input_type_ids = features["input_type_ids"]
            model = modeling.BertModel(
                config=bert_config,
                is_training=False,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=input_type_ids,
                use_one_hot_embeddings=use_one_hot_embeddings)
            if mode != tf.estimator.ModeKeys.PREDICT:
                raise ValueError("Only PREDICT modes are supported: %s" % (mode))
            tvars = tf.trainable_variables()
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.logging.info("  name = %s, shape = %s%s", var.name,
                                var.shape, init_string)
            # all_layers = model.get_all_encoder_layers()
            encoder_layer = model.all_encoder_layers[pooling_layer]
            if pooling_strategy == PoolingStrategy.REDUCE_MEAN:
                pooled = tf.reduce_mean(encoder_layer, axis=1)
            elif pooling_strategy == PoolingStrategy.REDUCE_MAX:
                pooled = tf.reduce_max(encoder_layer, axis=1)
            elif pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX:
                # Concatenate mean pooling and max pooling.
                pooled = tf.concat([
                    tf.reduce_mean(encoder_layer, axis=1),
                    tf.reduce_max(encoder_layer, axis=1)
                ], axis=1)
            elif (pooling_strategy == PoolingStrategy.FIRST_TOKEN or
                  pooling_strategy == PoolingStrategy.CLS_TOKEN):
                pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1)
            elif (pooling_strategy == PoolingStrategy.LAST_TOKEN or
                  pooling_strategy == PoolingStrategy.SEP_TOKEN):
                seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1), tf.int32)
                rng = tf.range(0, tf.shape(seq_len)[0])
                indexes = tf.stack([rng, seq_len - 1], 1)
                pooled = tf.gather_nd(encoder_layer, indexes)
            else:
                raise NotImplementedError()
            predictions = {"unique_id": unique_ids, "encodes": pooled}
            return EstimatorSpec(mode=mode, predictions=predictions)

        return model_fn

    def input_fn_builder(self, features, seq_length):
        """Creates an `input_fn` closure to be passed to Estimator."""
        all_unique_ids = []
        all_input_ids = []
        all_input_mask = []
        all_input_type_ids = []
        for feature in features:
            all_unique_ids.append(feature.unique_id)
            all_input_ids.append(feature.input_ids)
            all_input_mask.append(feature.input_mask)
            all_input_type_ids.append(feature.input_type_ids)

        def input_fn(params):
            """The actual input function."""
            batch_size = params["batch_size"]
            num_examples = len(features)
            # This is for demo purposes and does NOT scale to large data sets. We do
            # not use Dataset.from_generator() because that uses tf.py_func which is
            # not TPU compatible. The right way to load data is with TFRecordReader.
            d = tf.data.Dataset.from_tensor_slices({
                "unique_ids":
                    tf.constant(all_unique_ids, shape=[num_examples],
                                dtype=tf.int32),
                "input_ids":
                    tf.constant(all_input_ids,
                                shape=[num_examples, seq_length],
                                dtype=tf.int32),
                "input_mask":
                    tf.constant(all_input_mask,
                                shape=[num_examples, seq_length],
                                dtype=tf.int32),
                "input_type_ids":
                    tf.constant(all_input_type_ids,
                                shape=[num_examples, seq_length],
                                dtype=tf.int32),
            })
            d = d.batch(batch_size=batch_size, drop_remainder=False)
            return d

        return input_fn

    def sentence2feature(self, sentence, unique_id, seq_length):
        line = tokenization.convert_to_unicode(sentence)
        assert line
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        example = InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        tokens_a = self.tokenizer.tokenize(example.text_a)
        tokens_b = self.tokenizer.tokenize(
            example.text_b) if example.text_b else None
        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
        feature = self.tokens2feature(example.unique_id, tokens_a, tokens_b,
                                      seq_length)
        return feature

    def tokens2feature(self, unique_id, tokens_a, tokens_b, seq_length):
        """Converts tokens into an `InputFeatures` padded to `seq_length`."""
        tokens = []
        input_type_ids = []
        # Q part = a
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)
        # A part = b
        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)
        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        return InputFeatures(unique_id=unique_id,
                             tokens=tokens,
                             input_ids=input_ids,
                             input_mask=input_mask,
                             input_type_ids=input_type_ids)

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "named_entity": NamedEntityProcessor,
        "punct": PunctProcessor,
        "norm": NormProcessor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2:
        tf.logging.info("Use normal RunConfig")
        dist_strategy = tf.contrib.distribute.MirroredStrategy(
            num_gpus=FLAGS.num_gpu_cores,
            cross_device_ops=AllReduceCrossDeviceOps(
                'nccl', num_packs=FLAGS.num_gpu_cores),
        )
        log_every_n_steps = 8
        run_config = RunConfig(
            train_distribute=dist_strategy,
            eval_distribute=dist_strategy,
            log_step_count_steps=log_every_n_steps,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps)
    else:
        tf.logging.info("Use TPURunConfig")
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=FLAGS.master,
            model_dir=FLAGS.output_dir,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=FLAGS.iterations_per_loop,
                num_shards=FLAGS.num_tpu_cores,
                per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    init_checkpoint = FLAGS.init_checkpoint
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                use_gpu=FLAGS.use_gpu,
                                num_gpu_cores=FLAGS.num_gpu_cores,
                                fp16=FLAGS.use_fp16)
    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2:
        tf.logging.info("Use normal Estimator")
        estimator = Estimator(model_fn=model_fn, params={}, config=run_config)
    else:
        tf.logging.info("Use TPUEstimator")
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            batch_size=FLAGS.train_batch_size)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            # Eval will be slightly WRONG on the TPU because it will truncate
            # the last batch.
            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder,
            batch_size=FLAGS.eval_batch_size)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        if FLAGS.use_tpu:
            # Warning: According to tpu_estimator.py Prediction on TPU is an
            # experimental feature and hence not supported here
            raise ValueError("Prediction in TPU not supported")
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder,
            batch_size=FLAGS.predict_batch_size)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            for item in result:
                predictions = item['predictions']
                seq_len = item['seq_len']
                predictions = predictions[1:seq_len + 1]
                labels = []
                for pred in predictions:
                    labels.append(label_list[pred])
                writer.write(
                    tokenization.printable_text(' '.join(labels)) + '\n')
    if FLAGS.do_train and FLAGS.save_for_serving:
        serving_dir = os.path.join(FLAGS.output_dir, 'serving')
        is_tpu_estimator = not FLAGS.use_gpu or int(FLAGS.num_gpu_cores) < 2
        save_for_serving(estimator, serving_dir, FLAGS.max_seq_length,
                         is_tpu_estimator)

def main(_): tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 session_config = tf.ConfigProto(log_device_placement=True) session_config.gpu_options.per_process_gpu_memory_fraction = 0.7 session_config.gpu_options.allow_growth = True if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal RunConfig") # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps( 'nccl', num_packs=FLAGS.num_gpu_cores), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 8 run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) else: tf.logging.info("Use TPURunConfig") run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_meta = os.path.join(FLAGS.data_dir, "train.json") with open(train_meta, 'r') as f: d = json.load(f) num_train_example = d['num_train_example'] num_train_steps = int(num_train_example / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) init_checkpoint = FLAGS.init_checkpoint model_fn = model_fn_builder(bert_config=bert_config, num_labels=125, init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, use_gpu=FLAGS.use_gpu, num_gpu_cores=FLAGS.num_gpu_cores, fp16=FLAGS.use_fp16) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
if FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2: tf.logging.info("Use normal Estimator") estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: tf.logging.info("Use TPUEstimator") estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.data_dir, "train*.tfrecord") train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, batch_size=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_file = os.path.join(FLAGS.data_dir, "eval*.tfrecord") eval_meta = os.path.join(FLAGS.data_dir, "eval.json") with open(eval_meta, 'r') as f: d = json.load(f) num_eval_examples = d['num_eval_examples'] tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", num_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = FLAGS.eval_steps if eval_steps == 0: eval_steps = None eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: # Eval will be slightly WRONG on the TPU because it will truncate # the last batch.1 eval_steps = int(num_eval_examples / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, batch_size=FLAGS.eval_batch_size) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: pred_meta = os.path.join(FLAGS.data_dir, "predict.json") predict_file = os.path.join(FLAGS.data_dir, "predict*.tfrecord") with open(pred_meta, 'r') as f: d = json.load(f) num_pred_examples = d['num_pred_examples'] tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", num_pred_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, batch_size=FLAGS.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: tf.logging.info("***** Predict results *****") for prediction in result: output_line = "\t".join( str(class_probability) for class_probability in prediction) + "\n" writer.write(output_line) if FLAGS.do_train and FLAGS.save_for_serving: serving_dir = os.path.join(FLAGS.output_dir, 'serving') is_tpu_estimator = not FLAGS.use_gpu or int(FLAGS.num_gpu_cores) < 2 save_for_serving(estimator, serving_dir, FLAGS.max_seq_length, is_tpu_estimator)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, "qqp": QqpProcessor, 'chnsenticorp': ChnsenticorpProcessor, 'gt': GTProcessor, 'tcl': TCLProcessor } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=FLAGS.num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=FLAGS.num_gpu_cores), # cross_device_ops=AllReduceCrossDeviceOps('hierarchical_copy'), ) log_every_n_steps = 8 dist_run_config = RunConfig( train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps) tpu_run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) init_checkpoint = FLAGS.init_checkpoint is_multi_gpu = FLAGS.use_gpu and int(FLAGS.num_gpu_cores) >= 2 model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, use_gpu=FLAGS.use_gpu, num_gpu_cores=FLAGS.num_gpu_cores, fp16=FLAGS.use_fp16, weight_list = FLAGS.weight_list) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
if is_multi_gpu: estimator = Estimator( model_fn=model_fn, params={}, config=dist_run_config) else: estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=tpu_run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, batch_size=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) # TF Serving if FLAGS.save_for_serving: serving_dir = os.path.join(FLAGS.output_dir, 'serving') save_for_serving(estimator, serving_dir, FLAGS.max_seq_length, not is_multi_gpu) # Find the latest checkpoint max_idx = 0 for filename in os.listdir(FLAGS.output_dir): if filename.startswith('model.ckpt-'): max_idx = max(int(filename.split('.')[1].split('-')[1]), max_idx) init_checkpoint = os.path.join(FLAGS.output_dir, f'model.ckpt-{max_idx}') tf.logging.info(f'Current checkpoint: {init_checkpoint}') if FLAGS.do_eval: model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, use_gpu=FLAGS.use_gpu, num_gpu_cores=FLAGS.num_gpu_cores, fp16=FLAGS.use_fp16, weight_list = FLAGS.weight_list) eval_estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=tpu_run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, batch_size=FLAGS.eval_batch_size) result = eval_estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # dump result as json file (easy parsing for other tasks) class ExtEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.integer): return int(obj) if isinstance(obj, np.floating): return float(obj) if isinstance(obj, np.ndarray): return obj.tolist() else: return super(ExtEncoder, self).default(obj) output_eval_file2 = os.path.join(FLAGS.output_dir, "eval_results.json") with tf.gfile.GFile(output_eval_file2, "w") as writer: json.dump(result, writer, indent=4, cls=ExtEncoder) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, batch_size=FLAGS.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
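The numpy-aware JSON encoder used when dumping eval_results.json can be exercised on its own; a standalone sketch with made-up metric values (the class name NumpyEncoder and the numbers are illustrative):

import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    # Convert numpy scalars/arrays to plain Python types so json can serialize them.
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)

metrics = {'eval_accuracy': np.float32(0.91), 'loss': np.float32(0.33), 'global_step': np.int64(1000)}
print(json.dumps(metrics, indent=4, cls=NumpyEncoder))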
for i in gen(): print(i) return (tf.data.Dataset.from_generator(gen, output_types={ 'input_ids': tf.int32, 'input_mask': tf.int32, 'input_type_ids': tf.int32, }, output_shapes={ 'input_ids': (None, max_seq_len), 'input_mask': (None, max_seq_len), 'input_type_ids': (None, max_seq_len) }).prefetch(10)) return input_fn t1 = time.time() input_fn = input_fn_builder(["NLP好难啊!", "怎么办呢?"]) result = estimator.predict(input_fn) for rq in result: a = rq['encodes'] print(rq['encodes']) t2 = time.time() print(a.shape) print("cost time:", t2 - t1)
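A stripped-down sketch of the from_generator input pipeline this snippet relies on, with toy integer features in place of real tokenized inputs; max_seq_len and the dummy values are illustrative.

import tensorflow as tf

max_seq_len = 8

def gen():
    # Yield two dummy "batches" shaped like the BERT feature dicts above.
    for _ in range(2):
        yield {
            'input_ids': [[1] * max_seq_len],
            'input_mask': [[1] * max_seq_len],
            'input_type_ids': [[0] * max_seq_len],
        }

dataset = tf.data.Dataset.from_generator(
    gen,
    output_types={'input_ids': tf.int32,
                  'input_mask': tf.int32,
                  'input_type_ids': tf.int32},
    output_shapes={'input_ids': (None, max_seq_len),
                   'input_mask': (None, max_seq_len),
                   'input_type_ids': (None, max_seq_len)}).prefetch(10)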
def predict(): #FLAGS = common.model_config() tf.logging.set_verbosity(tf.logging.INFO) args.test_data = common.parse_path(args.test_data) model_config = common.loadJsonConfig( os.path.join(args.trained_model_dir, "model_config.json")) #args.added_layer_config = common.parse_path(args.added_layer_config) df = dataprocess.load_data(args.test_data) test_column_names = args.predict_column_names.split(' ') ckpt = tf.train.get_checkpoint_state(args.trained_model_dir) checkpoint_file = ckpt.model_checkpoint_path tokenization.validate_case_matches_checkpoint( model_config['do_lower_case'], checkpoint_file) # file = open(args.bert_model, 'r', encoding='utf-8') # sub_dir = file.read().strip('\n') # file.close() # bert_model_dir = args.bert_dir + sub_dir #bert_model_dir = args.bert_dir bert_config_file = os.path.join(args.trained_model_dir, "bert_config.json") bert_config = modeling.BertConfig.from_json_file(bert_config_file) if model_config['max_seq_length'] > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (model_config['max_seq_length'], bert_config.max_position_embeddings)) tf.gfile.MakeDirs(args.output_dir) processor = common.toxicCommentProcessor() vocab_file = os.path.join(args.trained_model_dir, "vocab.txt") tokenizer = tokenization.FullTokenizer( vocab_file=vocab_file, do_lower_case=model_config['do_lower_case']) tpu_cluster_resolver = None if args.use_tpu and args.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( args.tpu_name, zone=args.tpu_zone, project=args.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 if args.use_gpu and args.num_gpu_cores is None: num_gpu_cores = len([ x for x in device_lib.list_local_devices() if x.device_type == 'GPU' ]) else: num_gpu_cores = args.num_gpu_cores if args.use_gpu and int(num_gpu_cores) >= 2: tf.logging.info("Use normal RunConfig, GPU number: %d", num_gpu_cores) dist_strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=num_gpu_cores, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=num_gpu_cores)) log_every_n_steps = 8 run_config = RunConfig(train_distribute=dist_strategy, eval_distribute=dist_strategy, log_step_count_steps=log_every_n_steps) else: tf.logging.info("Use TPURunConfig") run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=args.master, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=args.iterations_per_loop, num_shards=args.num_tpu_cores, per_host_input_for_training=is_per_host)) num_train_steps = None num_warmup_steps = None learning_rate = None #added_layer = common.loadJsonConfig(args.added_layer_config) model = common.get_model(model_config['layer_name']) # model = common.get_model(args.add_layer) label_num = model_config['num_labels'] model_fn = common.model_fn_builder(bert_config=bert_config, is_training_bert=False, num_labels=label_num, init_checkpoint=checkpoint_file, learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=args.use_tpu, use_one_hot_embeddings=args.use_tpu, use_gpu=args.use_gpu, num_gpu_cores=num_gpu_cores, fp16=args.use_fp16, model=model) if args.use_gpu and int(num_gpu_cores) >= 2: tf.logging.info("Use normal Estimator") estimator = Estimator(model_fn=model_fn, params={}, config=run_config) else: tf.logging.info("Use TPUEstimator") # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = tf.contrib.tpu.TPUEstimator( use_tpu=args.use_tpu, model_fn=model_fn, config=run_config, predict_batch_size=args.predict_batch_size) predict_examples = processor.get_test_examples(df, test_column_names, label_num) num_actual_predict_examples = len(predict_examples) if args.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % args.predict_batch_size != 0: predict_examples.append(common.PaddingInputExample()) predict_file = os.path.join(args.output_dir, "predict.tf_record") if not os.path.isfile(predict_file): common.file_based_convert_examples_to_features( predict_examples, model_config['max_seq_length'], tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", args.predict_batch_size) predict_drop_remainder = True if args.use_tpu else False predict_input_fn = common.file_based_input_fn_builder( input_file=predict_file, seq_length=model_config['max_seq_length'], label_length=label_num, is_training=False, drop_remainder=predict_drop_remainder, batch_size=args.predict_batch_size) result = estimator.predict(input_fn=predict_input_fn) #output_predict_file = os.path.join(args.output_dir, "predict_results.csv") output_predict_file = common.generate_path(args.output_dir) #with tf.gfile.GFile(output_predict_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") predict_res = [] for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] neg_preds = np.zeros(shape=probabilities.shape, dtype=float) pos_preds = np.ones(shape=probabilities.shape, dtype=float) predictions = np.where(probabilities < 0.5, neg_preds, pos_preds) if i >= num_actual_predict_examples: break piece = np.r_[probabilities, predictions] predict_res.append(piece) num_written_lines += 1 output_colums = [] for i in range(len(predict_res[0]) // 2): col_name = "probability_" + str(i + 1) output_colums.append(col_name) for i in range(len(predict_res[0]) // 2): col_name = "prediction_" + str(i + 1) output_colums.append(col_name) out_df = pd.DataFrame(columns=output_colums, data=predict_res) print(out_df.head(3)) out_df = pd.concat([df, out_df], axis=1) print(out_df.head(3)) out_df.to_csv(output_predict_file, index=False)
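The thresholding and column layout at the end of predict() can be seen in isolation; a sketch with made-up probabilities for a three-label task (values and column names are illustrative):

import numpy as np
import pandas as pd

probabilities = np.array([0.9, 0.2, 0.7])               # illustrative model output
predictions = np.where(probabilities < 0.5, 0.0, 1.0)   # 0/1 decision per label
row = np.r_[probabilities, predictions]

columns = (['probability_%d' % (i + 1) for i in range(len(probabilities))] +
           ['prediction_%d' % (i + 1) for i in range(len(probabilities))])
out_df = pd.DataFrame([row], columns=columns)
print(out_df)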
class BertWorker(Process): def __init__(self, id, args, worker_address, sink_address, device_id): super().__init__() self.model_dir = args.model_dir self.config_fp = os.path.join(self.model_dir, 'bert_config.json') self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt') self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt') self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp) self.max_seq_len = args.max_seq_len self.worker_id = id self.daemon = True self.model_fn = model_fn_builder( bert_config=modeling.BertConfig.from_json_file(self.config_fp), init_checkpoint=self.checkpoint_fp, pooling_strategy=args.pooling_strategy, pooling_layer=args.pooling_layer, use_xla=args.xla ) os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id) config = tf.ConfigProto(device_count={'GPU': 0 if device_id < 0 else 1}) # session-wise XLA doesn't seem to work on tf 1.10 # if args.xla: # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config)) self.exit_flag = multiprocessing.Event() self.logger = set_logger(colored('WORKER-%d' % self.worker_id, 'yellow')) self.worker_address = worker_address self.sink_address = sink_address self.prefetch_factor = 10 def close(self): self.logger.info('shutting down...') self.exit_flag.set() self.terminate() self.join() self.logger.info('terminated!') def run(self): context = zmq.Context() receiver = context.socket(zmq.PULL) receiver.connect(self.worker_address) sink = context.socket(zmq.PUSH) sink.connect(self.sink_address) input_fn = self.input_fn_builder(receiver) for r in self.estimator.predict(input_fn, yield_single_examples=False): send_ndarray(sink, r['client_id'], r['encodes']) self.logger.info('job done\tsize: %s\tclient: %s' % (r['encodes'].shape, r['client_id'])) receiver.close() sink.close() context.term() self.logger.info('terminated!') def input_fn_builder(self, worker): def gen(): self.logger.info('ready and listening!') while not self.exit_flag.is_set(): client_id, msg = worker.recv_multipart() msg = jsonapi.loads(msg) self.logger.info('new job\tsize: %d\tclient: %s' % (len(msg), client_id)) tmp_f = list(convert_lst_to_features(msg, self.max_seq_len, self.tokenizer)) yield { 'client_id': client_id, 'input_ids': [f.input_ids for f in tmp_f], 'input_mask': [f.input_mask for f in tmp_f], 'input_type_ids': [f.input_type_ids for f in tmp_f] } def input_fn(): return (tf.data.Dataset.from_generator( gen, output_types={'input_ids': tf.int32, 'input_mask': tf.int32, 'input_type_ids': tf.int32, 'client_id': tf.string}, output_shapes={ 'client_id': (), 'input_ids': (None, self.max_seq_len), 'input_mask': (None, self.max_seq_len), 'input_type_ids': (None, self.max_seq_len)}).prefetch(self.prefetch_factor)) return input_fn
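Stripped of the BERT parts, this worker assumes a plain ZeroMQ PULL/PUSH pipeline: two-frame jobs (client id + payload) arrive on a ventilator socket and results are pushed to a sink. A minimal sketch of that wiring; the ipc addresses, the two-frame assumption, and the echo behaviour are illustrative only.

import zmq

context = zmq.Context()
receiver = context.socket(zmq.PULL)            # jobs come in here
receiver.connect('ipc:///tmp/worker.jobs')
sink = context.socket(zmq.PUSH)                # results go out here
sink.connect('ipc:///tmp/worker.sink')

while True:
    client_id, payload = receiver.recv_multipart()   # assumes exactly two frames per job
    sink.send_multipart([client_id, payload])        # echo the payload instead of encoding it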
class BertWorker(Process): def __init__(self, id, args): super().__init__() self.model_dir = args.model_dir self.config_fp = os.path.join(self.model_dir, 'bert_config.json') self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt') self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt') self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp) self.max_seq_len = args.max_seq_len self.worker_id = id self.daemon = True self.model_fn = model_fn_builder( bert_config=modeling.BertConfig.from_json_file(self.config_fp), init_checkpoint=self.checkpoint_fp) os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id) self.estimator = Estimator(self.model_fn) self.dest = None self._start_t = time.perf_counter() self.socket = None self.exit_flag = multiprocessing.Event() def close(self): logger.info('shutting down bert-worker %d ...' % self.worker_id) self.exit_flag.set() self.terminate() self.join() logger.info('bert-worker %d is terminated!' % self.worker_id) def run(self): self.socket = zmq.Context().socket(zmq.REQ) self.socket.identity = u'worker-{}'.format( self.worker_id).encode('ascii') self.socket.connect('ipc:///tmp/bert.service') input_fn = self.input_fn_builder(self.socket) self.socket.send(b'READY') logger.info('worker %d is ready and listening' % self.worker_id) for r in self.estimator.predict(input_fn, yield_single_examples=False): send_ndarray(self.socket, self.dest, r) time_used = time.perf_counter() - self._start_t logger.info('job %s is done in %.2fs' % (self.dest, time_used)) def input_fn_builder(self, worker): def gen(): while not self.exit_flag.is_set(): self.dest, empty, msg = worker.recv_multipart() self._start_t = time.perf_counter() msg = pickle.loads(msg) if BertClient.is_valid_input(msg): tmp_f = list( convert_lst_to_features(msg, self.max_seq_len, self.tokenizer)) yield { 'input_ids': [f.input_ids for f in tmp_f], 'input_mask': [f.input_mask for f in tmp_f], 'input_type_ids': [f.input_type_ids for f in tmp_f] } else: logger.warning( 'worker %s: received unsupported type! sending back None' % self.dest) worker.send_multipart([self.dest, b'', b'']) worker.close() def input_fn(): return (tf.data.Dataset.from_generator( gen, output_types={ k: tf.int32 for k in ['input_ids', 'input_mask', 'input_type_ids'] }, output_shapes={ 'input_ids': (None, self.max_seq_len), 'input_mask': (None, self.max_seq_len), 'input_type_ids': (None, self.max_seq_len) })) return input_fn
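Both worker variants pin each process to a single GPU by masking devices via CUDA_VISIBLE_DEVICES before any TensorFlow session is created in the child process. A standalone sketch of that pattern, assuming TF 1.x; device_id is illustrative.

import os

device_id = 0   # a negative id would hide all GPUs and force CPU
os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id) if device_id >= 0 else ''

import tensorflow as tf   # imported after the mask so the session only sees one device

config = tf.ConfigProto(device_count={'GPU': 0 if device_id < 0 else 1})
config.gpu_options.allow_growth = True   # allocate GPU memory lazily rather than all at once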