def predict_squad(strategy, input_meta_data):
    """Runs SQuAD prediction using a WordPiece tokenizer built from flags.

    Args:
      strategy: forwarded unchanged to `run_squad_helper.predict_squad`.
      input_meta_data: forwarded unchanged to `run_squad_helper.predict_squad`.
    """
    config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
    tokenizer_options = dict(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
    )
    wordpiece_tokenizer = tokenization.FullTokenizer(**tokenizer_options)
    run_squad_helper.predict_squad(
        strategy,
        input_meta_data,
        wordpiece_tokenizer,
        config,
        squad_lib_wp,
    )
def __init__(self,
             bert_config_file,
             bert_init_ckpt,
             bert_max_seq_length,
             bert_vocab_file=None,
             do_lower_case=None):
    """Stores the BERT settings and optionally creates a tokenizer.

    Args:
      bert_config_file: (string) path to Bert configuration file.
      bert_init_ckpt: (string) path to pretrained Bert checkpoint.
      bert_max_seq_length: (int) maximum input sequence length (#words)
        after WordPiece tokenization. Sequences longer than this will be
        truncated, and shorter than this will be padded.
      bert_vocab_file (optional): (string) path to Bert vocabulary file.
      do_lower_case (optional): (bool) whether to lower case the input
        text. This should be aligned with the `vocab_file`.
    """
    self._bert_config_file = bert_config_file
    self._bert_init_ckpt = bert_init_ckpt
    self._bert_max_seq_length = bert_max_seq_length

    # The tokenizer stays unset (None) unless both tokenizer arguments are
    # provided.
    self._tokenizer = None
    tokenizer_requested = not (bert_vocab_file is None or do_lower_case is None)
    if tokenizer_requested:
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=bert_vocab_file,
            do_lower_case=do_lower_case,
        )
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data.

    The task processor is chosen by `FLAGS.classification_task_name` and the
    tokenizer by `FLAGS.tokenizer_impl` ("word_piece" or "sentence_piece").

    Returns:
      Whatever `classifier_data_lib.generate_tf_record_from_data_file`
      returns for the selected processor and tokenizer.

    Raises:
      ValueError: if a required flag is unset, the task name is unknown, or
        `FLAGS.tokenizer_impl` is not a supported implementation.
    """
    # Explicit raises instead of `assert`: asserts are stripped under `-O`,
    # which would let a bad tokenizer_impl fall through to SentencePiece.
    if not (FLAGS.input_data_dir and FLAGS.classification_task_name):
        raise ValueError(
            "Both input_data_dir and classification_task_name must be set.")

    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "mnli": classifier_data_lib.MnliProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "xnli": classifier_data_lib.XnliProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenizer_impl == "sentence_piece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(
            tokenization.preprocess_text, lower=FLAGS.do_lower_case)
    else:
        raise ValueError(
            "Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

    processor = processors[task_name](processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
def main(_):
    """Builds training instances from the input files and writes them out.

    Input and output locations come from the comma-separated
    `FLAGS.input_file` (glob patterns) and `FLAGS.output_file` flags.
    """
    wordpiece_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # Expand every comma-separated glob pattern, keeping pattern order.
    input_files = [
        matched
        for pattern in FLAGS.input_file.split(",")
        for matched in tf.io.gfile.glob(pattern)
    ]

    logging.info("*** Reading from input files ***")
    for path in input_files:
        logging.info(" %s", path)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files,
        wordpiece_tokenizer,
        FLAGS.max_seq_length,
        FLAGS.dupe_factor,
        FLAGS.short_seq_prob,
        FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq,
        rng,
        FLAGS.do_whole_word_mask,
        FLAGS.max_ngram_size)

    output_files = FLAGS.output_file.split(",")
    logging.info("*** Writing to output files ***")
    for path in output_files:
        logging.info(" %s", path)

    write_instance_to_example_files(
        instances,
        wordpiece_tokenizer,
        FLAGS.max_seq_length,
        FLAGS.max_predictions_per_seq,
        output_files,
        FLAGS.gzip_compress)
def test_single_cell(self, cell, text, exepected=None):
    """Detokenizes the token ids of `text` against a one-cell table.

    The result is compared with `exepected`, which defaults to `text`.
    """
    vocab_tokens = [
        "a",
        "b",
        "bb",
        "##b",
        "3",
        ".",
        "5",
        "insti",
        "##tuto",
        "reacao",
        "##d",
    ]
    with tempfile.TemporaryDirectory() as temp_dir:
        vocab_file = os.path.join(temp_dir, "vocab.txt")
        self._get_vocab_file(vocab_file, vocab_tokens)

        detokenizer = e2e_eval_utils.DeTokenizer(vocab_file)
        tokenizer = tokenization.FullTokenizer(
            vocab_file,
            do_lower_case=True,
            split_on_punc=True,
        )

        # Table with a single row holding a single cell.
        table = interaction_pb2.Table()
        table.rows.add().cells.add().text = cell

        tokens = tokenizer.tokenize(text)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        actual = detokenizer.detokenize(table, token_ids)

        wanted = text if exepected is None else exepected
        self.assertEqual(actual, wanted)
def __init__(self, max_sequence_length: int, **kwargs):
    """Initializes the parent, then sets up a lower-casing BERT tokenizer.

    Args:
      max_sequence_length: stored on the instance as `max_sequence_length`.
      **kwargs: forwarded to the parent constructor.
    """
    super().__init__(**kwargs)
    tokenizer_options = {
        "vocab_file": constants.BERT_VOCAB_PATH,
        "do_lower_case": True,
    }
    self.tokenizer = tokenization.FullTokenizer(**tokenizer_options)
    self.max_sequence_length = max_sequence_length
def generate_regression_dataset():
    """Generates regression dataset and returns input meta data.

    The tokenizer is chosen by `FLAGS.tokenizer_impl` ("word_piece" or
    "sentence_piece"); the data processor is built from `FLAGS.tfds_params`.

    Returns:
      Whatever `classifier_data_lib.generate_tf_record_from_data_file`
      returns for the TFDS processor and selected tokenizer.

    Raises:
      ValueError: if `FLAGS.tokenizer_impl` is unsupported or
        `FLAGS.tfds_params` is not set.
    """
    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenizer_impl == "sentence_piece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(
            tokenization.preprocess_text, lower=FLAGS.do_lower_case)
    else:
        # Raise instead of `assert`: asserts vanish under `-O`, which would
        # silently select the SentencePiece tokenizer for any bad value.
        raise ValueError(
            "Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

    if not FLAGS.tfds_params:
        raise ValueError(
            "No data processor found for the given regression task.")

    processor = classifier_data_lib.TfdsProcessor(
        tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        None,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        test_data_output_path=FLAGS.test_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
def generate_retrieval_dataset():
    """Generate retrieval test and dev dataset and returns input meta data.

    The tokenizer is chosen by `FLAGS.tokenizer_impl` ("word_piece" or
    "sentence_piece") and the processor by `FLAGS.retrieval_task_name`.

    Returns:
      Whatever
      `sentence_retrieval_lib.generate_sentence_retrevial_tf_record` returns.

    Raises:
      ValueError: if a required flag is unset, `FLAGS.tokenizer_impl` is
        unsupported, or the task name is unknown.
    """
    # Explicit raises instead of `assert`, which is stripped under `-O`.
    if not (FLAGS.input_data_dir and FLAGS.retrieval_task_name):
        raise ValueError(
            "Both input_data_dir and retrieval_task_name must be set.")

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenizer_impl == "sentence_piece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(
            tokenization.preprocess_text, lower=FLAGS.do_lower_case)
    else:
        raise ValueError(
            "Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

    processors = {
        "bucc": sentence_retrieval_lib.BuccProcessor,
        "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
    }
    task_name = FLAGS.retrieval_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)
    return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        FLAGS.eval_data_output_path,
        FLAGS.test_data_output_path,
        FLAGS.max_seq_length)
def get_bert_tokenizer(bert_layer):
    """Builds a `FullTokenizer` from the assets exposed by `bert_layer`.

    The vocabulary path and lower-casing setting are read from
    `bert_layer.resolved_object`.
    """
    vocab_path = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    return tokenization.FullTokenizer(vocab_path, lower_case)
def generate_tagging_dataset():
    """Generates tagging dataset.

    Picks the processor from `FLAGS.tagging_task_name` and the tokenizer from
    `FLAGS.tokenizer_impl`, then delegates to
    `tagging_data_lib.generate_tf_record_from_data_file`.

    Raises:
      ValueError: for an unknown task name or unsupported tokenizer_impl.
    """
    processors = {
        "panx": tagging_data_lib.PanxProcessor,
        "udpos": tagging_data_lib.UdposProcessor,
    }
    task_name = FLAGS.tagging_task_name.lower()
    processor_cls = processors.get(task_name)
    if processor_cls is None:
        raise ValueError("Task not found: %s" % task_name)

    if FLAGS.tokenizer_impl == "word_piece":
        processor_text_fn = tokenization.convert_to_unicode
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    elif FLAGS.tokenizer_impl == "sentence_piece":
        processor_text_fn = functools.partial(
            tokenization.preprocess_text, lower=FLAGS.do_lower_case)
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
    else:
        raise ValueError(
            "Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

    return tagging_data_lib.generate_tf_record_from_data_file(
        processor_cls(),
        FLAGS.input_data_dir,
        tokenizer,
        FLAGS.max_seq_length,
        FLAGS.train_data_output_path,
        FLAGS.eval_data_output_path,
        FLAGS.test_data_output_path,
        processor_text_fn)
def eval_squad(strategy, input_meta_data):
    """Evaluates on SQuAD and returns the helper's evaluation metrics.

    Args:
      strategy: forwarded unchanged to `run_squad_helper.eval_squad`.
      input_meta_data: forwarded unchanged to `run_squad_helper.eval_squad`.

    Returns:
      The value returned by `run_squad_helper.eval_squad`.
    """
    config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
    wordpiece_tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
    )
    return run_squad_helper.eval_squad(
        strategy,
        input_meta_data,
        wordpiece_tokenizer,
        config,
        squad_lib_wp,
    )
def build(self):
    """Lazily resolves the vocabulary file and creates the tokenizer.

    Does nothing when `self.is_built` is already truthy.
    """
    if not self.is_built:
        resolved_dir = registry.resolver(self.uri)
        self.vocab_file = os.path.join(resolved_dir, 'assets', 'vocab.txt')
        self.tokenizer = tokenization.FullTokenizer(
            self.vocab_file, self.do_lower_case)
def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True):
    """Creates the helper tokenizers and caches the vocabulary keys.

    Args:
      vocab_file: vocabulary file passed to the full tokenizer.
      do_lower_case: forwarded to the full tokenizer only; both basic
        tokenizers are created with lower-casing disabled.
      split_on_punc: forwarded to the punctuation and full tokenizers.
    """
    make_basic = tokenization.BasicTokenizer
    self._whitespace_tokenizer = make_basic(
        do_lower_case=False,
        split_on_punc=False,
    )
    self._punctuation_tokenizer = make_basic(
        do_lower_case=False,
        split_on_punc=split_on_punc,
    )
    full_tokenizer = tokenization.FullTokenizer(
        vocab_file,
        do_lower_case=do_lower_case,
        split_on_punc=split_on_punc,
    )
    self._full_tokenizer = full_tokenizer
    self._vocab = list(full_tokenizer.vocab.keys())
def _preprocess_eval_data(self, params):
    """Converts the SQuAD evaluation examples into a TFRecord of features.

    Args:
      params: configuration object; this method reads its input path,
        preprocessing directory, sequence/query/stride lengths, batch size,
        tokenization kind, vocab file and lower-casing attributes.

    Returns:
      A tuple `(record_filename, eval_examples, eval_features)`.

    Raises:
      ValueError: if no directory is available for the intermediate TFRecord
        file, or `params.tokenization` is neither 'SentencePiece' nor
        'WordPiece'.
    """
    examples = self.squad_lib.read_squad_examples(
        input_file=params.input_path,
        is_training=False,
        version_2_with_negative=params.version_2_with_negative)

    output_dir = params.input_preprocessed_data_path or self.logging_dir
    if not output_dir:
        raise ValueError(
            'You must specify a temporary directory, either in '
            'params.input_preprocessed_data_path or logging_dir to '
            'store intermediate evaluation TFRecord data.')

    record_path = os.path.join(output_dir, 'eval.tf_record')
    eval_writer = self.squad_lib.FeatureWriter(
        filename=record_path, is_training=False)
    features = []

    def _append_feature(feature, is_padding):
        # Padding features are written to the record but not kept in memory.
        if not is_padding:
            features.append(feature)
        eval_writer.process_feature(feature)

    # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
    # BERT preprocesses in a class, Q, P order.
    xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'

    kwargs = {
        'examples': examples,
        'max_seq_length': params.seq_length,
        'doc_stride': params.doc_stride,
        'max_query_length': params.query_length,
        'is_training': False,
        'output_fn': _append_feature,
        'batch_size': params.global_batch_size,
        'xlnet_format': xlnet_ordering,
    }

    tokenization_kind = params.tokenization
    if tokenization_kind not in ('SentencePiece', 'WordPiece'):
        raise ValueError('Unexpected tokenization: %s' % params.tokenization)
    if tokenization_kind == 'SentencePiece':
        # squad_lib_sp requires one more argument 'do_lower_case'.
        kwargs['do_lower_case'] = params.do_lower_case
        kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
            sp_model_file=params.vocab_file)
    else:
        kwargs['tokenizer'] = tokenization.FullTokenizer(
            vocab_file=params.vocab_file,
            do_lower_case=params.do_lower_case)

    dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
    eval_writer.close()

    logging.info('***** Evaluation input stats *****')
    logging.info(' Num orig examples = %d', len(examples))
    logging.info(' Num split examples = %d', len(features))
    logging.info(' Batch size = %d', params.global_batch_size)
    logging.info(' Dataset size = %d', dataset_size)

    return eval_writer.filename, examples, features
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Generates and saves training data into a tf record file.

    Arguments:
      processor: Input processor object to be used for generating data.
        Subclass of `DataProcessor`.
      data_dir: Directory that contains train/eval data to process. Data
        files should be in from "dev.tsv", "test.tsv", or "train.tsv".
      vocab_file: Text file with words to be used for training/evaluation.
      train_data_output_path: Output to which processed tf record for
        training will be saved.
      eval_data_output_path: Output to which processed tf record for
        evaluation will be saved.
      max_seq_length: Maximum sequence length of the to be generated
        training/eval data.
      do_lower_case: Whether to lower case input text.

    Returns:
      A dictionary containing input meta data.
    """
    assert train_data_output_path or eval_data_output_path

    labels = processor.get_labels()
    wordpiece_tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    # Training output is mandatory: the meta data always reports its size.
    assert train_data_output_path
    train_examples = processor.get_train_examples(data_dir)
    file_based_convert_examples_to_features(
        train_examples,
        labels,
        max_seq_length,
        wordpiece_tokenizer,
        train_data_output_path)
    num_training_data = len(train_examples)

    dev_examples = None
    if eval_data_output_path:
        dev_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(
            dev_examples,
            labels,
            max_seq_length,
            wordpiece_tokenizer,
            eval_data_output_path)

    meta_data = dict(
        task_type="bert_classification",
        processor_type=processor.get_processor_name(),
        num_labels=len(processor.get_labels()),
        train_data_size=num_training_data,
        max_seq_length=max_seq_length,
    )
    if eval_data_output_path:
        meta_data["eval_data_size"] = len(dev_examples)

    return meta_data
def create_tokenizer(self):
    """Creates `self.tokenizer` from the assets of `self.bert_layer`.

    The vocabulary path and lower-casing setting are read from
    `self.bert_layer.resolved_object`.

    :return: None
    """
    vocab_path = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
    lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer_options = {
        "vocab_file": vocab_path,
        "do_lower_case": lower_case,
    }
    self.tokenizer = tokenization.FullTokenizer(**tokenizer_options)
def main(_):
    """Builds the NER train/eval inputs from flags and runs BERT.

    Raises:
      NotImplementedError: if `FLAGS.mode` is 'export_only'.
      ValueError: if `FLAGS.mode` is anything other than 'train_and_eval'.
    """
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert20/'

    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        FLAGS.vocab_file, do_lower_case=True)

    # Validate the mode up front so an unsupported value fails before the
    # distribution strategy and datasets are built.
    if FLAGS.mode == 'export_only':
        raise NotImplementedError()
    if FLAGS.mode != 'train_and_eval':
        raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)

    eval_dataset = NERDataset(
        tokenizer,
        FLAGS.eval_data_path,
        FLAGS.mode,
        FLAGS.label_file,
        FLAGS.max_seq_length)
    eval_input_fn = get_dataset_fn(
        eval_dataset,
        FLAGS.eval_batch_size,
        is_training=False,
        pad_value=_PADDING_LABEL_ID)

    train_dataset = NERDataset(
        tokenizer,
        FLAGS.train_data_path,
        FLAGS.mode,
        FLAGS.label_file,
        FLAGS.max_seq_length)
    train_input_fn = get_dataset_fn(
        train_dataset,
        FLAGS.train_batch_size,
        is_training=True,
        pad_value=_PADDING_LABEL_ID)

    # Explicit size flags take precedence over the sizes the datasets report.
    input_meta_data = {
        "max_seq_length": FLAGS.max_seq_length,
        "num_labels": train_dataset.label_num,
        "train_data_size": (FLAGS.train_data_size
                            if FLAGS.train_data_size
                            else train_dataset.data_size),
        "eval_data_size": (FLAGS.eval_data_size
                           if FLAGS.eval_data_size
                           else eval_dataset.data_size),
        "id2label": train_dataset.id2label_map,
    }

    run_bert(
        strategy,
        input_meta_data,
        bert_config,
        train_input_fn,
        eval_input_fn,
    )
def __init__(self, vocab_dir):
    """Sets fixed SQuAD settings and builds a tokenizer over a tiny vocab.

    A small `vocab.txt` is written into `vocab_dir` and used to create a
    lower-casing `FullTokenizer`.

    Args:
      vocab_dir: directory in which `vocab.txt` is created.
    """
    self.seq_len = 384
    self.predict_batch_size = 8
    self.query_len = 64
    self.doc_stride = 128

    vocab_tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'good', 'bad',
    ]
    vocab_path = os.path.join(vocab_dir, 'vocab.txt')
    with open(vocab_path, 'w') as vocab_out:
        vocab_out.write('\n'.join(vocab_tokens))

    self.tokenizer = tokenization.FullTokenizer(
        vocab_path, do_lower_case=True)
def main(args):
    """Prints `args.input`, its tokens, and the corresponding token ids.

    Args:
      args: object providing `vocab_file` and `input` attributes.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file,
        do_lower_case=True,
    )

    print('Input:')
    print(args.input)

    print('\nTokenized:')
    tokens = tokenizer.tokenize(args.input)
    print(tokens)

    print('\nTokenized and converted to IDs:')
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(token_ids)
def main(args):
    """Prints `args.input`, its tokens, and the corresponding token ids.

    The vocabulary file name is looked up in `PRETRAINED_MODELS` under
    `args.model_class` and resolved relative to `../vocabs`.

    Args:
      args: object providing `model_class` and `input` attributes.
    """
    vocab_name = PRETRAINED_MODELS[args.model_class]['vocab_file']
    vocab_path = os.path.join('..', 'vocabs', vocab_name)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path,
        do_lower_case=True,
    )

    print('Input:')
    print(args.input)

    print('\nTokenized:')
    tokens = tokenizer.tokenize(args.input)
    print(tokens)

    print('\nTokenized and converted to IDs:')
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(token_ids)
def load_bert_model(
        model_path="C:/Users/USER-PC/Downloads/bert_en_uncased_L-12_H-768_A-12_2"):
    """Loads a BERT hub layer and builds the matching tokenizer.

    Args:
      model_path: location handed to `hub.KerasLayer`. Defaults to the
        previously hard-coded local path so existing callers keep working;
        pass a different path to load the model from elsewhere.

    Returns:
      A tuple `(label_list, max_seq_length, tokenizer)`, where `label_list`
      is `[0, 1]` and `max_seq_length` is `60`.
    """
    label_list = [0, 1]  # Label categories
    max_seq_length = 60  # maximum length of (token) input sequences

    bert_layer = hub.KerasLayer(model_path, trainable=True)
    # The tokenizer settings are read from the loaded layer's own assets.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    return label_list, max_seq_length, tokenizer
def __init__(
    self,
    dstc8_data_dir,
    collection,
    vocab_file="./models/uncased_L-12_H-768_A-12/vocab.txt",
    do_lower_case=True,
    max_seq_length=DEFAULT_MAX_SEQ_LENGTH,
):
    """Stores the data location and settings and creates a BERT tokenizer.

    Args:
      dstc8_data_dir: stored on the instance as `dstc8_data_dir`.
      collection: key used to look up the file ranges in `FILE_RANGES`.
      vocab_file: vocabulary file for the tokenizer.
      do_lower_case: lower-casing setting for the tokenizer.
      max_seq_length: stored on the instance as `_max_seq_length`.
    """
    self.dstc8_data_dir = dstc8_data_dir
    self._file_ranges = FILE_RANGES[collection]

    # BERT tokenizer
    tokenizer_options = {
        "vocab_file": vocab_file,
        "do_lower_case": do_lower_case,
    }
    self._tokenizer = tokenization.FullTokenizer(**tokenizer_options)
    self._max_seq_length = max_seq_length
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data.

    Uses a WordPiece tokenizer built from flags and the Weibo processor.

    Returns:
      Whatever
      `classifier_data_lib.generate_predict_tf_record_from_data_file`
      returns.

    Raises:
      ValueError: if `FLAGS.input_data_dir` is not set.
    """
    # Explicit raise instead of `assert`, which is stripped under `-O`.
    if not FLAGS.input_data_dir:
        raise ValueError("input_data_dir must be set.")

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
    processor = classifier_data_lib.WeiboProcessor(processor_text_fn)
    return classifier_data_lib.generate_predict_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        predict_data_output_path=FLAGS.predict_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
def load_pre_trained_bert_tf_hub_from_url(self, url=TF_HUB_URL, trainable_flag=True):
    """Loads the TF Hub pre-trained BERT model from a URL.

    Stores the hub layer, records the trainable flag in `self.info_dict`,
    and creates the tokenizer from the layer's vocabulary and lower-casing
    assets.

    Args:
        url (str[optional]): URL string of the TF Hub link (default url is
            stored in TF_HUB_URL)
        trainable_flag (bool[optional]): True if the BERT layer weights
            should be trained as well (highly suggested), False otherwise.
            Default is True
    """
    bert_layer = hub.KerasLayer(url, trainable=trainable_flag)
    self.pre_trained_bert_layer = bert_layer
    bert_layer._name = "bert_layer"
    self.info_dict.update([("trained_bert_weights", trainable_flag)])

    self.vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    self.do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    self.tokenizer = tokenization.FullTokenizer(
        self.vocab_file, self.do_lower_case)
def test_generate_tf_record(self, task_type):
    """Checks the record files produced for fake tagging data.

    Fake train, dev and per-language test files are written, records are
    generated from them, and the reported training size and the set of
    output files are verified.
    """
    processor = self.processors[task_type]()
    input_data_dir = os.path.join(self.get_temp_dir(), task_type)
    tf.io.gfile.mkdir(input_data_dir)

    # Write fake train and dev files.
    for file_name in ("train-en.tsv", "dev-en.tsv"):
        _create_fake_file(
            os.path.join(input_data_dir, file_name),
            processor.get_labels(),
            is_test=False)

    # Write fake test files, one per supported language.
    for lang in processor.supported_languages:
        _create_fake_file(
            os.path.join(input_data_dir, "test-%s.tsv" % lang),
            processor.get_labels(),
            is_test=True)

    output_path = os.path.join(self.get_temp_dir(), task_type, "output")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=self.vocab_file, do_lower_case=True)
    metadata = tagging_data_lib.generate_tf_record_from_data_file(
        processor,
        input_data_dir,
        tokenizer,
        max_seq_length=8,
        train_data_output_path=os.path.join(output_path, "train.tfrecord"),
        eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
        test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
        text_preprocessing=tokenization.convert_to_unicode)

    self.assertEqual(metadata["train_data_size"], 5)

    expected_files = [
        os.path.join(output_path, "train.tfrecord"),
        os.path.join(output_path, "eval.tfrecord"),
    ]
    expected_files += [
        os.path.join(output_path, "test_%s.tfrecord" % lang)
        for lang in processor.supported_languages
    ]
    files = tf.io.gfile.glob(output_path + "/*")
    self.assertCountEqual(files, expected_files)
def setUp(self):
    """Prepares the processors map and a tokenizer over a small vocab.

    The vocabulary is written to a named temporary file, which is removed
    again when the test finishes.
    """
    import os  # Local import: only needed to remove the temporary vocab file.

    super(BertClassifierLibTest, self).setUp()
    self.model_dir = self.get_temp_dir()
    self.processors = {
        "CB": classifier_data_lib.CBProcessor,
        "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
        "BOOLQ": classifier_data_lib.BoolQProcessor,
    }

    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        vocab_writer.write(
            "".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
    vocab_file = vocab_writer.name
    # delete=False keeps the file after the `with` block so the tokenizer
    # can read it; remove it explicitly at the end of the test so files do
    # not accumulate.
    self.addCleanup(os.remove, vocab_file)

    self.tokenizer = tokenization.FullTokenizer(vocab_file)
def load_pre_trained_bert_tf_hub_from_dir(self, bert_tf_hub_dir, trainable_flag=True):
    """Loads the TF Hub pre-trained BERT model from local disk.

    The TF Hub module can be downloaded on local disk with the following
    commands:
    !wget "https://storage.googleapis.com/tfhub-modules/tensorflow/bert_en_uncased_L-12_H-768_A-12/bert_en_uncased_L-12_H-768_A-12_2.tar.gz"
    !tar -xvf '/bert_en_uncased_L-12_H-768_A-12_2.tar.gz' -C 'saved_models/pre_trained/bert_en_uncased_L-12_H-768_A-12_2'

    Stores the hub layer, records the trainable flag in `self.info_dict`,
    and creates the tokenizer from the layer's vocabulary and lower-casing
    assets.

    Args:
        bert_tf_hub_dir (str): PATH string of the TF Hub directory
        trainable_flag (bool[optional]): True if the BERT layer weights
            should be trained as well (highly suggested), False otherwise.
            Default is True
    """
    bert_layer = hub.KerasLayer(bert_tf_hub_dir, trainable=trainable_flag)
    self.pre_trained_bert_layer = bert_layer
    bert_layer._name = "bert_layer"
    self.info_dict.update([("trained_bert_weights", trainable_flag)])

    self.vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    self.do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    self.tokenizer = tokenization.FullTokenizer(
        self.vocab_file, self.do_lower_case)
def _preprocess_eval_data(self, params):
    """Converts the SQuAD evaluation examples into a TFRecord of features.

    Args:
      params: configuration object; this method reads its input path,
        preprocessing directory, sequence/query/stride lengths, batch size,
        tokenization kind, vocab file and lower-casing attributes.

    Returns:
      A tuple `(record_filename, eval_examples, eval_features)`.
    """
    eval_examples = self.squad_lib.read_squad_examples(
        input_file=params.input_path,
        is_training=False,
        version_2_with_negative=params.version_2_with_negative)

    # Falls back to '/tmp' when no preprocessing directory is configured.
    temp_file_path = params.input_preprocessed_data_path or '/tmp'
    eval_writer = self.squad_lib.FeatureWriter(
        filename=os.path.join(temp_file_path, 'eval.tf_record'),
        is_training=False)
    eval_features = []

    def _append_feature(feature, is_padding):
        # Padding features are written to the record but not kept in memory.
        if not is_padding:
            eval_features.append(feature)
        eval_writer.process_feature(feature)

    kwargs = dict(
        examples=eval_examples,
        max_seq_length=params.seq_length,
        doc_stride=params.doc_stride,
        max_query_length=params.query_length,
        is_training=False,
        output_fn=_append_feature,
        batch_size=params.global_batch_size)

    if params.tokenization == 'SentencePiece':
        # squad_lib_sp requires one more argument 'do_lower_case', and it
        # needs a SentencePiece tokenizer rather than a WordPiece one;
        # `params.vocab_file` is then the SentencePiece model file.
        kwargs['do_lower_case'] = params.do_lower_case
        kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
            sp_model_file=params.vocab_file)
    else:
        kwargs['tokenizer'] = tokenization.FullTokenizer(
            vocab_file=params.vocab_file,
            do_lower_case=params.do_lower_case)

    try:
        eval_dataset_size = self.squad_lib.convert_examples_to_features(
            **kwargs)
    finally:
        # Close the record writer even if feature conversion fails.
        eval_writer.close()

    logging.info('***** Evaluation input stats *****')
    logging.info(' Num orig examples = %d', len(eval_examples))
    logging.info(' Num split examples = %d', len(eval_features))
    logging.info(' Batch size = %d', params.global_batch_size)
    logging.info(' Dataset size = %d', eval_dataset_size)

    return eval_writer.filename, eval_examples, eval_features
def __init__(self,
             dstc8_data_dir,
             train_file_range,
             dev_file_range,
             test_file_range,
             vocab_file,
             do_lower_case,
             max_seq_length=DEFAULT_MAX_SEQ_LENGTH,
             log_data_warnings=False):
    """Stores the data location and settings and creates a BERT tokenizer.

    Args:
      dstc8_data_dir: stored on the instance as `dstc8_data_dir`.
      train_file_range: file range stored under the "train" key.
      dev_file_range: file range stored under the "dev" key.
      test_file_range: file range stored under the "test" key.
      vocab_file: vocabulary file for the tokenizer.
      do_lower_case: lower-casing setting for the tokenizer.
      max_seq_length: stored on the instance as `_max_seq_length`.
      log_data_warnings: stored on the instance as `_log_data_warnings`.
    """
    self.dstc8_data_dir = dstc8_data_dir
    self._log_data_warnings = log_data_warnings
    self._file_ranges = dict(
        train=train_file_range,
        dev=dev_file_range,
        test=test_file_range,
    )

    # BERT tokenizer
    tokenizer_options = {
        "vocab_file": vocab_file,
        "do_lower_case": do_lower_case,
    }
    self._tokenizer = tokenization.FullTokenizer(**tokenizer_options)
    self._max_seq_length = max_seq_length
def load_model(model_directory):
    """Loads the fine-tuned model from directory.

    Args:
        model_directory (str): PATH string of the fine-tuned model's
            directory on local disk

    Returns:
        A tuple `(model, tokenizer, max_seq_length, extractor)`, where
        `extractor` is a Keras model that maps the loaded model's inputs to
        the output of its "bert_layer" layer and `max_seq_length` is 256.
    """
    vocab_path = os.path.join(model_directory, "assets", "vocab.txt")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path,
        do_lower_case=True,
    )
    print(type(tokenizer))

    model = keras.models.load_model(model_directory)
    max_seq_length = 256

    bert_output = model.get_layer("bert_layer").output
    extractor = tf.keras.Model(inputs=model.inputs, outputs=[bert_output])
    model.summary()

    return model, tokenizer, max_seq_length, extractor