def test_ids2str(self):
  encoder = public_parsing_ops.create_text_encoder("sentencepiece", _SPM_VOCAB)
  text = "the quick brown fox jumps over the lazy dog"
  ids = np.array([
      367, 1910, 3619, 1660, 8068, 664, 604, 1154, 684, 367, 648, 8090, 8047,
      3576, 1, 0, 0, 0
  ])
  decode_text = text_eval.ids2str(encoder, ids, None)
  self.assertEqual(text, decode_text)
  decode_text = text_eval.ids2str(encoder, ids, 100)
  self.assertEqual(text, decode_text)
  ids = np.array([
      367, 1910, 3619, 4, 1660, 8068, 664, 604, 1154, 684, 96, 367, 648, 8090,
      8047, 3576, 25, 1, 0, 0, 0
  ])
  decode_text = text_eval.ids2str(encoder, ids, 100)
  self.assertEqual(
      "the quick brown <4> fox jumps over <96> the lazy dog <25> ",
      decode_text)
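# The test above exercises two behaviors of text_eval.ids2str: ids are
# truncated at the first EOS id (1), and, when a num_reserved bound is given,
# reserved ids below that bound are rendered as "<id>" placeholders instead of
# being decoded. A minimal sketch of that behavior, for illustration only:
# `sketch_ids2str` is a hypothetical name, and the exact spacing produced by
# the real implementation may differ.
def sketch_ids2str(encoder, ids, num_reserved):
  ids = list(ids)
  if 1 in ids:
    ids = ids[:ids.index(1)]  # drop EOS and any trailing padding
  if num_reserved is None:
    return encoder.decode(ids)
  pieces, chunk = [], []
  for i in ids:
    if 2 <= i < num_reserved:  # reserved id: flush pending text, emit "<id>"
      if chunk:
        pieces.append(encoder.decode(chunk))
        chunk = []
      pieces.append("<%d>" % i)
    else:
      chunk.append(i)
  if chunk:
    pieces.append(encoder.decode(chunk))
  return " ".join(pieces)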
def run(self):
  checkpoint_path = tf.train.latest_checkpoint(self.model_dir)
  params = registry.get_params(self.params_transformer)(self.param_overrides)
  parser, shapes = params.parser(mode=tf.estimator.ModeKeys.PREDICT)
  estimator = estimator_utils.create_estimator(self.master, self.model_dir,
                                               self.use_tpu,
                                               self.iterations_per_loop,
                                               self.num_shards, params)
  encoder = public_parsing_ops.create_text_encoder(self.encoder_type,
                                                   self.vocab_filename)

  def input_function(params):
    # In-memory examples, used only by the commented-out alternative below.
    input_text1 = "hello this is a first text"
    target1 = "first text"
    input_text2 = ("Eighteen sailors were injured after an explosion and fire "
                   "on board a ship at the US Naval Base in San Diego, US "
                   "Navy officials said. The sailors on the USS Bonhomme "
                   "Richard had 'minor injuries' from the fire and were taken "
                   "to a hospital, Lt. Cmdr. Patricia Kreuzberger told CNN.")
    target2 = ("18 sailors injured after an explosion and fire on a naval "
               "ship in San Diego")
    read_dictionary_data = np.load(
        self.test_dict_dataset_path, allow_pickle=True).item()
    # Alternative: build the dataset directly from the in-memory examples.
    # dataset = tf.data.Dataset.from_tensor_slices({
    #     "inputs": [input_text1, input_text2],
    #     "targets": [target1, target2]
    # }).map(parser)
    dataset = tf.data.Dataset.from_tensor_slices(
        read_dictionary_data).map(parser)
    dataset = dataset.unbatch()
    dataset = dataset.padded_batch(
        params["batch_size"], padded_shapes=shapes, drop_remainder=True)
    return dataset

  predictions = estimator.predict(
      input_fn=input_function, checkpoint_path=checkpoint_path)
  for i in predictions:
    print("=" * 135)  # separator between examples
    print("inputs: " + text_eval.ids2str(encoder, i["inputs"], None))
    print("targets: " + text_eval.ids2str(encoder, i["targets"], None))
    print("outputs: " + text_eval.ids2str(encoder, i["outputs"], None))
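# A minimal sketch of how the dictionary dataset loaded above could be written
# to disk. The layout (a pickled dict of parallel "inputs"/"targets" string
# lists, saved with np.save and read back via np.load(...).item()) is inferred
# from run(); the path below is hypothetical.
import numpy as np

examples = {
    "inputs": ["hello this is a first text", "another source document"],
    "targets": ["first text", "another summary"],
}
np.save("/tmp/test_dict_dataset.npy", examples)
# Later: np.load("/tmp/test_dict_dataset.npy", allow_pickle=True).item()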
def transformer_params(patterns, param_overrides):
  """Params for TransformerEncoderDecoderModel.

  Args:
    patterns: a dict including train_pattern, dev_pattern, and test_pattern.
    param_overrides: a string, a comma-separated list of name=value pairs.

  Returns:
    An instance of HParams.
  """
  hparams = contrib_training.HParams(
      train_pattern=patterns["train_pattern"],
      dev_pattern=patterns["dev_pattern"],
      test_pattern=patterns["test_pattern"],
      vocab_filename="pegasus/ops/testdata/sp_test.model",
      encoder_type="sentencepiece_newline",
      length_bucket_size=0,
      add_task_id=False,
      batch_size=patterns["batch_size"],
      max_input_len=patterns["max_input_len"],
      max_target_len=patterns["max_output_len"],
      max_decode_len=patterns["max_output_len"],
      hidden_size=1024,
      filter_size=4096,
      num_heads=16,
      num_encoder_layers=16,
      num_decoder_layers=16,
      beam_size=1,
      beam_start=5,
      beam_alpha=0.8,
      beam_min=0,
      beam_max=-1,
      temperature=0.0,
      top_k=0,
      top_p=0.0,
      optimizer_name="adafactor",
      train_steps=patterns["train_steps"],
      learning_rate=patterns["learning_rate"],
      label_smoothing=0.1,
      dropout=0.1,
      eval_max_predictions=patterns.get("eval_steps", 1000),
      use_bfloat16=False,
      model=None,
      parser=None,
      encoder=None,
      estimator_prediction_fn=None,
      eval=None,
      estimator_eval_metrics_fn=estimator_metrics.gen_eval_metrics_fn,
  )
  if param_overrides:
    hparams.parse(param_overrides)
  hparams.parser = functools.partial(
      parsers.supervised_strings_parser,
      hparams.vocab_filename,
      hparams.encoder_type,
      hparams.max_input_len,
      hparams.max_target_len,
      length_bucket_size=hparams.length_bucket_size,
      length_bucket_start_id=pegasus_params.LENGTH_BUCKET_START_ID,
      length_bucket_max_id=pegasus_params.TASK_START_ID - 1,
      add_task_id=hparams.add_task_id,
      task_start_id=pegasus_params.TASK_START_ID)
  hparams.encoder = public_parsing_ops.create_text_encoder(
      hparams.encoder_type, hparams.vocab_filename)
  hparams.model = functools.partial(transformer.TransformerEncoderDecoderModel,
                                    hparams.encoder.vocab_size,
                                    hparams.hidden_size, hparams.filter_size,
                                    hparams.num_heads,
                                    hparams.num_encoder_layers,
                                    hparams.num_decoder_layers,
                                    hparams.label_smoothing, hparams.dropout)
  beam_keys = ("beam_start", "beam_alpha", "beam_min", "beam_max",
               "temperature", "top_k", "top_p")
  beam_kwargs = {k: hparams.get(k) for k in beam_keys if k in hparams.values()}

  def decode_fn(features):
    return hparams.model().predict(features, hparams.max_decode_len,
                                   hparams.beam_size, **beam_kwargs)

  hparams.estimator_prediction_fn = decode_fn
  hparams.eval = functools.partial(
      text_eval.text_eval,
      hparams.encoder,
      num_reserved=pegasus_params.NUM_RESERVED_TOKENS)
  return hparams
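# A hedged usage sketch for transformer_params. The pattern values below are
# placeholders (not real dataset paths); the override string illustrates the
# comma-separated name=value format consumed by hparams.parse().
patterns = {
    "train_pattern": "tfds:dataset-train",  # hypothetical
    "dev_pattern": "tfds:dataset-validation",
    "test_pattern": "tfds:dataset-test",
    "batch_size": 8,
    "max_input_len": 512,
    "max_output_len": 128,
    "train_steps": 10000,
    "learning_rate": 0.01,
}
hparams = transformer_params(patterns, "beam_size=4,beam_alpha=0.6")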
def pegasus_large_params(param_overrides):
  """Params for PegasusLarge."""
  hparams = contrib_training.HParams(
      train_pattern="tfds_transformed:common_crawl-train",
      dev_pattern="tfds_transformed:common_crawl-validation",
      test_pattern="tfds_transformed:common_crawl-test",
      vocab_filename="pegasus/ops/testdata/sp_test.model",
      encoder_type="sentencepiece_newline",
      parser_strategy="dynamic_rouge",
      parser_masked_sentence_ratio=0.45,
      parser_masked_words_ratio=0.0,
      # Configure the options of word masking.
      # The sum of the three probs below (mask word by MSK, random, or intact)
      # should be 1.
      # The defaults follow the word masking procedure of BERT: 80% by <MSK>,
      # 10% by random tokens, 10% left unchanged.
      parser_mask_word_by_msk_token_prob=0.8,
      parser_mask_word_by_random_token_prob=0.1,
      parser_mask_word_by_intact_prob=0.1,
      # Configure the options of sentence masking.
      # The sum of the four probs below (mask sentence by MSK, random, intact,
      # or remove) should be 1.
      # The four sentence masking options:
      #   1. Mask selected sentences by <MSK>. In practice, the <MSK> token
      #      for sentences differs from the <MSK> token for words, in order to
      #      distinguish sentence masking from word masking.
      #   2. Mask selected sentences with other sentences randomly picked from
      #      the same document.
      #   3. Leave selected sentences unchanged.
      #   4. Remove selected sentences from the inputs.
      parser_mask_sentence_by_msk_token_prob=0.9,
      parser_mask_sentence_by_random_sentence_prob=0.,
      parser_mask_sentence_by_intact_prob=0.1,
      parser_mask_sentence_by_remove_prob=0.,
      # rouge_ngrams_size: a positive integer.
      parser_rouge_ngrams_size=1,
      # rouge_metric_type: precision, recall, F.
      parser_rouge_metric_type="F",
      # rouge_compute_option: standard, deduplicate, log.
      #   standard: each ngram is counted as many times as it appears.
      #   deduplicate: each ngram is counted once only.
      #   log: apply log(1+n) when computing the count of each ngram.
      parser_rouge_compute_option="standard",
      parser_rouge_stopwords_filename="pegasus/ops/testdata/english_stopwords",
      parser_rouge_noise_ratio=0.20,
      parser_dynamic_mask_min_ratio=0.33,
      # If greater than zero, targets are assigned to buckets by
      # length // bucket_size, and the bucket id is prepended to the inputs.
      # Bucket ids use the reserved token ids, starting from the start id and
      # going up to the maximum number of reserved tokens.
      length_bucket_size=0,
      add_task_id=False,
      batch_size=16,
      max_input_len=512,
      max_target_len=256,
      max_decode_len=256,
      max_total_words=0,
      pretrain_target_filter_min=0,
      hidden_size=1024,
      filter_size=4096,
      num_heads=16,
      num_encoder_layers=16,
      num_decoder_layers=16,
      optimizer_name="adafactor",
      learning_rate=0.01,
      label_smoothing=0.0,
      dropout=0.1,
      train_steps=1500000,
      beam_size=1,
      eval_max_predictions=1000,
      use_bfloat16=False,
      model=None,
      encoder=None,
      parser=None,
      estimator_prediction_fn=None,
      eval=None,
      estimator_eval_metrics_fn=estimator_metrics.pretrain_eval_metrics_fn,
  )
  if param_overrides:
    hparams.parse(param_overrides)
  # Check values.
  if (hparams.parser_mask_word_by_msk_token_prob +
      hparams.parser_mask_word_by_random_token_prob +
      hparams.parser_mask_word_by_intact_prob) != 1.:
    raise ValueError("The probabilities of the three word masking options "
                     "(MSK, random, intact) do not sum to 1.")
  if (hparams.parser_mask_sentence_by_msk_token_prob +
      hparams.parser_mask_sentence_by_random_sentence_prob +
      hparams.parser_mask_sentence_by_intact_prob +
      hparams.parser_mask_sentence_by_remove_prob) != 1.:
    raise ValueError("The probabilities of the four sentence masking options "
                     "(MSK, random, intact, remove) do not sum to 1.")
  hparams.encoder = public_parsing_ops.create_text_encoder(
      hparams.encoder_type, hparams.vocab_filename)
  hparams.parser = functools.partial(
      parsers.string_features_for_pretraining_parser,
      hparams.vocab_filename,
      hparams.encoder_type,
      hparams.max_input_len,
      hparams.max_target_len,
      hparams.max_total_words,
      hparams.parser_strategy,
      hparams.parser_masked_sentence_ratio,
      hparams.parser_masked_words_ratio, [
          hparams.parser_mask_word_by_msk_token_prob,
          hparams.parser_mask_word_by_random_token_prob,
          hparams.parser_mask_word_by_intact_prob
      ], [
          hparams.parser_mask_sentence_by_msk_token_prob,
          hparams.parser_mask_sentence_by_random_sentence_prob,
          hparams.parser_mask_sentence_by_intact_prob,
          hparams.parser_mask_sentence_by_remove_prob
      ],
      hparams.parser_rouge_ngrams_size,
      hparams.parser_rouge_metric_type,
      hparams.parser_rouge_compute_option,
      hparams.parser_rouge_stopwords_filename,
      NUM_RESERVED_TOKENS,
      parser_rouge_noise_ratio=hparams.parser_rouge_noise_ratio,
      parser_dynamic_mask_min_ratio=hparams.parser_dynamic_mask_min_ratio,
      input_feature="inputs",
      pretrain_target_filter_min=hparams.pretrain_target_filter_min,
      length_bucket_size=hparams.length_bucket_size,
      length_bucket_start_id=LENGTH_BUCKET_START_ID,
      length_bucket_max_id=TASK_START_ID - 1,
      add_task_id=hparams.add_task_id,
      task_start_id=TASK_START_ID)
  hparams.model = functools.partial(transformer.TransformerEncoderDecoderModel,
                                    hparams.encoder.vocab_size,
                                    hparams.hidden_size, hparams.filter_size,
                                    hparams.num_heads,
                                    hparams.num_encoder_layers,
                                    hparams.num_decoder_layers,
                                    hparams.label_smoothing, hparams.dropout)

  def decode_fn(features):
    return hparams.model().predict(features, hparams.max_decode_len,
                                   hparams.beam_size)

  hparams.estimator_prediction_fn = decode_fn
  hparams.eval = functools.partial(
      text_eval.text_eval, hparams.encoder, num_reserved=NUM_RESERVED_TOKENS)
  return hparams
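# A brief sketch of tweaking the PegasusLarge defaults through the same
# override mechanism; the values are illustrative, and any override must keep
# the word and sentence masking probabilities summing to 1, or the checks
# above raise ValueError.
hparams = pegasus_large_params(
    "max_input_len=1024,batch_size=8,"
    "parser_mask_sentence_by_msk_token_prob=0.8,"
    "parser_mask_sentence_by_intact_prob=0.2")
assert hparams.max_input_len == 1024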
def test_py_decode(self, encoder_type):
  text = "the quick brown fox jumps \n over the lazy dog."
  e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
  e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
  ids = e1.encode(text)
  self.assertEqual(e1.decode(ids), e2.decode(ids))
def test_py_encode(self, encoder_type):
  text = "the quick brown fox\n jumps over the lazy dog.\n"
  e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
  e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
  self.assertEqual(e1.encode(text), e2.encode(text))
def test_vocab(self, encoder_type):
  e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
  e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
  self.assertEqual(e1.vocab_size, e2.vocab_size)
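# The three tests above check that the internal encoder (text_encoder_utils)
# and the public one (public_parsing_ops) agree on vocab size, encoding, and
# decoding when built from the same SentencePiece model. A round-trip sketch
# under the assumption that the test vocab encodes this sentence losslessly:
encoder = public_parsing_ops.create_text_encoder("sentencepiece", _SPM_VOCAB)
ids = encoder.encode("the quick brown fox")
assert encoder.decode(ids) == "the quick brown fox"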