def _save_raw_predictions(checkpoint: str,
                          raw_predictions: Sequence[Mapping[str, np.ndarray]],
                          use_wordpiece: bool) -> None:
  """Writes raw predictions to `<checkpoint>.predicted-tfrecords` as tf.Examples.

  Args:
    checkpoint: Checkpoint path; used as the prefix of the output file name.
    raw_predictions: One mapping per example, holding the arrays to serialize.
    use_wordpiece: If True, additionally serializes the
      "long_tokens_to_unigrams" alignment feature.
  """
  # Feature names grouped by the tf.train feature type they serialize to.
  bytes_keys = ("unique_ids", "type", "level")
  int_keys = (
      "long_token_ids", "long_sentence_ids", "long_token_type_ids",
      "global_token_ids", "global_sentence_ids", "global_paragraph_ids",
      "answer_begin_top_indices", "answer_end_top_indices", "answer_types",
  )
  float_keys = (
      "supporting_facts_probs", "answer_begin_top_probs",
      "answer_end_top_probs",
  )
  output_file = f"{checkpoint}.predicted-tfrecords"
  with tf.python_io.TFRecordWriter(output_file) as writer:
    for prediction in raw_predictions:
      feature_map = collections.OrderedDict()
      for key in bytes_keys:
        # Bytes features are wrapped in a singleton list.
        feature_map[key] = input_utils.create_bytes_feature([prediction[key]])
      for key in int_keys:
        feature_map[key] = input_utils.create_int_feature(prediction[key])
      for key in float_keys:
        feature_map[key] = input_utils.create_float_feature(prediction[key])
      if use_wordpiece:
        feature_map["long_tokens_to_unigrams"] = input_utils.create_int_feature(
            prediction["long_tokens_to_unigrams"])
      example = tf.train.Example(
          features=tf.train.Features(feature=feature_map))
      writer.write(example.SerializeToString())
def to_tf_example(self) -> tf.train.Example:
  """Serializes this feature object into a `tf.train.Example`.

  Every attr field becomes an int feature, except the coordinate/size
  fields listed below, which become float features.
  """
  # The only float-valued fields; everything else is integer-valued.
  float_features = {
      'global_x_coords', 'global_y_coords', 'global_widths',
      'global_heights', 'global_parent_x_coords', 'global_parent_y_coords',
      'global_parent_widths', 'global_parent_heights'
  }
  field_values = attr.asdict(self)
  # Sanity check: each declared float feature must be a real attr field.
  assert all(name in field_values for name in float_features)
  feature_map = collections.OrderedDict()
  for name in attr.fields_dict(OpenKpEtcFeatures):
    value = getattr(self, name)
    if name in float_features:
      feature_map[name] = input_utils.create_float_feature(value)
    else:
      feature_map[name] = input_utils.create_int_feature(value)
  return tf.train.Example(features=tf.train.Features(feature=feature_map))
def _to_tf_example(self, features: HotPotQAInputFeatures,
                   example: HotpotQAExample) -> tf.train.Example:
  """Converts a HotPotQAInputFeatures to a tf.Example."""
  feature_map = collections.OrderedDict()
  # Example-level metadata, serialized as bytes features.
  feature_map["unique_ids"] = input_utils.create_bytes_feature([example.id])
  feature_map["type"] = input_utils.create_bytes_feature([example.type])
  feature_map["level"] = input_utils.create_bytes_feature([example.level])
  # Long-input and global-input id features, all int-valued.
  for name, values in (
      ("long_token_ids", features.long_token_ids),
      ("long_sentence_ids", features.long_sentence_ids),
      ("long_paragraph_ids", features.long_paragraph_ids),
      ("long_paragraph_breakpoints", features.long_paragraph_breakpoints),
      ("long_token_type_ids", features.long_token_type_ids),
      ("global_token_ids", features.global_token_ids),
      ("global_paragraph_breakpoints", features.global_paragraph_breakpoints),
      ("global_token_type_ids", features.global_token_type_ids),
  ):
    feature_map[name] = input_utils.create_int_feature(values)
  if self._config.is_training:
    # Training labels.
    feature_map["supporting_facts"] = input_utils.create_int_feature(
        features.supporting_facts)
    feature_map["answer_types"] = input_utils.create_int_feature(
        [features.answer_type])
    if self._config.answer_encoding_method == "span":
      feature_map["answer_begins"] = input_utils.create_int_feature(
          [features.answer_span[0]])
      feature_map["answer_ends"] = input_utils.create_int_feature(
          [features.answer_span[1]])
    else:
      feature_map["answer_bio_ids"] = input_utils.create_int_feature(
          features.answer_bio_ids)
  if not self._config.is_training or self._config.debug:
    # Extra alignment features emitted only for eval / debugging.
    if self._use_wordpiece:
      feature_map["long_tokens_to_unigrams"] = input_utils.create_int_feature(
          features.long_tokens_to_unigrams)
    feature_map["global_paragraph_ids"] = input_utils.create_int_feature(
        features.global_paragraph_ids)
    feature_map["global_sentence_ids"] = input_utils.create_int_feature(
        features.global_sentence_ids)
  if self._config.debug:
    feature_map["long_tokens"] = input_utils.create_bytes_feature(
        features.long_tokens)
    feature_map["global_tokens"] = input_utils.create_bytes_feature(
        features.global_tokens)
  return tf.train.Example(features=tf.train.Features(feature=feature_map))
def _process_prediction(
    prediction: Mapping[Text, np.ndarray],
    text_examples: Mapping[Text, eval_utils.OpenKpTextExample],
    writer_tfrecord,
    writer_jsonl,
    metrics: Optional[Mapping[int, _MetricAverager]] = None) -> None:
  """Processes a single TF `Estimator.predict` prediction.

  Assumes `Estimator.predict` was called with `yield_single_examples=True`.

  Args:
    prediction: Prediction from `Estimator.predict` for a single example.
    text_examples: `OpenKpTextExample` objects keyed by URL, used to turn the
      ngram logits in `prediction` into KeyPhrase predictions.
    writer_tfrecord: An open `tf.python_io.TFRecordWriter` to write to.
    writer_jsonl: An open text file writer for JSON Lines output.
    metrics: Optional `_MetricAverager`s to update with this prediction. If
      None, metric calculation is skipped entirely — appropriate when running
      inference over unlabeled examples.
  """
  # Shape: [kp_max_length, long_max_length].
  ngram_logits = prediction['ngram_logits']
  feature_map = collections.OrderedDict()
  feature_map['ngram_logits'] = input_utils.create_float_feature(
      ngram_logits.flatten())
  position_predictions = eval_utils.logits_to_predictions(
      ngram_logits, max_predictions=FLAGS.max_position_predictions)
  # Highest-confidence predictions first, for convenience downstream.
  position_predictions.sort(key=lambda pred: pred.logit, reverse=True)
  feature_map['top_pos_logit'] = input_utils.create_float_feature(
      [pred.logit for pred in position_predictions])
  feature_map['top_pos_start_idx'] = input_utils.create_int_feature(
      [pred.start_idx for pred in position_predictions])
  feature_map['top_pos_phrase_len'] = input_utils.create_int_feature(
      [pred.phrase_len for pred in position_predictions])
  # Code point -1 is padding; the remaining code points spell out the URL.
  url = ''.join(
      chr(code) for code in prediction['url_code_points'] if code != -1)
  feature_map['url'] = input_utils.create_bytes_feature([url])
  text_example = text_examples.get(url)
  if text_example is None:
    tf.logging.error(f'No text example found for URL: {url}')
  else:
    kp_predictions = text_example.get_key_phrase_predictions(
        position_predictions, max_predictions=FLAGS.max_kp_predictions)
    if len(kp_predictions) < FLAGS.max_kp_predictions:
      tf.logging.warn(
          f'Made fewer than `max_kp_predictions` for URL: {url}')
    writer_jsonl.write(
        json.dumps({
            'url': url,
            'KeyPhrases': [[kp] for kp in kp_predictions]
        }) + '\n')
    feature_map['kp_predictions'] = input_utils.create_bytes_feature(
        kp_predictions)
    if metrics is not None:
      precision, recall, f1 = text_example.get_score_full(kp_predictions)
      for k in (1, 3, 5):
        p = precision[k - 1]
        r = recall[k - 1]
        f = f1[k - 1]
        feature_map[f'p_at_{k}'] = input_utils.create_float_feature([p])
        feature_map[f'r_at_{k}'] = input_utils.create_float_feature([r])
        feature_map[f'f1_at_{k}'] = input_utils.create_float_feature([f])
        metrics[k].add_example(precision=p, recall=r, f1=f)
  writer_tfrecord.write(
      tf.train.Example(
          features=tf.train.Features(feature=feature_map)).SerializeToString())