Example #1
def load_featurizer(params):
  """Builds a Featurizer from a params dict."""
  tokenizer = featurization.Tokenizer(
      vocab_path=params["vocab_path"],
      do_lower_case=params["do_lower_case"])

  return featurization.Featurizer(
      query_seq_len=params["query_seq_len"],
      candidate_seq_len=params["candidate_seq_len"],
      num_candidates=params["num_candidates"],
      max_masks=params["max_masks"],
      tokenizer=tokenizer)
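
For context, a minimal sketch of how this helper might be invoked. The params dict below is illustrative only: the keys mirror the ones read above, but the path and lengths are assumed values, not taken from the original code, and the snippet relies on the featurization module already being importable.

# Illustrative params dict; keys match those read by load_featurizer above,
# values are assumptions for this sketch.
params = {
    "vocab_path": "/path/to/vocab.txt",
    "do_lower_case": True,
    "query_seq_len": 64,
    "candidate_seq_len": 288,
    "num_candidates": 8,
    "max_masks": 10,
}
featurizer = load_featurizer(params)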
Example #2
def load_featurizer():
  """Loads a Featurizer."""
  tokenizer = featurization.Tokenizer(
      vocab_path=FLAGS.vocab_path, do_lower_case=FLAGS.do_lower_case)

  featurizer = featurization.Featurizer(
      query_seq_len=FLAGS.query_seq_len,
      candidate_seq_len=FLAGS.candidate_seq_len,
      num_candidates=FLAGS.num_candidates,
      max_masks=FLAGS.max_masks,
      tokenizer=tokenizer)

  logging.info('Loaded featurizer.')
  return featurizer
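
This variant reads its hyperparameters from absl flags instead of a params dict. Below is a minimal sketch of the flag definitions the snippet assumes; the flag names follow the attributes read above, while the defaults and help strings are placeholders rather than values from the original project.

from absl import flags

FLAGS = flags.FLAGS

# Assumed flag definitions; defaults shown here are illustrative only.
flags.DEFINE_string("vocab_path", None, "Path to the tokenizer vocabulary.")
flags.DEFINE_bool("do_lower_case", True, "Lowercase text before tokenizing.")
flags.DEFINE_integer("query_seq_len", 64, "Maximum query sequence length.")
flags.DEFINE_integer("candidate_seq_len", 288, "Maximum candidate sequence length.")
flags.DEFINE_integer("num_candidates", 8, "Number of candidates per query.")
flags.DEFINE_integer("max_masks", 10, "Maximum number of masked tokens.")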
Example #3
        def input_fn(params):
            """Constructs the dataset fed to Estimator."""
            # We cannot access self._featurizer via closure, because this function is
            # passed to another device. Hence, we need to reconstruct the featurizer
            # from its hyperparameters (passed through `params`).

            tokenizer = featurization.Tokenizer(
                vocab_path=params['vocab_path'],
                do_lower_case=params['do_lower_case'])

            featurizer = featurization.Featurizer(
                query_seq_len=params['query_seq_len'],
                candidate_seq_len=params['candidate_seq_len'],
                num_candidates=params['num_candidates'],
                max_masks=params['max_masks'],
                tokenizer=tokenizer,
                separate_candidate_segments=params[
                    'separate_candidate_segments'])

            dataset = get_documents_dataset()

            def featurize(doc_dict):
                return featurizer.featurize_document_tf(
                    doc_dict['title_token_ids'], doc_dict['body_token_ids'])

            dataset = dataset.map(
                featurize, num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # Add a document index variable.
            dataset = dataset.enumerate()

            def _enumerate_to_dict(result_idx, tensor_dict):
                return dict(tensor_dict, result_idx=result_idx)

            dataset = dataset.map(
                _enumerate_to_dict,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # Pad the end of the dataset with one full extra batch.
            # This ensures that we don't drop the remainder.
            if total_docs % batch_size != 0:
                # Pad using the first value of the dataset, repeated batch_size times.
                pad_vals = dataset.take(1).repeat(batch_size)
                dataset = dataset.concatenate(pad_vals)

            # Batch the dataset.
            dataset = dataset.batch(batch_size, drop_remainder=True)
            dataset = dataset.prefetch(2)  # Prefetch for efficiency.
            return dataset
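
Because the dataset is padded with copies of its first element before batching, every real document survives drop_remainder=True, and the padded rows can be recognized downstream by their repeated result_idx. Here is a minimal sketch of a consumer that discards them, assuming predictions is an iterable of per-document dicts (for example, the output of Estimator.predict over this input_fn); the loop itself is hypothetical, not part of the original code.

# Hypothetical consumer: padded rows duplicate the first document's index,
# so keeping only the first occurrence of each result_idx drops the padding.
seen = set()
results = []
for prediction in predictions:
    idx = int(prediction["result_idx"])
    if idx in seen:
        continue  # padding copy of an already-seen document
    seen.add(idx)
    results.append(prediction)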
Example #4
def load_featurizer():
    """Loads a featurizer from hyperparams specified in model_dir."""
    params_path = os.path.join(FLAGS.model_dir, "estimator_params.json")
    with tf.gfile.GFile(params_path) as f:
        params = json.load(f)

    tokenizer = featurization.Tokenizer(vocab_path=params["vocab_path"],
                                        do_lower_case=params["do_lower_case"])

    featurizer = featurization.Featurizer(
        query_seq_len=params["query_seq_len"],
        candidate_seq_len=params["candidate_seq_len"],
        num_candidates=params["num_candidates"],
        max_masks=params["max_masks"],
        tokenizer=tokenizer)

    logging.info("Loaded featurizer.")
    return featurizer
def __init__(self, vocab_path, do_lower_case):
    """Builds the Tokenizer wrapped by this class."""
    self._tokenizer = featurization.Tokenizer(vocab_path=vocab_path,
                                              do_lower_case=do_lower_case)
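
Example #4 expects an estimator_params.json file alongside the checkpoints in model_dir. A minimal sketch of how such a file could be written at training time follows; save_estimator_params is a hypothetical helper, not part of the original code, and it reuses the same tf.gfile and json calls as that snippet.

import json
import os

import tensorflow as tf


def save_estimator_params(params, model_dir):
    """Hypothetical helper: writes featurizer hyperparameters next to the checkpoints."""
    params_path = os.path.join(model_dir, "estimator_params.json")
    with tf.gfile.GFile(params_path, "w") as f:
        json.dump(params, f, indent=2)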