@classmethod
def _generate_movielens_examples(cls,
                                 data_dir,
                                 generated_examples_dir,
                                 train_filename,
                                 test_filename,
                                 vocab_filename,
                                 meta_filename,
                                 min_timeline_length=3,
                                 max_context_length=10):
  """Generates MovieLens examples and returns a dict containing the metadata."""
  train_file = os.path.join(generated_examples_dir, train_filename)
  test_file = os.path.join(generated_examples_dir, test_filename)
  meta_file = os.path.join(generated_examples_dir, meta_filename)
  # Generate the dataset and metadata only if they do not already exist.
  if not all(os.path.exists(f) for f in (train_file, test_file, meta_file)):
    stats = _gen.generate_datasets(
        data_dir,
        output_dir=generated_examples_dir,
        min_timeline_length=min_timeline_length,
        max_context_length=max_context_length,
        build_movie_vocab=True,
        train_filename=train_filename,
        test_filename=test_filename,
        vocab_filename=vocab_filename,
    )
    file_util.write_json_file(meta_file, stats)
  meta = file_util.load_json_file(meta_file)
  return meta
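The snippet treats file_util as a thin JSON layer: write_json_file persists the stats dict and load_json_file reads it back. A minimal sketch of those two helpers under that assumption (a hypothetical reconstruction built on Python's standard json module, not the library's actual implementation):

import json


def write_json_file(json_file, data):
  # Serialize `data` (e.g. the stats dict returned by generate_datasets)
  # to `json_file` as JSON.
  with open(json_file, 'w') as f:
    json.dump(data, f, indent=2)


def load_json_file(json_file):
  # Read the JSON file back into a Python object.
  with open(json_file, 'r') as f:
    return json.load(f)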
Example #2
    @classmethod
    def from_squad(cls,
                   filename,
                   model_spec,
                   is_training=True,
                   version_2_with_negative=False,
                   cache_dir=None):
        """Loads data in SQuAD format and preproecess text according to `model_spec`.

    Args:
      filename: Name of the file.
      model_spec: Specification for the model.
      is_training: Whether the loaded data is for training or not.
      version_2_with_negative: Whether it's SQuAD 2.0 format.
      cache_dir: The cache directory to save preprocessed data. If None,
        generates a temporary directory to cache preprocessed data.

    Returns:
      QuestionAnswerDataLoader object.
    """
        model_spec = ms.get(model_spec)
        file_base_name = os.path.basename(filename)
        is_cached, tfrecord_file, meta_data_file, _ = _get_cache_info(
            cache_dir, file_base_name, model_spec, is_training)
        # If cached, loads the data directly from the cache directory.
        if is_cached and is_training:
            dataset, meta_data = _load(tfrecord_file, meta_data_file,
                                       model_spec, is_training)
            return QuestionAnswerDataLoader(
                dataset=dataset,
                size=meta_data['size'],
                version_2_with_negative=meta_data['version_2_with_negative'],
                examples=[],
                features=[],
                squad_file=filename)

        meta_data, examples, features = cls._generate_tf_record_from_squad_file(
            filename, model_spec, tfrecord_file, is_training,
            version_2_with_negative)

        file_util.write_json_file(meta_data_file, meta_data)

        dataset, meta_data = _load(tfrecord_file, meta_data_file, model_spec,
                                   is_training)
        return QuestionAnswerDataLoader(dataset, meta_data['size'],
                                        meta_data['version_2_with_negative'],
                                        examples, features, filename)
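For context, this loader matches the from_squad classmethod that TFLite Model Maker exposes on question_answer.DataLoader. A usage sketch along the lines of the official tutorial; the file paths are placeholders, not files this snippet ships:

from tflite_model_maker import model_spec
from tflite_model_maker import question_answer

# Model specification for a MobileBERT fine-tuned on SQuAD.
spec = model_spec.get('mobilebert_qa_squad')

# Placeholder paths to SQuAD-format JSON files.
train_data = question_answer.DataLoader.from_squad(
    'train-v1.1.json', spec, is_training=True)
validation_data = question_answer.DataLoader.from_squad(
    'dev-v1.1.json', spec, is_training=False)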
Example #3
    @classmethod
    def _save_data(cls, examples, model_spec, label_names, tfrecord_file,
                   meta_data_file, vocab_file, is_training):
        """Saves preprocessed data and other assets into files."""
        # If needed, generates the vocabulary from the examples and saves it
        # in vocab_file.
        if model_spec.need_gen_vocab and is_training:
            model_spec.gen_vocab(examples)
            model_spec.save_vocab(vocab_file)

        # Converts examples into preprocessed features and saves in tfrecord_file.
        model_spec.convert_examples_to_features(examples, tfrecord_file,
                                                label_names)

        # Generates and saves meta data in meta_data_file.
        meta_data = {
            'size': len(examples),
            'num_classes': len(label_names),
            'index_to_label': label_names
        }
        file_util.write_json_file(meta_data_file, meta_data)
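The meta_data dict written here is what loader-side code reads back to size the dataset and map predicted class indices to label strings. A round-trip sketch using only the standard json module; the label names and counts below are illustrative, not values from the source:

import json

# Illustrative values; in _save_data these come from the real `examples`
# and `label_names` arguments.
meta_data = {
    'size': 1500,
    'num_classes': 3,
    'index_to_label': ['negative', 'neutral', 'positive'],
}
with open('meta_data.json', 'w') as f:
  json.dump(meta_data, f)

with open('meta_data.json') as f:
  restored = json.load(f)
# Map a predicted class index back to its human-readable label.
assert restored['index_to_label'][2] == 'positive'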
    @classmethod
    def generate_movielens_dataset(
        cls,
        data_dir,
        generated_examples_dir=None,
        train_filename='train_movielens_1m.tfrecord',
        test_filename='test_movielens_1m.tfrecord',
        vocab_filename='movie_vocab.json',
        meta_filename='meta.json',
        min_timeline_length=3,
        max_context_length=10,
        max_context_movie_genre_length=10,
        min_rating=None,
        train_data_fraction=0.9,
        build_vocabs=True,
    ):
        """Generate movielens dataset, and returns a dict contains meta.

    Args:
      data_dir: str, path to dataset containing (unzipped) text data.
      generated_examples_dir: str, path to generate preprocessed examples.
        (default: same as data_dir)
      train_filename: str, generated file name for training data.
      test_filename: str, generated file name for test data.
      vocab_filename: str, generated file name for vocab data.
      meta_filename: str, generated file name for meta data.
      min_timeline_length: int, min timeline length to split train/eval set.
      max_context_length: int, max context length as one input.
      max_context_movie_genre_length: int, max context length of movie genre as
        one input.
      min_rating: int or None, include examples with min rating.
      train_data_fraction: float, percentage of training data [0.0, 1.0].
      build_vocabs: boolean, whether to build vocabs.

    Returns:
      Dict, metadata for the movielens dataset. Containing keys:
        `train_file`, `train_size`, `test_file`, `test_size`, vocab_file`,
        `vocab_size`, etc.
    """
        if not generated_examples_dir:
            # By default, set generated examples dir to data_dir
            generated_examples_dir = data_dir
        train_file = os.path.join(generated_examples_dir, train_filename)
        test_file = os.path.join(generated_examples_dir, test_filename)
        meta_file = os.path.join(generated_examples_dir, meta_filename)
        # Generate the dataset and metadata only if they do not already exist.
        if not all(
            os.path.exists(f) for f in (train_file, test_file, meta_file)):
            stats = _gen.generate_datasets(
                data_dir,
                output_dir=generated_examples_dir,
                min_timeline_length=min_timeline_length,
                max_context_length=max_context_length,
                max_context_movie_genre_length=max_context_movie_genre_length,
                min_rating=min_rating,
                build_vocabs=build_vocabs,
                train_data_fraction=train_data_fraction,
                train_filename=train_filename,
                test_filename=test_filename,
                vocab_filename=vocab_filename,
            )
            file_util.write_json_file(meta_file, stats)
        meta = file_util.load_json_file(meta_file)
        return meta
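A hedged usage sketch, assuming this classmethod is the one exposed on tflite_model_maker.recommendation.DataLoader (an assumption based on the surrounding names) and that 'ml-1m' is a placeholder path to the unzipped MovieLens 1M files:

from tflite_model_maker import recommendation

# 'ml-1m' is a placeholder path to the unzipped MovieLens 1M data
# (ratings.dat, movies.dat, users.dat).
meta = recommendation.DataLoader.generate_movielens_dataset(
    'ml-1m',
    min_rating=2,             # keep only examples rated at least 2
    train_data_fraction=0.9,  # 90% train / 10% test split
)
# The returned metadata carries the generated file paths and sizes.
print(meta['train_file'], meta['train_size'])
print(meta['vocab_file'], meta['vocab_size'])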