Example #1
def get_data_files(data_sources):
    """Get list of data files from data_sources.

    Args:
      data_sources: A list or tuple of data file paths.

    Returns:
      A list of file paths.

    Raises:
      ValueError: If no data files are found.
    """
    # TODO(alanesuhr): Verify whether this is necessary for sharded TFRecord files.
    data_files = []
    for source in data_sources:
        if source.endswith('@*'):
            data_files += gfile.Glob(source[:-2] + '*')
        elif '@' in source:
            data_files += gfile.GenerateShardedFilenames(source)
        elif '*' in source or '?' in source or '[' in source:
            data_files += gfile.Glob(source)
        else:
            data_files.append(source)
    if not data_files:
        raise ValueError('No data files found in %s' % data_sources)
    return data_files
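
The path@N form names a file written in N shards, which gfile.GenerateShardedFilenames presumably expands into the individual shard paths. Below is a minimal sketch of that expansion, assuming the common name-00000-of-0000N shard naming convention (the same one Example #11 below rebuilds by hand) and using the standard-library glob in place of gfile.Glob:

# Illustrative only: stdlib glob stands in for gfile.Glob, and the shard
# naming convention below is an assumption, not taken from this repository.
import glob
import re


def expand_source(source):
    """Expands one data source into concrete file paths (sketch)."""
    shard_match = re.search(r'@(\d+)$', source)
    if source.endswith('@*'):
        return glob.glob(source[:-2] + '*')
    if shard_match:
        num_shards = int(shard_match.group(1))
        base = source[:shard_match.start()]
        return ['%s-%05d-of-%05d' % (base, i, num_shards)
                for i in range(num_shards)]
    if any(c in source for c in '*?['):
        return glob.glob(source)
    return [source]


print(expand_source('train.tfrecord@3'))
# ['train.tfrecord-00000-of-00003', 'train.tfrecord-00001-of-00003',
#  'train.tfrecord-00002-of-00003']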
Example #2
def get_existing_corners(segmentation_dir):
    corners = []
    # Legacy path format.
    for path in gfile.Glob(os.path.join(segmentation_dir, 'seg-*_*_*.npz')):
        corners.append(get_corner_from_path(path))
    for path in gfile.Glob(os.path.join(segmentation_dir,
                                        '*/*/seg-*_*_*.npz')):
        corners.append(get_corner_from_path(path))
    return corners
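
The two globs pick up the legacy flat layout (seg-*.npz directly under segmentation_dir) and a nested two-directory layout. A quick, self-contained check of which made-up paths each pattern matches, using fnmatch (note that fnmatch, unlike glob, lets '*' cross '/', which does not change the outcome here):

# Made-up paths, matched against the two patterns above with fnmatch.
import fnmatch

paths = ['seg-0_0_0.npz', '10/20/seg-10_20_0.npz']
print(fnmatch.filter(paths, 'seg-*_*_*.npz'))      # legacy flat layout
print(fnmatch.filter(paths, '*/*/seg-*_*_*.npz'))  # nested layout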
Example #3
def get_model_data(glob, existing):
    """Read all model meta filenames and extract per model metadata."""

    globbed = sorted(gfile.Glob(glob))

    skipped = 0
    model_data = []
    for model_path in tqdm(globbed):
        assert model_path.lower().endswith(".meta"), model_path
        model_run, model_num, model_name = parse_model_components(model_path)
        row_name = MODEL_PREFIX.format(run=model_run, num=model_name)

        if row_name in existing:
            skipped += 1
            continue

        metadata = (
            (MODEL_NAME, model_name),
            (b"model_num", model_num),
            (b"run", model_run),
            (b"parent", ""),
            (b"tag", ""),
            (b"tool", "cbt_models_backfill_to_cbt"),
            (b"trained_date", ""),
        )
        model_data.append((row_name, metadata))

    print("Read {} Models, {} new, {} existing".format(len(globbed),
                                                       len(model_data),
                                                       skipped))
    return model_data
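
parse_model_components and MODEL_PREFIX are defined elsewhere in this codebase; the sketch below is a hypothetical stand-in, purely to show the kind of run/number/name split the loop above assumes for paths like .../run/models/000123-somename.meta.

# Hypothetical stand-in for parse_model_components; the real helper may
# differ. Assumes paths of the form ".../<run>/models/000123-somename.meta".
import os
import re


def parse_model_components_sketch(model_path):
    run = model_path.split('/')[-3]           # e.g. "v17-19x19"
    basename = os.path.basename(model_path)   # "000123-somename.meta"
    num = int(re.match(r'(\d+)-', basename).group(1))
    name = basename[:-len('.meta')]
    return run, num, name


print(parse_model_components_sketch(
    'gs://bucket/v17-19x19/models/000123-somename.meta'))
# ('v17-19x19', 123, '000123-somename')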
Example #4
def read_games(glob, existing_paths):
    """Read all SGFs that match glob

    Parse each game and extract relevant metadata for eval games table.
    """

    globbed = sorted(gfile.Glob(glob))

    skipped = 0
    to_parse = []
    for sgf_name in tqdm(globbed):
        assert sgf_name.lower().endswith('.sgf'), sgf_name
        sgf_path = canonical_name(sgf_name)
        sgf_filename = os.path.basename(sgf_path)

        if sgf_path in existing_paths or sgf_filename in existing_paths:
            skipped += 1
            continue

        to_parse.append(sgf_name)

    game_data = []
    with multiprocessing.Pool() as pool:
        game_data = pool.map(bigtable_output.process_game, tqdm(to_parse), 100)

    print("Read {} SGFs, {} new, {} existing".format(len(globbed),
                                                     len(game_data), skipped))
    return game_data
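
The third argument to pool.map is the chunksize: 100 paths are handed to each worker per task, which reduces inter-process overhead when the per-game work is small compared to the number of files. A self-contained illustration of the same pattern with a made-up worker function:

# The worker function here is a stand-in for bigtable_output.process_game.
import multiprocessing


def parse_one(path):
    return {'path': path, 'length': len(path)}


if __name__ == '__main__':
    paths = ['game-%04d.sgf' % i for i in range(1000)]
    with multiprocessing.Pool() as pool:
        # chunksize=100: each task sent to a worker covers 100 paths.
        results = pool.map(parse_one, paths, 100)
    print(len(results))  # 1000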
Example #5
def main(unused_argv):

    metadata = pd.read_csv(gfile.Open(FLAGS.metadata_file), sep='\t')

    # Read DeepMass:Prism outputs and merge with metadata.
    outputs = []
    for filen in gfile.Glob(FLAGS.input_data_pattern):
        with gfile.Open(filen) as infile:
            if FLAGS.batch_prediction:
                out_df = pd.read_json(infile, lines=True)
            else:
                out_df = json.load(infile)
                out_df = pd.DataFrame(out_df['predictions'])
            out_df = out_df.merge(metadata,
                                  left_on='key',
                                  right_on='index',
                                  how='left')
            outputs.append(out_df)
    outputs = pd.concat(outputs)
    outputs = outputs.apply(reformat_outputs,
                            args=(int(FLAGS.label_dim), FLAGS.neutral_losses),
                            axis=1)

    # Read additional features.
    if FLAGS.add_feature_names is not None:
        outputs_drip = []
        for filen in gfile.Glob(FLAGS.add_input_data_pattern):
            with gfile.Open(filen) as infile:
                if FLAGS.batch_prediction:
                    out_df = pd.read_json(infile, lines=True)
                    out_df = pd.DataFrame(out_df['outputs'].tolist(),
                                          columns=FLAGS.add_feature_names,
                                          index=out_df['key'])
                else:
                    pass
                outputs_drip.append(out_df)
        outputs_drip = pd.concat(outputs_drip)
        outputs = outputs.merge(outputs_drip,
                                how='left',
                                left_on='key',
                                right_index=True)

    # Write to a file.
    with gfile.Open(os.path.join(FLAGS.output_data_dir, 'outputs.tsv'),
                    'w') as outf:
        outputs.to_csv(outf, sep='\t', index=False)
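
The merge joins each prediction row (keyed by 'key') to its metadata row (keyed by 'index'). A toy, self-contained version of that join with made-up columns:

# Toy stand-ins for the prediction output (JSON lines) and metadata table;
# the 'score' and 'peptide' columns are invented for illustration.
import io

import pandas as pd

predictions = pd.read_json(
    io.StringIO('{"key": 0, "score": 0.9}\n{"key": 1, "score": 0.4}\n'),
    lines=True)
metadata = pd.DataFrame({'index': [0, 1], 'peptide': ['AAK', 'GGR']})

merged = predictions.merge(metadata, left_on='key', right_on='index',
                           how='left')
print(merged[['key', 'score', 'peptide']])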
Example #6
def input_fn(input_pattern, mode, params, grammar):
    """Creates input features and labels tensor dicts.

    Args:
      input_pattern: String, input path.
      mode: tf.estimator.ModeKeys execution mode.
      params: HParams object containing model hyperparameters.
      grammar: arithmetic_grammar.Grammar.

    Returns:
      features: Dict containing input tensors.
      labels: Label tensor.
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        randomize = True
        num_epochs = None
    else:
        randomize = False
        num_epochs = 1

    filenames = gfile.Glob(input_pattern)
    num_files = len(filenames)
    filename_dataset = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(filenames))
    if randomize:
        filename_dataset = filename_dataset.shuffle(num_files)
    dataset = filename_dataset.interleave(
        tf.data.TFRecordDataset,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        cycle_length=num_files,
        block_length=1)
    if randomize:
        dataset = dataset.shuffle(params.shuffle_buffer_size
                                  or 1000 * params.batch_size)

    dataset = dataset.batch(params.batch_size)

    dataset = dataset.map(functools.partial(parse_examples_fn,
                                            params=params,
                                            grammar=grammar),
                          num_parallel_calls=params.num_parallel_calls)

    if params.cache_dataset:
        # Cache the expensive read and parsing from file system.
        dataset = dataset.cache()

    dataset = dataset.map(functools.partial(process_dataset_fn,
                                            params=params,
                                            grammar=grammar),
                          num_parallel_calls=params.num_parallel_calls)

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.prefetch(params.prefetch_buffer_size
                               or 1000 * params.batch_size)

    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
Example #7
def get_model_paths(model_dir):
    """Returns all model paths in the model_dir."""
    all_models = gfile.Glob(os.path.join(model_dir, '*.meta'))
    model_filenames = [os.path.basename(m) for m in all_models]
    model_numbers_names = [(shipname.detect_model_num(m),
                            shipname.detect_model_name(m))
                           for m in model_filenames]
    model_names = sorted(model_numbers_names)
    return [os.path.join(model_dir, name[1]) for name in model_names]
Example #8
def get_models():
    """Finds all models, returning a list of model number and names
    sorted increasing.

    Returns: [(13, 000013-modelname), (17, 000017-modelname), ...etc]
    """
    all_models = gfile.Glob(os.path.join(models_dir(), '*.meta'))
    model_filenames = [os.path.basename(m) for m in all_models]
    model_numbers_names = sorted([(shipname.detect_model_num(m),
                                   shipname.detect_model_name(m))
                                  for m in model_filenames])
    return model_numbers_names
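
shipname.detect_model_num and shipname.detect_model_name (also used in Example #7) come from the surrounding project; the stand-ins below are hypothetical, assuming the 000013-modelname filename form shown in the docstring:

# Hypothetical stand-ins for shipname.detect_model_num / detect_model_name,
# assuming filenames like "000013-modelname.meta".
import re


def detect_model_num_sketch(filename):
    match = re.match(r'(\d{6})', filename)
    return int(match.group(1)) if match else None


def detect_model_name_sketch(filename):
    match = re.match(r'(\d{6}-\w+)', filename)
    return match.group(1) if match else None


print(detect_model_num_sketch('000013-modelname.meta'))   # 13
print(detect_model_name_sketch('000013-modelname.meta'))  # 000013-modelname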
Example #9
  def test_train_mnist(self):
    # Create the random data and write it to the disk.
    test_subdirectory = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)

    # Create the model parameters.
    model_path = os.path.join(test_subdirectory, 'temp_model')

    with flagsaver.flagsaver(
        model_path=model_path,
        save_period=1,
        num_dense_units='4,4',
        epochs=1,
        learning_rate=0.1,
        dropout=0.0,
        batch_size=32):
      train_mnist.main(argv=())

    # Verify that the trained model was saved.
    self.assertTrue(gfile.Exists(os.path.join(model_path, 'test_accuracy.txt')))
    self.assertLen(gfile.Glob(os.path.join(model_path, 'weights_epoch*')), 1)
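
flagsaver.flagsaver (from absl.testing) temporarily overrides the listed flags and restores them when the context exits, so the test leaves the global FLAGS untouched. A minimal, self-contained illustration with a made-up flag:

# Minimal flagsaver demo; the 'epochs' flag is defined here only for the demo.
from absl import flags
from absl.testing import flagsaver

flags.DEFINE_integer('epochs', 10, 'Number of training epochs.')
FLAGS = flags.FLAGS
FLAGS.mark_as_parsed()

with flagsaver.flagsaver(epochs=1):
    print(FLAGS.epochs)  # 1
print(FLAGS.epochs)      # 10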
Example #10
def _create_dataset(self):
    if not hasattr(self, "dataset_"):
        files = gfile.Glob(dataset_file_pattern(self.dataset_name_))
        if not files:
            raise IOError("Unable to find training files. data_pattern='" +
                          dataset_file_pattern(self.dataset_name_) + "'.")
        # logging.info("Number of training files: %s.", str(len(files)))
        if len(files) > 1:
            # Read in multiple tfrecord files and interleave them in parallel
            files = tf.data.Dataset.from_tensor_slices(files)
            dataset = files.interleave(
                tf.data.TFRecordDataset,
                cycle_length=self.num_parallel_readers,
                num_parallel_calls=tf.data.experimental.AUTOTUNE,
            )
        else:
            # Only a single tfrecord was given
            dataset = tf.data.TFRecordDataset(
                files, num_parallel_reads=self.num_parallel_readers)
        self.dataset_ = dataset
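
A closely related idiom, shown here only as a hedged alternative and not as this project's code: tf.data.Dataset.list_files can do the globbing itself, and interleave handles the single-file case too. The sketch writes two tiny TFRecords so it runs on its own (TF 2.x eager mode assumed):

# Hedged alternative sketch (not this project's code): let tf.data glob.
import os
import tempfile

import tensorflow as tf

tmp_dir = tempfile.mkdtemp()
for i in range(2):
    with tf.io.TFRecordWriter(
            os.path.join(tmp_dir, 'shard-%d.tfrecord' % i)) as writer:
        writer.write(b'record-%d' % i)

files = tf.data.Dataset.list_files(
    os.path.join(tmp_dir, 'shard-*.tfrecord'), shuffle=False)
dataset = files.interleave(
    tf.data.TFRecordDataset,
    cycle_length=2,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
for record in dataset:
    print(record.numpy())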
Example #11
def create_filename_queue(coordinates_file_pattern, shuffle=True):
    """Creates a queue for reading coordinates from coordinate file.

    Args:
      coordinates_file_pattern: File pattern for TFRecords of input examples,
                                either a glob pattern or path@shards.
      shuffle: Whether to shuffle the coordinate file list. Note that the
               expanded coordinates_file_pattern is not guaranteed to be
               sorted alphabetically.

    Returns:
      A TensorFlow queue of coordinate filenames.
    """
    m = re.search(r'@(\d{1,})', coordinates_file_pattern)
    if m:
        num_shards = int(m.group(1))
        coord_file_list = [
            re.sub(r'@(\d{1,})', '-%.5d-of-%.5d' % (i, num_shards),
                   coordinates_file_pattern) for i in range(num_shards)
        ]
    else:
        coord_file_list = gfile.Glob(coordinates_file_pattern)
    return tf.train.string_input_producer(coord_file_list, shuffle=shuffle)
Example #12
def get_pbs():
    all_pbs = gfile.Glob(os.path.join(models_dir(), '*.pb'))
    return all_pbs
Example #13
def get_games(model_name):
    return gfile.Glob(os.path.join(selfplay_dir(), model_name, '*.zz'))
Example #14
def _get(pattern):
    files = gfile.Glob(pattern)
    # Use the pool as a context manager so worker processes are cleaned up.
    with multiprocessing.Pool() as pool:
        all_results = pool.map(_load, files)
    return pd.DataFrame(all_results)