def test_serialize_round_trip_no_parse(self):
    np.random.seed(1)
    raw_data = self.create_random_data(10)
    tfexamples = list(map(preprocessing.make_tf_example, *zip(*raw_data)))

    with tempfile.NamedTemporaryFile() as start_file, \
            tempfile.NamedTemporaryFile() as rewritten_file:
        preprocessing.write_tf_examples(start_file.name, tfexamples)
        # We want to test that the rewritten, shuffled file contains
        # correctly serialized tf.Examples.
        batch_size = 4
        batches = list(preprocessing.shuffle_tf_examples(
            batch_size, [start_file.name]))
        # 2 batches of 4, 1 incomplete batch of 2.
        self.assertEqual(len(batches), 3)
        # Concatenate the list of lists into one flat list, then write it
        # out once; the examples are already serialized.
        all_batches = list(itertools.chain.from_iterable(batches))
        preprocessing.write_tf_examples(
            rewritten_file.name, all_batches, serialize=False)

        original_data = self.extract_data(start_file.name)
        recovered_data = self.extract_data(rewritten_file.name)

        # Stuff is shuffled, so sort before checking equality.
        def sort_key(nparray_tuple):
            return nparray_tuple[2]
        original_data = sorted(original_data, key=sort_key)
        recovered_data = sorted(recovered_data, key=sort_key)

        self.assertEqualData(original_data, recovered_data)
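# A pure-Python sketch of the batching arithmetic the test above asserts:
# 10 shuffled examples cut into batches of 4 yield two full batches plus one
# incomplete batch of 2, and flattening the batches recovers every example.
# `shuffle_into_batches` is a hypothetical stand-in for
# preprocessing.shuffle_tf_examples, not the real implementation.
import itertools
import random


def shuffle_into_batches(items, batch_size, seed=1):
    """Shuffle items, then emit fixed-size batches; the last may be short."""
    shuffled = list(items)
    random.Random(seed).shuffle(shuffled)
    return [shuffled[i:i + batch_size]
            for i in range(0, len(shuffled), batch_size)]


batches = shuffle_into_batches(range(10), batch_size=4)
assert [len(b) for b in batches] == [4, 4, 2]
assert sorted(itertools.chain.from_iterable(batches)) == list(range(10))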
def gather(selfplay_dir, training_chunk_dir, params):
    """Gather selfplay data into large training chunks.

    Args:
        selfplay_dir: Where to look for games. Set as
            'base_dir/data/selfplay/'.
        training_chunk_dir: Where to put collected games. Set as
            'base_dir/data/training_chunks/'.
        params: A MiniGoParams instance of hyperparameters for the model.
    """
    # Check the selfplay data from the most recent `gather_generation` models.
    _ensure_dir_exists(training_chunk_dir)
    sorted_model_dirs = sorted(tf.gfile.ListDirectory(selfplay_dir))
    models = [model_dir.strip('/')
              for model_dir in sorted_model_dirs[-params.gather_generation:]]

    with utils.logged_timer('Finding existing tfrecords...'):
        model_gamedata = {
            model: tf.gfile.Glob(
                os.path.join(selfplay_dir, model, '*' + _TF_RECORD_SUFFIX))
            for model in models
        }
    print('Found {} models'.format(len(models)))
    for model_name, record_files in sorted(model_gamedata.items()):
        print('    {}: {} files'.format(model_name, len(record_files)))

    # meta.txt records which selfplay files have already been gathered, so
    # reruns skip work that is already done.
    meta_file = os.path.join(training_chunk_dir, 'meta.txt')
    try:
        with tf.gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print('Gathering files from {}:'.format(model_name))
        tf_examples = preprocessing.shuffle_tf_examples(
            params.shuffle_buffer_size, params.examples_per_chunk,
            record_files)
        for i, example_batch in enumerate(tf_examples):
            output_record = os.path.join(
                training_chunk_dir,
                ('{}-{}' + _TF_RECORD_SUFFIX).format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    print('Processed {} new files'.format(
        len(already_processed) - num_already_processed))
    with tf.gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
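# Hypothetical invocation of gather() above. The attribute names mirror the
# ones the function reads (gather_generation, shuffle_buffer_size,
# examples_per_chunk); the values here are illustrative assumptions, not the
# project's defaults.
class _FakeParams(object):
    gather_generation = 50        # scan the 50 most recent model dirs
    shuffle_buffer_size = 2000    # examples held in the shuffle buffer
    examples_per_chunk = 10000    # examples written per output chunk


gather('base_dir/data/selfplay/', 'base_dir/data/training_chunks/',
       _FakeParams())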
def gather(
        input_directory: 'where to look for games' = 'data/selfplay/',
        output_directory: 'where to put collected games' = 'data/training_chunks/',
        examples_per_record: 'how many tf.examples to gather in each chunk' = EXAMPLES_PER_RECORD):
    qmeas.start_time('gather')
    _ensure_dir_exists(output_directory)
    models = [model_dir.strip('/')
              for model_dir in sorted(gfile.ListDirectory(input_directory))[-50:]]
    with timer("Finding existing tfrecords..."):
        model_gamedata = {
            model: gfile.Glob(
                os.path.join(input_directory, model, '*.tfrecord.zz'))
            for model in models
        }
    print("Found %d models" % len(models))
    for model_name, record_files in sorted(model_gamedata.items()):
        print("    %s: %s files" % (model_name, len(record_files)))

    meta_file = os.path.join(output_directory, 'meta.txt')
    try:
        with gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print("Gathering files for %s:" % model_name)
        # tqdm wraps the generator to show a smart progress meter.
        for i, example_batch in enumerate(
                tqdm(preprocessing.shuffle_tf_examples(
                    examples_per_record, record_files))):
            output_record = os.path.join(
                output_directory,
                '{}-{}.tfrecord.zz'.format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    print("Processed %s new files" %
          (len(already_processed) - num_already_processed))
    with gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
    qmeas.stop_time('gather')
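# The string annotations in gather()'s signature are not type hints; they
# serve as help text for a CLI wrapper (libraries such as argh consume
# parameter annotations this way). Plain Python simply stores them in the
# function's __annotations__ mapping:
assert gather.__annotations__['input_directory'] == 'where to look for games'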
def aggregate():
    logger.info("Gathering game results")

    os.makedirs(PATHS.TRAINING_CHUNK_DIR, exist_ok=True)
    os.makedirs(PATHS.SELFPLAY_DIR, exist_ok=True)
    models = [model_dir.strip('/')
              for model_dir in sorted(gfile.ListDirectory(PATHS.SELFPLAY_DIR))[-50:]]

    with timer("Finding existing tfrecords..."):
        model_gamedata = {
            model: gfile.Glob(os.path.join(PATHS.SELFPLAY_DIR, model, '*.zz'))
            for model in models
        }
    logger.info("Found %d models", len(models))
    for model_name, record_files in sorted(model_gamedata.items()):
        logger.info("    %s: %s files", model_name, len(record_files))

    meta_file = os.path.join(PATHS.TRAINING_CHUNK_DIR, 'meta.txt')
    try:
        with gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        logger.info("Gathering files for %s:", model_name)
        for i, example_batch in enumerate(
                tqdm(preprocessing.shuffle_tf_examples(
                    GLOBAL_PARAMETER_STORE.EXAMPLES_PER_RECORD,
                    record_files))):
            output_record = os.path.join(
                PATHS.TRAINING_CHUNK_DIR,
                '{}-{}.tfrecord.zz'.format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    logger.info("Processed %s new files",
                len(already_processed) - num_already_processed)
    with gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
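# Minimal sketch of the meta.txt bookkeeping shared by gather() and
# aggregate(): one processed file path per line, so an interrupted run can
# be restarted without re-chunking games it has already gathered. This
# version uses the local filesystem instead of gfile, purely for
# illustration.
def _load_processed(meta_file):
    try:
        with open(meta_file) as f:
            return set(f.read().split())
    except FileNotFoundError:
        return set()


def _save_processed(meta_file, processed):
    with open(meta_file, 'w') as f:
        f.write('\n'.join(sorted(processed)))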