示例#1
0
 def _input_fn(params):
     """Return TPU input tensors read from two Bigtable game queues.

     One queue is opened on ``FLAGS.cbt_table`` and a second on the same
     table name with a ``'-nr'`` suffix; both are handed to
     ``preprocessing.get_tpu_bt_input_tensors``.

     Args:
         params: Mapping that must contain ``'batch_size'``.
     """
     location = (FLAGS.cbt_project, FLAGS.cbt_instance)
     main_queue = bigtable_input.GameQueue(*location, FLAGS.cbt_table)
     nr_queue = bigtable_input.GameQueue(*location, FLAGS.cbt_table + '-nr')
     return preprocessing.get_tpu_bt_input_tensors(
         main_queue,
         nr_queue,
         params['batch_size'],
         number_of_games=FLAGS.window_size,
         random_rotation=True)
示例#2
0
文件: train.py 项目: luckti/minigo
def train(*tf_records: "Records to train on"):
    """Run one training pass of the estimator from ``dual_net``.

    The input pipeline is chosen from flags: without ``FLAGS.use_tpu`` the
    records in ``tf_records`` are read through
    ``preprocessing.get_input_tensors``; with it, input comes either from
    two Bigtable game queues (``FLAGS.use_bt``) or from ``tf_records``.
    When ``FLAGS.use_bt`` is set, ``bigtable_input.set_fresh_watermark`` is
    called after training returns.

    Args:
        *tf_records: Records to train on.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    estimator = dual_net.get_estimator()

    # On TPU the per-core batch is scaled by the number of cores.
    if FLAGS.use_tpu:
        effective_batch_size = FLAGS.train_batch_size * FLAGS.num_tpu_cores
    else:
        effective_batch_size = FLAGS.train_batch_size

    if FLAGS.use_bt:
        queue = bigtable_input.GameQueue(
            FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
        queue_nr = bigtable_input.GameQueue(
            FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table + '-nr')

    if not FLAGS.use_tpu:
        hooks = [UpdateRatioSessionHook(FLAGS.work_dir),
                 EchoStepCounterHook(output_dir=FLAGS.work_dir)]

        def _input_fn():
            return preprocessing.get_input_tensors(
                FLAGS.train_batch_size,
                tf_records,
                filter_amount=1.0,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True)
    else:
        # Hooks are broken with TPUestimator at the moment.
        hooks = []
        if FLAGS.use_bt:
            def _input_fn(params):
                return preprocessing.get_tpu_bt_input_tensors(
                    queue,
                    queue_nr,
                    params['batch_size'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:
            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'],
                    tf_records,
                    random_rotation=True)

    steps = FLAGS.steps_to_train
    example_count = (steps * effective_batch_size) if steps else '?'
    logging.info("Training, steps = %s, batch = %s -> %s examples",
                 steps or '?', effective_batch_size, example_count)
    estimator.train(_input_fn, steps=steps, hooks=hooks)

    if FLAGS.use_bt:
        bigtable_input.set_fresh_watermark(queue, FLAGS.window_size)
示例#3
0
def main(argv):
    """Plan a series of training sets and export them in parallel.

    Builds one argument tuple per entry yielded by ``training_series``
    (taken in reverse order), optionally trims the list using
    ``FLAGS.starting_game`` and ``FLAGS.max_trainings``, then maps
    ``_export_training_set`` over the list in chunks using a process pool.
    With ``FLAGS.dry_run`` the planned tuples are printed and the program
    exits via ``SystemExit`` instead.
    """
    del argv  # Unused

    spec = bigtable_input.BigtableSpec(FLAGS.cbt_project, FLAGS.cbt_instance,
                                       FLAGS.cbt_table)
    gq_r = bigtable_input.GameQueue(spec.project, spec.instance, spec.table)
    gq_c = bigtable_input.GameQueue(spec.project, spec.instance,
                                    spec.table + '-nr')

    mix = bigtable_input.mix_by_decile(FLAGS.training_games,
                                       FLAGS.training_moves, 9)
    series = list(
        training_series(gq_r.latest_game_number, gq_c.latest_game_number,
                        mix, FLAGS.training_fresh))

    trainings = []
    for start_r, _finish_r, start_c, _finish_c in reversed(series):
        output_url = '{}{:0>10}_{:0>10}.tfrecord.zz'.format(
            FLAGS.output_prefix, start_r, start_c)
        trainings.append(
            (spec, start_r, start_c, mix, FLAGS.batch_size, output_url))

    if FLAGS.starting_game:
        # Drop every entry positioned before the requested starting game.
        first = bisect.bisect_left([entry[1] for entry in trainings],
                                   FLAGS.starting_game)
        trainings = trainings[first:]

    if FLAGS.max_trainings:
        trainings = trainings[:FLAGS.max_trainings]

    if FLAGS.dry_run:
        for entry in trainings:
            print(entry)
        raise SystemExit

    concurrency = min(FLAGS.concurrency, multiprocessing.cpu_count() * 2)
    with tqdm(desc='Training Sets', unit_scale=2,
              total=len(trainings)) as pbar:
        for chunk in utils.iter_chunks(concurrency, trainings):
            # A new pool is created for every chunk of work.
            with multiprocessing.Pool(processes=concurrency) as pool:
                pool.map(_export_training_set, chunk)
                pbar.update(len(chunk))
示例#4
0
def main(argv):
    """Refresh the flag file from the '-nr' game queue every three minutes.

    Loops forever: each iteration computes a value with
    ``get_95_percentile_bleak`` and writes it to the flag file at
    ``fsdb.flags_path()`` via ``update_flagfile``.

    Raises:
        app.UsageError: If extra command-line arguments are supplied.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    nr_table = FLAGS.cbt_table + '-nr'
    games_nr = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance,
                                        nr_table)

    pause_seconds = 60 * 3
    while True:
        percentile = get_95_percentile_bleak(games_nr)
        update_flagfile(fsdb.flags_path(), percentile)
        time.sleep(pause_seconds)
示例#5
0
def _export_training_set(args):
    """Write one training set to a ZLIB-compressed TFRecord file.

    Args:
        args: Tuple of ``(spec, start_r, start_c, mix, batch_size,
            output_url)``; packed into one argument so the function can be
            used with ``Pool.map``.

    Progress is reported by ``tqdm`` into ``/tmp/<start_r>_<start_c>.log``;
    that file is removed once the dataset has been fully consumed.
    """
    spec, start_r, start_c, mix, batch_size, output_url = args
    gq_r = bigtable_input.GameQueue(spec.project, spec.instance, spec.table)
    gq_c = bigtable_input.GameQueue(spec.project, spec.instance,
                                    spec.table + '-nr')
    total_moves = mix.moves_r + mix.moves_c

    with tf.Session() as sess:
        dataset = bigtable_input.get_unparsed_moves_from_games(
            gq_r, gq_c, start_r, start_c, mix).batch(batch_size)
        iterator = dataset.make_initializable_iterator()
        sess.run(iterator.initializer)
        next_batch = iterator.get_next()
        batches_written = 0
        print('Writing to', output_url)
        compression = tf.io.TFRecordCompressionType.ZLIB
        with tf.io.TFRecordWriter(output_url, options=compression) as wr:
            log_filename = '/tmp/{}_{}.log'.format(start_r, start_c)
            with open(log_filename, 'w') as progress_file, \
                    tqdm(desc='Records',
                         unit_scale=2,
                         total=total_moves,
                         file=progress_file) as pbar:
                try:
                    # Runs until the iterator signals exhaustion.
                    while True:
                        records = sess.run(next_batch)
                        pbar.update(len(records))
                        for record in records:
                            wr.write(record)
                        batches_written += 1
                        # Flush periodically (every 10000 batches).
                        if not batches_written % 10000:
                            wr.flush()
                except tf.errors.OutOfRangeError:
                    pass
            os.unlink(log_filename)
示例#6
0
文件: train.py 项目: zhiwuya/minigo
def train(*tf_records: "Records to train on"):
    """Run one training pass of the estimator from ``dual_net``.

    The input function is picked from ``FLAGS.use_tpu`` / ``FLAGS.use_bt``.
    If ``FLAGS.steps_to_train`` is unset, the step count is derived from
    ``FLAGS.num_examples`` and the effective batch size.  With
    ``FLAGS.use_bt`` the Bigtable game queue is consulted before training
    and its watermark is updated afterwards; if training fails, the
    fresh-games requirement is reset to 0 before the error propagates.

    Args:
        *tf_records: Records to train on.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    estimator = dual_net.get_estimator()

    # On TPU the per-core batch is scaled by the number of cores.
    if FLAGS.use_tpu:
        effective_batch_size = FLAGS.train_batch_size * FLAGS.num_tpu_cores
    else:
        effective_batch_size = FLAGS.train_batch_size

    if not FLAGS.use_tpu:
        hooks = [UpdateRatioSessionHook(FLAGS.work_dir),
                 EchoStepCounterHook(output_dir=FLAGS.work_dir)]

        def _input_fn():
            return preprocessing.get_input_tensors(
                FLAGS.train_batch_size,
                FLAGS.input_layout,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_examples=FLAGS.shuffle_examples,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True)
    else:
        # Hooks are broken with TPUestimator at the moment.
        hooks = []
        if FLAGS.use_bt:
            def _input_fn(params):
                queue = bigtable_input.GameQueue(
                    FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
                queue_nr = bigtable_input.GameQueue(
                    FLAGS.cbt_project, FLAGS.cbt_instance,
                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    queue,
                    queue_nr,
                    params['batch_size'],
                    params['input_layout'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:
            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'],
                    params['input_layout'],
                    tf_records,
                    filter_amount=FLAGS.filter_amount,
                    shuffle_examples=FLAGS.shuffle_examples,
                    shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                    random_rotation=True)

    steps = FLAGS.steps_to_train
    if not steps and FLAGS.num_examples:
        # No explicit step count: cover num_examples with whole batches.
        steps = math.floor(FLAGS.num_examples / effective_batch_size)

    example_count = (steps * effective_batch_size) if steps else '?'
    logging.info("Training, steps = %s, batch = %s -> %s examples",
                 steps or '?', effective_batch_size, example_count)

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(
            FLAGS.cbt_project, FLAGS.cbt_instance, FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except BaseException:
        # Catch everything (including interrupts) so the fresh-games
        # requirement is always reset before the error is re-raised.
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise
示例#7
0
def train(*tf_records: "Records to train on"):
    """Run one training pass of the estimator from ``dual_net``.

    Batch sizing:
      * With ``FLAGS.dist_train`` the configured batch is divided by
        ``hvd.size()`` to get the local batch, and the resulting global
        batch size is reported through ``mllog``.
      * With ``FLAGS.use_tpu`` the batch is multiplied by
        ``FLAGS.num_tpu_cores``.

    If ``FLAGS.steps_to_train`` is unset, the step count is derived from
    ``FLAGS.num_examples`` and the effective batch size.  With
    ``FLAGS.use_bt`` the Bigtable game queue is consulted before training
    and its watermark is updated afterwards; if training fails, the
    fresh-games requirement is reset to 0 before the error propagates.

    Args:
        *tf_records: Records to train on.
    """

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    estimator = dual_net.get_estimator(FLAGS.num_intra_threads,
                                       FLAGS.num_inter_threads)

    if FLAGS.dist_train:
        effective_batch_size = int(FLAGS.train_batch_size / hvd.size())
        # Recomputed from the truncated local size, so it may be slightly
        # smaller than FLAGS.train_batch_size.
        global_batch_size = effective_batch_size * hvd.size()
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE,
                       value=global_batch_size)
    else:
        effective_batch_size = FLAGS.train_batch_size
        global_batch_size = FLAGS.train_batch_size

    logging.info("Real global batch size = {}, local batch size = {}.".format(
        global_batch_size, effective_batch_size))

    if FLAGS.use_tpu:
        # From here on effective_batch_size already includes the core count.
        effective_batch_size *= FLAGS.num_tpu_cores

        if FLAGS.use_bt:

            def _input_fn(params):
                games = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                 FLAGS.cbt_instance,
                                                 FLAGS.cbt_table)
                games_nr = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                    FLAGS.cbt_instance,
                                                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    games,
                    games_nr,
                    params['batch_size'],
                    params['input_layout'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:

            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'],
                    params['input_layout'],
                    tf_records,
                    filter_amount=FLAGS.filter_amount,
                    shuffle_examples=FLAGS.shuffle_examples,
                    shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                    random_rotation=True)

        # Hooks are broken with TPUestimator at the moment.
        hooks = []
    else:

        def _input_fn():
            return preprocessing.get_input_tensors(
                effective_batch_size,
                FLAGS.input_layout,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_examples=FLAGS.shuffle_examples,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True,
                seed=FLAGS.training_seed,
                dist_train=FLAGS.dist_train,
                use_bf16=FLAGS.use_bfloat16)

        hooks = [
            UpdateRatioSessionHook(FLAGS.work_dir),
            EchoStepCounterHook(output_dir=FLAGS.work_dir)
        ]
        if FLAGS.dist_train:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    steps = FLAGS.steps_to_train
    if not steps and FLAGS.num_examples:
        # effective_batch_size is already scaled by num_tpu_cores when
        # running on TPU; scaling it a second time here would shrink the
        # step count by that factor and contradict the log line below.
        steps = math.floor(FLAGS.num_examples / effective_batch_size)

    logging.info("Training, steps = %s, batch = %s -> %s examples", steps
                 or '?', effective_batch_size,
                 (steps * effective_batch_size) if steps else '?')

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance,
                                         FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except BaseException:
        # Catch everything (including interrupts) so the fresh-games
        # requirement is always reset before the error is re-raised.
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise
示例#8
0
def _run_benchmark(benchmark_op):
    """Run ``benchmark_op`` in a new session and print its decoded JSON."""
    import json

    print("Benchmarking data pipeline:")
    with tf.Session() as sess:
        json_string = sess.run(benchmark_op)
        json_object = json.loads(json_string[0])
    print(json_object)


def train(*tf_records: "Records to train on"):
    """Run one training pass of the estimator, on TPU, IPU or the default path.

    The input function and hooks are chosen from ``FLAGS.use_tpu`` /
    ``FLAGS.use_ipu`` / ``FLAGS.use_bt``.  When the module-level switches
    ``DATA_BENCHMARK`` or ``INFEED_BENCHMARK`` are set, the input pipeline
    is benchmarked instead of (or before) training and the function ends by
    raising ``NotImplementedError``.

    Args:
        *tf_records: Records to train on.

    Returns:
        The estimator that was trained.

    Raises:
        NotImplementedError: After a benchmark run has finished.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    estimator = dual_net.get_estimator()

    effective_batch_size = FLAGS.train_batch_size
    if FLAGS.use_tpu:
        effective_batch_size *= FLAGS.num_tpu_cores
    elif FLAGS.use_ipu:
        effective_batch_size *= FLAGS.num_ipu_cores

    if FLAGS.use_tpu:
        if FLAGS.use_bt:

            def _input_fn(params):
                games = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                 FLAGS.cbt_instance,
                                                 FLAGS.cbt_table)
                games_nr = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                    FLAGS.cbt_instance,
                                                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    games,
                    games_nr,
                    params['batch_size'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:

            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'], tf_records, random_rotation=True)

        # Hooks are broken with TPUestimator at the moment.
        hooks = []
    elif FLAGS.use_ipu:

        def _input_fn():
            return preprocessing.get_ipu_input_tensors(
                FLAGS.train_batch_size,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                shuffle_examples=False,
                random_rotation=False)

        hooks = []
    else:

        def _input_fn():
            return preprocessing.get_input_tensors(
                FLAGS.train_batch_size,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True)

        hooks = [
            UpdateRatioSessionHook(FLAGS.work_dir),
            EchoStepCounterHook(output_dir=FLAGS.work_dir)
        ]

    # Profiling is best-effort: a failure to set it up must not stop
    # training.  Only ordinary exceptions are absorbed, so interrupts and
    # interpreter exits still propagate.
    try:
        if FLAGS.PROFILING:
            # When profiling, the profiler hook replaces all other hooks.
            hooks = [ProfilerHook()]
    except Exception as err:
        logging.debug("Profiling hook not installed: %s", err)

    steps = FLAGS.steps_to_train

    # Step correction due to smaller batch size.  Skipped when no step
    # count is configured, because the rescaling arithmetic would fail on a
    # missing value.
    # NOTE(review): 4096 looks like the batch size the configured step
    # count was tuned for -- confirm.
    if FLAGS.use_ipu and steps:
        steps = steps * 4096 // effective_batch_size

    logging.info("Training, steps = %s, batch = %s -> %s examples", steps
                 or '?', effective_batch_size,
                 (steps * effective_batch_size) if steps else '?')

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance,
                                         FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    if DATA_BENCHMARK:
        benchmark_op = dataset_benchmark(
            dataset=_input_fn(),
            number_of_epochs=80,
            elements_per_epochs=10000,
            print_stats=True,
        )
        _run_benchmark(benchmark_op)
        if not INFEED_BENCHMARK:
            # The exception is how a benchmark-only run is terminated.
            raise NotImplementedError("Data benchmark ended.")
        else:
            print("Data benchmark ended.")

    if INFEED_BENCHMARK:
        benchmark_op = infeed_benchmark(
            infeed_queue=ipu_infeed_queue.IPUInfeedQueue(_input_fn(),
                                                         feed_name="infeed"),
            number_of_epochs=80,
            elements_per_epochs=10000,
            print_stats=True,
        )
        _run_benchmark(benchmark_op)
        # The exception is how a benchmark-only run is terminated.
        raise NotImplementedError("Infeed benchmark ended.")

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except BaseException:
        # Catch everything (including interrupts) so the fresh-games
        # requirement is always reset before the error is re-raised.
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise

    return estimator