Example #1
def _get_temp_gcs_path():
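    # Build a randomly named scratch directory under the test bucket.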
    path = f"gs://{GCS_TEST_BUCKET}/" + "".join(
        random.choice(string.ascii_lowercase) for i in range(16)
    )
    gfile.mkdir(path)
    yield path + "/file.name"
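    # Teardown: remove the temporary directory and anything written beneath it.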
    gfile.rmtree(path)
Example #2
  def test_get_policy_model_files(self):
    output_dir = self.get_temp_dir()

    def write_policy_model_file(epoch):
      with gfile.GFile(
          ppo.get_policy_model_file_from_epoch(output_dir, epoch), 'w') as f:
        f.write('some data')

    epochs = [200, 100, 300]

    # Expect the files sorted by epoch in descending order: 300, 200, 100.
    expected_policy_model_files = [
        output_dir + '/model-000300.pkl',
        output_dir + '/model-000200.pkl',
        output_dir + '/model-000100.pkl',
    ]

    for epoch in epochs:
      write_policy_model_file(epoch)

    policy_model_files = ppo.get_policy_model_files(output_dir)

    self.assertEqual(expected_policy_model_files, policy_model_files)

    gfile.rmtree(output_dir)
Example #3
def maybe_load_checkpoint(logdir, optimizer, clobber_checkpoint=False):
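  """Restore optimizer state from logdir, or wipe logdir when clobbering."""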
  if not clobber_checkpoint:
    if has_checkpoint(logdir):
      print("Loading checkpoint from %s" % logdir)
      optimizer = checkpoints.restore_checkpoint(logdir, optimizer)
      print("Checkpoint loaded from step %d" % optimizer.state.step)
  else:
    if gfile.isdir(logdir):
      gfile.rmtree(logdir)
  return optimizer
Example #4
 def remove(self, path: str) -> bool:
     """Delete a file or directory tree, returning True on success."""
     try:
         if gfile.isdir(path):
             gfile.rmtree(path)
         else:
             os.remove(path)
         return True
     except Exception as e:  # pylint: disable=broad-except
         logging.error('Error during remove: %s', str(e))
     return False
Example #5
def prepare_dirs(recreate=False):
    """Prepare config dirs

    When recreate is True, if previous execution exists, remove them and recreate.
    When recreate is False, remain previous execution.
    """
    experiment_dir = environment.EXPERIMENT_DIR
    tensorboard_dir = environment.TENSORBOARD_DIR
    checkpoints_dir = environment.CHECKPOINTS_DIR

    if recreate:
        message = """
Delete and recreate these dirs:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
        """.format(experiment_dir=experiment_dir,
                   tensorboard_dir=tensorboard_dir,
                   checkpoints_dir=checkpoints_dir)
    else:
        message = """
Create these dirs if they don't exist:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
        """.format(experiment_dir=experiment_dir,
                   tensorboard_dir=tensorboard_dir,
                   checkpoints_dir=checkpoints_dir)

    print(message)

    if recreate:
        if gfile.exists(experiment_dir):
            gfile.rmtree(experiment_dir)

        if gfile.exists(tensorboard_dir):
            gfile.rmtree(tensorboard_dir)

        if gfile.exists(checkpoints_dir):
            gfile.rmtree(checkpoints_dir)

    if not gfile.exists(experiment_dir):
        gfile.makedirs(experiment_dir)

    if not gfile.exists(tensorboard_dir):
        gfile.makedirs(tensorboard_dir)

    if not gfile.exists(checkpoints_dir):
        gfile.makedirs(checkpoints_dir)
Example #6
 def tmp_dir(self):
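     """Yield a fresh temporary directory and delete it after use."""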
     tmp = tempfile.mkdtemp(dir=self.get_temp_dir())
     yield tmp
     gfile.rmtree(tmp)
Example #7
 def tearDown(self):
     super(TestSklearnModel, self).tearDown()
     self.model.clean_up()
     if gfile.exists(self.gcs_path):
         gfile.rmtree(self.gcs_path)
Example #8
def rmtree(path: str):
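    """Recursively delete a directory tree via gfile."""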
    gfile.rmtree(path)
Example #9
 def remove(self, path: str) -> bool:
     # os.remove and gfile.rmtree return None, so return True explicitly.
     if not gfile.isdir(path):
         os.remove(path)
     else:
         gfile.rmtree(path)
     return True
Example #10
  def test_load_from_directory(self):
    output_dir = self.get_temp_dir()

    epochs = [0, 1, 2]
    env_ids = [0, 1, 2]
    temperatures = [0.5, 1.0]
    random_strings = ["a", "b"]

    # Write some trajectories.
    # There are 3x3x2x2 (36) trajectories, and of them 3x2x2 (12) are done.
    for epoch in epochs:
      for env_id in env_ids:
        for temperature in temperatures:
          for random_string in random_strings:
            traj = trajectory.Trajectory(time_steps=[
                time_step.TimeStep(
                    observation=epoch,
                    done=(epoch == 0),
                    raw_reward=1.0,
                    processed_reward=1.0,
                    action=env_id,
                    info={})
            ])

            trajectory_file_name = trajectory.TRAJECTORY_FILE_FORMAT.format(
                epoch=epoch,
                env_id=env_id,
                temperature=temperature,
                r=random_string)

            with gfile.GFile(
                os.path.join(output_dir, trajectory_file_name), "w") as f:
              trajectory._get_pickle_module().dump(traj, f)

    # Load everything and check.
    bt = trajectory.BatchTrajectory.load_from_directory(output_dir)

    self.assertIsInstance(bt, trajectory.BatchTrajectory)
    self.assertEqual(36, bt.num_completed_trajectories)
    self.assertEqual(36, bt.batch_size)

    bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=0)
    self.assertEqual(12, bt.num_completed_trajectories)
    self.assertEqual(12, bt.batch_size)

    # Ask for 100 trajectories; there aren't that many, and max_tries=0 disables retries.
    bt = trajectory.BatchTrajectory.load_from_directory(
        output_dir, epoch=0, n_trajectories=100, max_tries=0)
    self.assertIsNone(bt)

    bt = trajectory.BatchTrajectory.load_from_directory(
        output_dir, epoch=0, temperature=0.5)
    self.assertEqual(6, bt.num_completed_trajectories)
    self.assertEqual(6, bt.batch_size)

    bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=1)
    self.assertEqual(12, bt.num_completed_trajectories)
    self.assertEqual(12, bt.batch_size)

    # Constraints cannot be satisfied.
    bt = trajectory.BatchTrajectory.load_from_directory(
        output_dir, epoch=1, n_trajectories=100, up_sample=False, max_tries=0)
    self.assertIsNone(bt)

    # Constraints can be satisfied.
    bt = trajectory.BatchTrajectory.load_from_directory(
        output_dir, epoch=1, n_trajectories=100, up_sample=True, max_tries=0)
    self.assertEqual(100, bt.num_completed_trajectories)
    self.assertEqual(100, bt.batch_size)

    bt = trajectory.BatchTrajectory.load_from_directory(
        output_dir, epoch=1, n_trajectories=10)
    self.assertEqual(10, bt.num_completed_trajectories)
    self.assertEqual(10, bt.batch_size)

    gfile.rmtree(output_dir)
Example #11
def run(workdir,
        data,
        strategy,
        architecture,
        n_layers,
        n_hiddens,
        activation,
        dropout_rate,
        l2_penalty,
        w_init_name,
        b_init_name,
        optimizer_name,
        learning_rate,
        n_epochs,
        epochs_between_checkpoints,
        init_stddev,
        cnn_stride,
        reduce_learningrate=False,
        verbosity=0):
    """Runs the whole training procedure."""
    data_tr, data_te, dataset_info = data
    n_outputs = dataset_info['num_classes']

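    # Build the optimizer, initializers, and model under the distribution strategy scope.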
    with strategy.scope():
        optimizer = tf.keras.optimizers.get(optimizer_name)
        optimizer.learning_rate = learning_rate
        w_init = tf.keras.initializers.get(w_init_name)
        if w_init_name.lower() in ['truncatednormal', 'randomnormal']:
            w_init.stddev = init_stddev
        b_init = tf.keras.initializers.get(b_init_name)
        if b_init_name.lower() in ['truncatednormal', 'randomnormal']:
            b_init.stddev = init_stddev
        w_reg = tf.keras.regularizers.l2(
            l2_penalty) if l2_penalty > 0 else None

        if architecture == 'cnn' or architecture == 'cnnbn':
            model = build_cnn(n_layers, n_hiddens, n_outputs, dropout_rate,
                              activation, cnn_stride, w_reg, w_init, b_init,
                              architecture == 'cnnbn')
        elif architecture == 'fcn':
            model = build_fcn(n_layers, n_hiddens, n_outputs, dropout_rate,
                              activation, w_reg, w_init, b_init, False)
        else:
            assert False, 'Unknown architecture: %s' % architecture

        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            metrics=['accuracy', 'mse', 'sparse_categorical_crossentropy'])

    # force the model to set input shapes and init weights
    for x, _ in data_tr:
        model.predict(x)
        if verbosity:
            model.summary()
        break

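    # Checkpoint the optimizer and model so an interrupted run can resume.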
    ckpt = tf.train.Checkpoint(step=optimizer.iterations,
                               optimizer=optimizer,
                               model=model)
    ckpt_dir = os.path.join(workdir, 'temporary-ckpt')
    ckpt_manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=3)
    if ckpt_manager.latest_checkpoint:
        logging.info('restoring checkpoint: %s',
                     ckpt_manager.latest_checkpoint)
        print('restoring from %s' % ckpt_manager.latest_checkpoint)
        with strategy.scope():
            ckpt.restore(ckpt_manager.latest_checkpoint)
        info = restore_results(
            os.path.join(workdir, '.intermediate-results.json'))
        print(info, flush=True)
    else:
        info = {
            'steps': 0,
            'start_time': time.time(),
            'train_loss': dict(),
            'train_accuracy': dict(),
            'test_loss': dict(),
            'test_accuracy': dict(),
        }
        info.update(_get_workunit_params())  # Add command line parameters.

    logger = None
    starting_epoch = len(info['train_loss'])
    cur_epoch = starting_epoch
    for cur_epoch in range(starting_epoch, n_epochs):
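        # Near the end of training, optionally reduce the learning rate by 10x, then 100x.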
        if reduce_learningrate and cur_epoch == n_epochs - (n_epochs // 10):
            optimizer.learning_rate = learning_rate / 10
        elif reduce_learningrate and cur_epoch == n_epochs - 2:
            optimizer.learning_rate = learning_rate / 100

        # Train until we reach the criterion or get NaNs
        try:
            # always keep checkpoints for the first few epochs
            # we evaluate first and train afterwards so we have the at-init data
            if cur_epoch < 4 or (cur_epoch % epochs_between_checkpoints) == 0:
                eval_model(model, data_tr, data_te, info, logger, cur_epoch,
                           workdir)

            model.fit(data_tr, epochs=1, verbose=verbosity)
            ckpt_manager.save()
            store_results(info,
                          os.path.join(workdir, '.intermediate-results.json'))

            dt = time.time() - info['start_time']
            logging.info('epoch %d (%3.2fs)', cur_epoch, dt)

        except tf.errors.InvalidArgumentError as e:
            # We got NaN in the loss, most likely gradients resulted in NaNs
            logging.info(str(e))
            info['status'] = 'NaN'
            logging.info('Stop training because NaNs encountered')
            break

    eval_model(model, data_tr, data_te, info, logger, cur_epoch + 1, workdir)
    store_results(info, os.path.join(workdir, 'results.json'))

    # we don't need the temporary checkpoints anymore
    gfile.rmtree(os.path.join(workdir, 'temporary-ckpt'))
    gfile.remove(os.path.join(workdir, '.intermediate-results.json'))