Example #1
    def calculate_speed(self,
                        t0,
                        t_after_deltas,
                        t_after_update,
                        num_replicas,
                        verbose=False):
        effective_batch_size = self.get_effective_batch_size(num_replicas)
        t_calculate = t_after_deltas - t0
        t_sync = t_after_update - t_after_deltas
        t_tot = t_after_update - t0

        examples_per_sec = effective_batch_size / t_tot
        frac_calculate = t_calculate / t_tot
        frac_sync = t_sync / t_tot

        print_str = (
            '{:.2E} Examples/sec | {:.2E} sec/batch '.format(
                examples_per_sec, t_tot) +
            '[{:.1%} calc., {:.1%} sync.]'.format(frac_calculate, frac_sync))
        print_str += ' [batch = {} = {}*{}] [lr = {:.2E} = {:.2E}*{}]'.format(
            effective_batch_size, self.batch_size, num_replicas,
            self.get_effective_lr(num_replicas), self.lr, num_replicas)
        if verbose:
            g.write_unique(print_str)
        return print_str
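
For reference, the timing arithmetic in calculate_speed can be exercised on its own. The following is a minimal, self-contained sketch; the helper name speed_summary, the timestamps, and the batch size are invented for illustration and are not part of the original class:

import time

def speed_summary(t0, t_after_deltas, t_after_update, effective_batch_size):
    # Split one training step into a gradient-computation phase and a
    # weight-synchronization phase, then report throughput and the fraction
    # of time spent in each phase.
    t_calculate = t_after_deltas - t0
    t_sync = t_after_update - t_after_deltas
    t_tot = t_after_update - t0
    examples_per_sec = effective_batch_size / t_tot
    fmt = ('{:.2E} Examples/sec | {:.2E} sec/batch '
           '[{:.1%} calc., {:.1%} sync.]')
    return fmt.format(examples_per_sec, t_tot,
                      t_calculate / t_tot, t_sync / t_tot)

# Hypothetical timestamps: 80 ms of gradient work, 20 ms of synchronization.
t0 = time.time()
print(speed_summary(t0, t0 + 0.08, t0 + 0.10, effective_batch_size=512))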
Example #2
    def get_effective_lr(self, num_replicas):
        effective_lr = self.lr * num_replicas
        if effective_lr > self.max_lr:
            g.write_unique(
                'Warning: effective learning rate set to {}, '.format(
                    effective_lr) +
                'larger than maximum {}. Clipping.'.format(self.max_lr))
            effective_lr = self.max_lr
        return effective_lr
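
The rule in get_effective_lr is linear learning-rate scaling with a ceiling: the base learning rate is multiplied by the replica count and then clipped at max_lr. A standalone sketch of that rule follows; the function name and the numeric values are illustrative and not taken from any config:

def effective_lr(base_lr, num_replicas, max_lr):
    # Linear scaling: multiply the base learning rate by the number of
    # replicas, then clip at max_lr so large ensembles do not destabilize
    # training.
    return min(base_lr * num_replicas, max_lr)

# A base lr of 2e-5 on 32 replicas scales to 6.4e-4, clipped to 5e-4 here.
print(effective_lr(2e-5, 32, max_lr=5e-4))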
Example #3
def mpi_train(conf,
              shot_list_train,
              shot_list_validate,
              loader,
              callbacks_list=None,
              shot_list_test=None):
    loader.set_inference_mode(False)

    # TODO(KGF): this is not defined in conf.yaml, but added to processed dict
    # for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py"
        # except: "Rate should be set to `rate = 1 - keep_prob`"
        # Also suppresses warnings from "keras/optimizers.py"
        # does NOT suppress warnings from "/tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF>v1.13.0 (esp v1.14.0), this next line prompts a ton of
    # deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From  .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different than C-based info diagnostics e.g.:
    # 2019-11-06 18:27:31.698908: I ...  dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top level __init__.py

    # Load the latest saved epoch; returns 0 if none exists yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)

    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model,
                         optimizer,
                         g.comm,
                         batch_generator,
                         batch_size,
                         lr=lr,
                         warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum,
                         conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)
    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        fr = open('model_architecture.log', 'a')
        ori = sys.stdout
        sys.stdout = fr
        mpi_model.model.summary()
        sys.stdout = ori
        fr.close()
        mpi_model.model.summary()

    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()
    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/ MPI barrier?
        # g.flush_all_inorder()

        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        if ('monitor_test' in conf['callbacks'].keys()
                and conf['callbacks']['monitor_test']):
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join([
                'val_roc_{} = {}'.format(t, roc)
                for t, roc in zip(times, areas)
            ]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join([
                    'test_roc_{} = {}'.format(t, roc)
                    for t, roc in zip(times, areas)
                ]) + '\n')

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                if ('monitor_test' in conf['callbacks'].keys()
                        and conf['callbacks']['monitor_test']):
                    print("No improvement, saving model weights anyways")
                else:
                    print("Not saving model weights")
                    specific_builder.delete_model_weights(
                        train_model, int(round(e)))

            # tensorboard
            val_generator = partial(loader.training_batch_generator,
                                    shot_list=shot_list_validate)()
            val_steps = 1
            tensorboard.on_epoch_end(val_generator, val_steps, int(round(e)),
                                     epoch_logs)
        stop_training = g.comm.bcast(stop_training, root=0)
        g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
            e, num_epochs))
        # TODO(KGF): compare to old diagnostic:
        # g.write_unique("end epoch {}".format(e_old))
        if stop_training:
            g.write_unique("Stopping training due to early stopping")
            break

    if g.task_index == 0:
        callbacks.on_train_end()
        tensorboard.on_train_end()

    mpi_model.close()
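
One detail worth isolating from mpi_train is the weight-retention rule: an epoch's weights are kept only when the monitored metric sets a new best value, where "best" means max in 'max' mode and min otherwise. The sketch below mirrors that cmp_fn/best_so_far bookkeeping; the helper name keep_weights and the metric values are made up for illustration:

import numpy as np

def keep_weights(monitor_history, mode='max'):
    # Track the best monitored value seen so far; an epoch "keeps" its weights
    # only when its metric equals the new best, as in the loop above.
    best_so_far, cmp_fn = (-np.inf, max) if mode == 'max' else (np.inf, min)
    decisions = []
    for value in monitor_history:
        best_so_far = cmp_fn(value, best_so_far)
        decisions.append(best_so_far == value)
    return decisions

# e.g. hypothetical per-epoch validation ROC values
print(keep_weights([0.71, 0.74, 0.73, 0.78]))  # [True, True, False, True]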
Example #4
    def train_epoch(self):
        '''
        Perform distributed mini-batch SGD for one epoch.  Using the batch
        iterator function and the NN model held by the MPIModel object, it
        fetches mini-batches in a while loop until the number of samples seen
        by the ensemble of workers (num_so_far) exceeds the training dataset
        size (num_total).

        NOTE: "sample" = "an entire shot" within this description

        During each iteration, the gradient updates (deltas) and the loss are
        calculated for each model replica in the ensemble, the weights are
        averaged over the ensemble, and the new weights are set.

        It calls the MPIModel.get_deltas and MPIModel.set_new_weights methods.

        Argument list: Empty

        Returns:
          - step: final iteration number
          - ave_loss: model loss averaged over iterations within this epoch
          - curr_loss: training loss averaged over replicas at final iteration
          - num_so_far: the cumulative number of samples seen by the ensemble
        of replicas up to the end of the final iteration (step) of this epoch

        Intermediate outputs and logging: debug printout of the MPI task_index,
        epoch number, number of samples seen so far in the current epoch, and
        average training loss

        '''

        verbose = False
        first_run = True
        step = 0
        loss_averager = Averager()
        t_start = time.time()

        timeline_prof = False
        if (self.conf is not None
                and self.conf['training']['timeline_prof']):
            timeline_prof = True
        step_limit = 0
        if (self.conf is not None
                and self.conf['training']['step_limit'] > 0):
            step_limit = self.conf['training']['step_limit']

        batch_iterator_func = self.batch_iterator_func
        num_total = 1
        ave_loss = -1
        curr_loss = -1
        t0 = 0
        t1 = 0
        t2 = 0

        while ((self.num_so_far - self.epoch * num_total) < num_total
               or step < self.num_batches_minimum):
            if step_limit > 0 and step > step_limit:
                print('reached step limit')
                break
            try:
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator_func)
            except StopIteration:
                g.print_unique("Resetting batch iterator.")
                self.num_so_far_accum = self.num_so_far_indiv
                self.set_batch_iterator_func()
                batch_iterator_func = self.batch_iterator_func
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator_func)
            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr

            # if batches_to_reset:
            # self.model.reset_states(batches_to_reset)

            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
            num_replicas = 1 if warmup_phase else self.num_replicas

            self.num_so_far = self.mpi_sum_scalars(self.num_so_far_indiv,
                                                   num_replicas)

            # run the model once to force compilation. Don't actually use these
            # values.
            if first_run:
                first_run = False
                t0_comp = time.time()
                #   print('input_dimension:',batch_xs.shape)
                #   print('output_dimension:',batch_ys.shape)
                _, _ = self.train_on_batch_and_get_deltas(
                    batch_xs, batch_ys, verbose)
                self.comm.Barrier()
                sys.stdout.flush()
                # TODO(KGF): check line feed/carriage returns around this
                g.print_unique(
                    '\nCompilation finished in {:.2f}s'.format(time.time() -
                                                               t0_comp))
                t_start = time.time()
                sys.stdout.flush()

            if np.any(batches_to_reset):
                reset_states(self.model, batches_to_reset)
            if ('noise' in self.conf['training'].keys()
                    and self.conf['training']['noise'] is not False):
                batch_xs = self.add_noise(batch_xs)
            t0 = time.time()
            deltas, loss = self.train_on_batch_and_get_deltas(
                batch_xs, batch_ys, verbose)
            t1 = time.time()
            if not is_warmup_period:
                self.set_new_weights(deltas, num_replicas)
                t2 = time.time()
                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
                curr_loss = self.mpi_average_scalars(1.0 * loss, num_replicas)
                # g.print_unique(self.model.get_weights()[0][0][:4])
                loss_averager.add_val(curr_loss)
                ave_loss = loss_averager.get_ave()
                eta = self.estimate_remaining_time(
                    t0 - t_start, self.num_so_far - self.epoch * num_total,
                    num_total)
                write_str = (
                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
                        self.task_index, step, eta, 1.0 * self.num_so_far,
                        num_total) +
                    'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss) +
                    'walltime: {:.4f} | '.format(time.time() -
                                                 self.start_time))
                g.write_unique(write_str + write_str_0)

                if timeline_prof:
                    # dump profile
                    tl = timeline.Timeline(self.run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    # dump file per iteration
                    with open('timeline_%s.json' % step, 'w') as f:
                        f.write(ctf)

                step += 1
            else:
                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
                    self.task_index, self.num_so_far))

        effective_epochs = 1.0 * self.num_so_far / num_total
        epoch_previous = self.epoch
        self.epoch = effective_epochs
        g.write_unique(
            # TODO(KGF): "a total of X epochs within this session" ?
            '\nFinished training epoch {:.2f} '.format(self.epoch)
            # TODO(KGF): "precisely/exactly X epochs just passed"?
            +
            'during this session ({:.2f} epochs passed)'.format(self.epoch -
                                                                epoch_previous)
            # '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
            #     1.0 * self.epoch, self.epoch - epoch_previous)
            + ' in {:.2f} seconds\n'.format(t2 - t_start))
        return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
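
The per-step loss reported above is averaged over replicas via mpi_average_scalars, whose implementation is not shown in these examples. Assuming it follows the usual mpi4py allreduce pattern, a minimal sketch might look like this (the helper name and the rank-dependent loss values are illustrative only):

from mpi4py import MPI

def average_scalar_over_replicas(comm, local_value, num_replicas):
    # Sum the per-replica scalar across all ranks, then divide by the number
    # of replicas that contributed to this step.
    total = comm.allreduce(local_value, op=MPI.SUM)
    return total / num_replicas

if __name__ == '__main__':
    comm = MPI.COMM_WORLD
    # Each rank contributes its own loss, e.g. run under `mpirun -n 4 ...`.
    local_loss = 0.1 * (comm.Get_rank() + 1)
    print(average_scalar_over_replicas(comm, local_loss, comm.Get_size()))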
Example #5
    def train_epoch(self):
        '''
        The purpose of this method is to perform distributed mini-batch SGD
        for one epoch.  Using the batch iterator function and the NN model
        held by the MPIModel object, it fetches mini-batches in a while loop
        until the number of samples seen by the ensemble of workers
        (num_so_far) exceeds the training dataset size (num_total).

        During each iteration, the gradient updates (deltas) and the loss are
        calculated for each model replica in the ensemble, the weights are
        averaged over the ensemble, and the new weights are set.

        It calls the MPIModel.get_deltas and MPIModel.set_new_weights methods.

        Argument list: Empty

        Returns:
          - step: final iteration number
          - ave_loss: model loss averaged over iterations within this epoch
          - curr_loss: training loss averaged over replicas at final iteration
          - num_so_far: the cumulative number of samples seen by the ensemble
        of replicas up to the end of the final iteration (step) of this epoch

        Intermediate outputs and logging: debug printout of the MPI task_index,
        epoch number, number of samples seen so far in the current epoch, and
        average training loss

        '''

        verbose = False
        first_run = True
        step = 0
        loss_averager = Averager()
        t_start = time.time()

        batch_iterator_func = self.batch_iterator_func
        num_total = 1
        ave_loss = -1
        curr_loss = -1
        t0 = 0
        t1 = 0
        t2 = 0

        while ((self.num_so_far - self.epoch * num_total) < num_total
               or step < self.num_batches_minimum):
            try:
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator_func)
            except StopIteration:
                g.print_unique("Resetting batch iterator.")
                self.num_so_far_accum = self.num_so_far_indiv
                self.set_batch_iterator_func()
                batch_iterator_func = self.batch_iterator_func
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator_func)
            self.num_so_far_indiv = self.num_so_far_accum + num_so_far_curr

            # if batches_to_reset:
            # self.model.reset_states(batches_to_reset)

            warmup_phase = (step < self.warmup_steps and self.epoch == 0)
            num_replicas = 1 if warmup_phase else self.num_replicas

            self.num_so_far = self.mpi_sum_scalars(self.num_so_far_indiv,
                                                   num_replicas)

            # run the model once to force compilation. Don't actually use these
            # values.
            if first_run:
                first_run = False
                t0_comp = time.time()
                _, _ = self.train_on_batch_and_get_deltas(
                    batch_xs, batch_ys, verbose)
                self.comm.Barrier()
                sys.stdout.flush()
                # TODO(KGF): check line feed/carriage returns around this
                g.print_unique(
                    '\nCompilation finished in {:.2f}s'.format(time.time() -
                                                               t0_comp))
                t_start = time.time()
                sys.stdout.flush()

            if np.any(batches_to_reset):
                reset_states(self.model, batches_to_reset)

            t0 = time.time()
            deltas, loss = self.train_on_batch_and_get_deltas(
                batch_xs, batch_ys, verbose)
            t1 = time.time()
            if not is_warmup_period:
                self.set_new_weights(deltas, num_replicas)
                t2 = time.time()
                write_str_0 = self.calculate_speed(t0, t1, t2, num_replicas)
                curr_loss = self.mpi_average_scalars(1.0 * loss, num_replicas)
                # g.print_unique(self.model.get_weights()[0][0][:4])
                loss_averager.add_val(curr_loss)
                ave_loss = loss_averager.get_val()
                eta = self.estimate_remaining_time(
                    t0 - t_start, self.num_so_far - self.epoch * num_total,
                    num_total)
                write_str = (
                    '\r[{}] step: {} [ETA: {:.2f}s] [{:.2f}/{}], '.format(
                        self.task_index, step, eta, 1.0 * self.num_so_far,
                        num_total) +
                    'loss: {:.5f} [{:.5f}] | '.format(ave_loss, curr_loss) +
                    'walltime: {:.4f} | '.format(time.time() -
                                                 self.start_time))
                g.write_unique(write_str + write_str_0)
                step += 1
            else:
                g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
                    self.task_index, self.num_so_far))

        effective_epochs = 1.0 * self.num_so_far / num_total
        epoch_previous = self.epoch
        self.epoch = effective_epochs
        g.write_unique(
            # TODO(KGF): "a total of X epochs within this session" ?
            '\nFinished training epoch {:.2f} '.format(self.epoch)
            # TODO(KGF): "precisely/exactly X epochs just passed"?
            +
            'during this session ({:.2f} epochs passed)'.format(self.epoch -
                                                                epoch_previous)
            # '\nEpoch {:.2f} finished training ({:.2f} epochs passed)'.format(
            #     1.0 * self.epoch, self.epoch - epoch_previous)
            + ' in {:.2f} seconds\n'.format(t2 - t_start))
        return (step, ave_loss, curr_loss, self.num_so_far, effective_epochs)
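
Both versions of train_epoch share the same warmup rule: for the first warmup_steps iterations of epoch 0, the weight update behaves as if only a single replica were training. A small standalone sketch of that rule follows (the helper name and the numbers below are illustrative):

def replicas_for_step(step, epoch, warmup_steps, num_replicas):
    # During the warmup window of the very first epoch, updates are applied as
    # if only one replica were training; afterwards the full ensemble counts.
    warmup_phase = (step < warmup_steps and epoch == 0)
    return 1 if warmup_phase else num_replicas

for step in range(4):
    print(step, replicas_for_step(step, epoch=0, warmup_steps=2, num_replicas=8))
# Steps 0 and 1 report 1 replica; steps 2 and 3 report 8.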