def frnn_model_loader(custom_path):
    print(conf)
    specific_builder = builder.ModelBuilder(conf)
    model = specific_builder.build_model_PCS(False)
    print('Printing out whole model summary...')
    model.summary()
    print('FRNN Model built....')
    specific_builder.load_model_weights(model, custom_path)
    print('FRNN Model loaded....')
    model.summary()
    return model
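# Hedged usage sketch: frnn_model_loader assumes a module-level `conf` dict and
# that plasma.models.builder is already imported as `builder`. The import line
# and the weights path below are illustrative assumptions, not taken from the
# project documentation.
#
# from plasma.conf import conf                         # assumed source of the global config
# model = frnn_model_loader('/path/to/model_weights')  # hypothetical checkpoint location
# y_hat = model.predict(X)                             # X: preprocessed shot signals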
Example #2
def mpi_train(conf,
              shot_list_train,
              shot_list_validate,
              loader,
              callbacks_list=None):

    loader.set_inference_mode(False)
    conf['num_workers'] = comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    train_model = specific_builder.build_model(False)

    #load the latest epoch we did. Returns -1 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    print('{} epochs left to go'.format(num_epochs - 1 - e))

    # batch_generator = partial(loader.training_batch_generator,shot_list=shot_list_train)
    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)
    # batch_generator = partial(loader.training_batch_generator_process, shot_list=shot_list_train)

    print("warmup {}".format(warmup_steps))
    mpi_model = MPIModel(train_model,
                         optimizer,
                         comm,
                         batch_generator,
                         batch_size,
                         lr=lr,
                         warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)

    tensorboard = None
    if backend != "theano" and task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        mpi_model.model.summary()

    if task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()
    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs - 1:
        if task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)
        print_unique('\nEpoch {}/{}'.format(e, num_epochs))

        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs

        loader.verbose = False  #True during the first iteration
        if task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        epoch_logs = {}

        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)
        if conf['training']['ranking_difficulty_fac'] != 1.0:
            _, _, _, roc_area_train, loss_train = mpi_make_predictions_and_evaluate(
                conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()
        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)

        stop_training = False
        if task_index == 0:
            print('=========Summary======== for epoch {}'.format(step))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))

            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                print("Not saving model weights")
                specific_builder.delete_model_weights(train_model,
                                                      int(round(e)))

            #tensorboard
            if backend != 'theano':
                val_generator = partial(loader.training_batch_generator,
                                        shot_list=shot_list_validate)()
                val_steps = 1
                tensorboard.on_epoch_end(val_generator, val_steps,
                                         int(round(e)), epoch_logs)

        stop_training = comm.bcast(stop_training, root=0)
        if stop_training:
            print("Stopping training due to early stopping")
            break

    if task_index == 0:
        callbacks.on_train_end()
        if tensorboard is not None:  # tensorboard is never created under theano
            tensorboard.on_train_end()

    mpi_model.close()
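# Minimal runnable sketch (not from the FRNN code) of the bookkeeping used in
# the epoch loop above: exponential learning-rate decay lr * lr_decay**epoch,
# and best-so-far tracking with cmp_fn so weights are only kept when the
# monitored quantity improves. The ROC values below are made up.
import numpy as np

lr, lr_decay = 2e-5, 0.97
mode = 'max'                                   # mirrors conf['callbacks']['mode']
best_so_far = -np.inf if mode == 'max' else np.inf
cmp_fn = max if mode == 'max' else min

for epoch, val_roc in enumerate([0.71, 0.78, 0.76]):
    effective_lr = lr * lr_decay ** epoch      # same decay schedule as mpi_model.set_lr(...)
    best_so_far = cmp_fn(val_roc, best_so_far)
    keep_weights = (best_so_far == val_roc)    # analogue of the delete_model_weights check
    print(epoch, round(effective_lr, 8), best_so_far, keep_weights)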
Example #3
def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
    loader.set_inference_mode(True)
    np.random.seed(task_index)
    shot_list.sort()  #make sure all replicas have the same list
    specific_builder = builder.ModelBuilder(conf)

    y_prime = []
    y_gold = []
    disruptive = []

    model = specific_builder.build_model(True)
    specific_builder.load_model_weights(model, custom_path)

    #broadcast model weights then set it explicitly: fix for Py3.6
    if sys.version_info[0] > 2:
        if task_index == 0:
            new_weights = model.get_weights()
        else:
            new_weights = None
        nw = comm.bcast(new_weights, root=0)
        model.set_weights(nw)

    model.reset_states()
    if task_index == 0:
        pbar = Progbar(len(shot_list))
    shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
                                       do_shuffle=False,
                                       equal_size=True)

    y_prime_global = []
    y_gold_global = []
    disruptive_global = []
    if task_index != 0:
        loader.verbose = False

    for (i, shot_sublist) in enumerate(shot_sublists):

        if i % num_workers == task_index:
            X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)

            #load data and fit on data
            y_p = model.predict(X, batch_size=conf['model']['pred_batch_size'])
            model.reset_states()
            y_p = loader.batch_output_to_array(y_p)
            y = loader.batch_output_to_array(y)

            #cut arrays back
            y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
            y = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y)]

            # print('Shots {}/{}'.format(i*num_at_once + j*1.0*len(shot_sublist)/len(X_list),len(shot_list_train)))
            y_prime += y_p
            y_gold += y
            disruptive += disr
            # print_all('\nFinished with i = {}'.format(i))

        if i % num_workers == num_workers - 1 or i == len(shot_sublists) - 1:
            comm.Barrier()
            y_prime_global += concatenate_sublists(comm.allgather(y_prime))
            y_gold_global += concatenate_sublists(comm.allgather(y_gold))
            disruptive_global += concatenate_sublists(
                comm.allgather(disruptive))
            comm.Barrier()
            y_prime = []
            y_gold = []
            disruptive = []
            # print_all('\nFinished subepoch with lists len(y_prime_global), gold, disruptive = {},{},{}'.format(len(y_prime_global),len(y_gold_global),len(disruptive_global)))

        if task_index == 0:
            pbar.add(1.0 * len(shot_sublist))

    y_prime_global = y_prime_global[:len(shot_list)]
    y_gold_global = y_gold_global[:len(shot_list)]
    disruptive_global = disruptive_global[:len(shot_list)]
    loader.set_inference_mode(False)

    return y_prime_global, y_gold_global, disruptive_global
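# Runnable stand-in (no MPI needed) for the work distribution above: replica
# `task_index` predicts on sublists where i % num_workers == task_index, and
# the merge step keeps the original sublist order, which is what the paired
# comm.allgather calls achieve in the real code.
num_workers = 3
sublists = [['s0', 's1'], ['s2', 's3'], ['s4', 's5'], ['s6']]

per_worker = {t: [i for i in range(len(sublists)) if i % num_workers == t]
              for t in range(num_workers)}
print(per_worker)                      # every sublist index appears exactly once

order = sorted(i for idxs in per_worker.values() for i in idxs)
merged = [shot for i in order for shot in sublists[i]]
print(merged)                          # shots recovered in their original order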
Example #4
def train(conf, shot_list_train, shot_list_validate, loader):
    loader.set_inference_mode(False)
    np.random.seed(1)

    validation_losses = []
    validation_roc = []
    training_losses = []
    print('validate: {} shots, {} disruptive'.format(
        len(shot_list_validate), shot_list_validate.num_disruptive()))
    print('training: {} shots, {} disruptive'.format(
        len(shot_list_train), shot_list_train.num_disruptive()))

    if backend == 'tf' or backend == 'tensorflow':
        first_time = "tensorflow" not in sys.modules
        if first_time:
            import tensorflow as tf
            os.environ['KERAS_BACKEND'] = 'tensorflow'
            from keras.backend.tensorflow_backend import set_session
            config = tf.ConfigProto(device_count={"GPU": 1})
            set_session(tf.Session(config=config))
    else:
        os.environ['KERAS_BACKEND'] = 'theano'
        os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
        import theano

    from keras.utils.generic_utils import Progbar
    from keras import backend as K
    from plasma.models import builder

    print('Build model...', end='')
    specific_builder = builder.ModelBuilder(conf)
    train_model = specific_builder.build_model(False)
    print('Compile model', end='')
    train_model.compile(optimizer=optimizer_class(),
                        loss=conf['data']['target'].loss)
    print('...done')

    #load the latest epoch we did. Returns -1 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_start = e
    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)
    batch_iterator = ProcessGenerator(batch_generator())

    num_epochs = conf['training']['num_epochs']
    num_at_once = conf['training']['num_shots_at_once']
    lr = conf['model']['lr']  # needed below for the per-epoch learning rate decay
    lr_decay = conf['model']['lr_decay']
    print('{} epochs left to go'.format(num_epochs - 1 - e))
    num_so_far_accum = 0
    num_so_far = 0
    num_total = np.inf

    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs - 1:
        e += 1
        print('\nEpoch {}/{}'.format(e + 1, num_epochs))
        pbar = Progbar(len(shot_list_train))

        #decay learning rate each epoch:
        K.set_value(train_model.optimizer.lr, lr * lr_decay**(e))

        #print('Learning rate: {}'.format(train_model.optimizer.lr.get_value()))
        num_batches_minimum = 100
        num_batches_current = 0
        training_losses_tmp = []

        while num_so_far < (
                e - e_start
        ) * num_total or num_batches_current < num_batches_minimum:
            num_so_far_old = num_so_far
            try:
                batch_xs, batch_ys, batches_to_reset, num_so_far_curr, num_total, is_warmup_period = next(
                    batch_iterator)
            except StopIteration:
                print("Resetting batch iterator.")
                num_so_far_accum = num_so_far
                batch_iterator = ProcessGenerator(batch_generator())
                batch_xs, batch_ys, batches_to_reset, num_so_far_curr, num_total, is_warmup_period = next(
                    batch_iterator)
            if np.any(batches_to_reset):
                reset_states(train_model, batches_to_reset)
            if not is_warmup_period:
                num_so_far = num_so_far_accum + num_so_far_curr

                num_batches_current += 1

                loss = train_model.train_on_batch(batch_xs, batch_ys)
                training_losses_tmp.append(loss)
                pbar.add(num_so_far - num_so_far_old,
                         values=[("train loss", loss)])
                loader.verbose = False  #True during the first iteration
            else:
                _ = train_model.predict(
                    batch_xs, batch_size=conf['training']['batch_size'])

        e = e_start + 1.0 * num_so_far / num_total
        sys.stdout.flush()
        ave_loss = np.mean(training_losses_tmp)
        training_losses.append(ave_loss)
        specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['validation_frac'] > 0.0:
            print("prediction on GPU...")
            _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
                conf, shot_list_validate, loader)
            validation_losses.append(loss)
            validation_roc.append(roc_area)

            epoch_logs = {}
            epoch_logs['val_roc'] = roc_area
            epoch_logs['val_loss'] = loss
            epoch_logs['train_loss'] = ave_loss
            best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                                 best_so_far)
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                print("Not saving model weights")
                specific_builder.delete_model_weights(train_model,
                                                      int(round(e)))

            if conf['training']['ranking_difficulty_fac'] != 1.0:
                _, _, _, roc_area_train, loss_train = make_predictions_and_evaluate_gpu(
                    conf, shot_list_train, loader)
                batch_iterator.__exit__()
                batch_generator = partial(
                    loader.training_batch_generator_partial_reset,
                    shot_list=shot_list_train)
                batch_iterator = ProcessGenerator(batch_generator())
                num_so_far_accum = num_so_far

        print('=========Summary========')
        print('Training Loss Numpy: {:.3e}'.format(training_losses[-1]))
        if conf['training']['validation_frac'] > 0.0:
            print('Validation Loss: {:.3e}'.format(validation_losses[-1]))
            print('Validation ROC: {:.4f}'.format(validation_roc[-1]))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Train Loss: {:.3e}'.format(loss_train))
                print('Train ROC: {:.4f}'.format(roc_area_train))

    # plot_losses(conf,[training_losses],specific_builder,name='training')
    if conf['training']['validation_frac'] > 0.0:
        plot_losses(conf, [training_losses, validation_losses, validation_roc],
                    specific_builder,
                    name='training_validation_roc')
    batch_iterator.__exit__()
    print('...done')
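# Toy illustration (assumed names, not project code) of the StopIteration
# handling in the inner loop above: when the batch generator runs out
# mid-epoch, the progress counter is frozen in num_so_far_accum and a fresh
# generator is created so training continues without losing the sample count.
def toy_batch_generator():
    for num_so_far_curr in (1, 2, 3):   # pretend one pass yields 3 batches
        yield num_so_far_curr

batch_iterator = toy_batch_generator()
num_so_far_accum = 0
num_so_far = 0
for _ in range(7):                      # ask for more batches than one pass provides
    try:
        num_so_far_curr = next(batch_iterator)
    except StopIteration:
        print("Resetting batch iterator.")
        num_so_far_accum = num_so_far   # remember progress before restarting
        batch_iterator = toy_batch_generator()
        num_so_far_curr = next(batch_iterator)
    num_so_far = num_so_far_accum + num_so_far_curr
    print(num_so_far)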
Example #5
    def keras_fmin_fnct(self, space):
        from plasma.models import builder

        specific_builder = builder.ModelBuilder(self.conf)

        train_model = specific_builder.hyper_build_model(space, False)
        train_model.compile(optimizer=optimizer_class(),
                            loss=conf['data']['target'].loss)

        np.random.seed(1)
        validation_losses = []
        validation_roc = []
        training_losses = []
        shot_list_train, shot_list_validate = self.shot_list.split_direct(
            1.0 - conf['training']['validation_frac'], do_shuffle=True)

        from keras.utils.generic_utils import Progbar
        from keras import backend as K

        num_epochs = self.conf['training']['num_epochs']
        num_at_once = self.conf['training']['num_shots_at_once']
        lr = self.conf['model']['lr']  # needed for the per-epoch learning rate decay
        lr_decay = self.conf['model']['lr_decay']

        resulting_dict = {'loss': None, 'status': STATUS_OK, 'model': None}

        e = -1
        #print("Current num_epochs {}".format(e))
        while e < num_epochs - 1:
            e += 1
            pbar = Progbar(len(shot_list_train))

            shot_list_train.shuffle()
            shot_sublists = shot_list_train.sublists(num_at_once)[:1]
            training_losses_tmp = []

            K.set_value(train_model.optimizer.lr, lr * lr_decay**(e))
            for (i, shot_sublist) in enumerate(shot_sublists):
                X_list, y_list = self.loader.load_as_X_y_list(shot_sublist)
                for j, (X, y) in enumerate(zip(X_list, y_list)):
                    history = builder.LossHistory()
                    train_model.fit(X,
                                    y,
                                    batch_size=Loader.get_batch_size(
                                        self.conf['training']['batch_size'],
                                        prediction_mode=False),
                                    epochs=1,
                                    shuffle=False,
                                    verbose=0,
                                    validation_split=0.0,
                                    callbacks=[history])
                    train_model.reset_states()
                    train_loss = np.mean(history.losses)
                    training_losses_tmp.append(train_loss)

                    pbar.add(1.0 * len(shot_sublist) / len(X_list),
                             values=[("train loss", train_loss)])
                    self.loader.verbose = False
            sys.stdout.flush()
            training_losses.append(np.mean(training_losses_tmp))
            specific_builder.save_model_weights(train_model, e)

            _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
                self.conf, shot_list_validate, self.loader)
            print("Epoch: {}, loss: {}, validation_losses_size: {}".format(
                e, loss, len(validation_losses)))
            validation_losses.append(loss)
            validation_roc.append(roc_area)
            resulting_dict['loss'] = loss
            resulting_dict['model'] = train_model
            #print("Results {}, before {}".format(resulting_dict,id(resulting_dict)))

        #print("Results {}, after {}".format(resulting_dict,id(resulting_dict)))
        return resulting_dict
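# Hedged sketch of how an objective shaped like keras_fmin_fnct is usually
# driven by hyperopt: fmin minimises the 'loss' entry of the returned dict and
# STATUS_OK marks a valid trial. The search space here is illustrative and
# unrelated to the real FRNN hyperparameter space.
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

def objective(space):
    # stand-in for keras_fmin_fnct(space): pretend validation loss depends on lr only
    fake_val_loss = (space['lr'] - 1e-3) ** 2
    return {'loss': fake_val_loss, 'status': STATUS_OK, 'model': None}

trials = Trials()
best = fmin(fn=objective,
            space={'lr': hp.loguniform('lr', -10, -4)},
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)
print(best)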
Example #6
def mpi_train(conf,
              shot_list_train,
              shot_list_validate,
              loader,
              callbacks_list=None,
              shot_list_test=None):
    loader.set_inference_mode(False)

    # TODO(KGF): this is not defined in conf.yaml, but added to processed dict
    # for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py"
        # except: "Rate should be set to `rate = 1 - keep_prob`"
        # Also suppresses warnings from "keras/optimizers.py"
        # does NOT suppress warnings from "/tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF>v1.13.0 (esp v1.14.0), this next line prompts a ton of
    # deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From  .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different than C-based info diagnostics e.g.:
    # 2019-11-06 18:27:31.698908: I ...  dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top level __init__.py

    # load the latest epoch we did. Returns 0 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)

    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model,
                         optimizer,
                         g.comm,
                         batch_generator,
                         batch_size,
                         lr=lr,
                         warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum,
                         conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)
    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        fr = open('model_architecture.log', 'a')
        ori = sys.stdout
        sys.stdout = fr
        mpi_model.model.summary()
        sys.stdout = ori
        fr.close()
        mpi_model.model.summary()

    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()
    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/ MPI barrier?
        # g.flush_all_inorder()

        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        if ('monitor_test' in conf['callbacks'].keys()
                and conf['callbacks']['monitor_test']):
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join([
                'val_roc_{} = {}'.format(t, roc)
                for t, roc in zip(times, areas)
            ]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join([
                    'test_roc_{} = {}'.format(t, roc)
                    for t, roc in zip(times, areas)
                ]) + '\n')

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                if ('monitor_test' in conf['callbacks'].keys()
                        and conf['callbacks']['monitor_test']):
                    print("No improvement, saving model weights anyways")
                else:
                    print("Not saving model weights")
                    specific_builder.delete_model_weights(
                        train_model, int(round(e)))

            # tensorboard
            val_generator = partial(loader.training_batch_generator,
                                    shot_list=shot_list_validate)()
            val_steps = 1
            tensorboard.on_epoch_end(val_generator, val_steps, int(round(e)),
                                     epoch_logs)
        stop_training = g.comm.bcast(stop_training, root=0)
        g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
            e, num_epochs))
        # TODO(KGF): compare to old diagnostic:
        # g.write_unique("end epoch {}".format(e_old))
        if stop_training:
            g.write_unique("Stopping training due to early stopping")
            break

    if g.task_index == 0:
        callbacks.on_train_end()
        tensorboard.on_train_end()

    mpi_model.close()
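# Side note (standalone sketch, not project code): the model_architecture.log
# block above swaps sys.stdout by hand; contextlib.redirect_stdout does the
# same and restores stdout even if summary() raises. A dummy summary() stands
# in for mpi_model.model.summary().
import contextlib

def summary():
    print("layer (type)        output shape        param #")   # stand-in for Keras output

with open('model_architecture.log', 'a') as fr, contextlib.redirect_stdout(fr):
    summary()        # goes to the log file
summary()            # goes to the console, as in the original code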
Example #7
def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
    loader.set_inference_mode(True)
    np.random.seed(g.task_index)
    shot_list.sort()  # make sure all replicas have the same list
    specific_builder = builder.ModelBuilder(conf)

    y_prime = []
    y_gold = []
    disruptive = []

    model = specific_builder.build_model(True)
    specific_builder.load_model_weights(model, custom_path)

    # broadcast model weights then set it explicitly: fix for Py3.6
    # TODO(KGF): remove if we no longer support Py2
    if sys.version_info[0] > 2:
        if g.task_index == 0:
            new_weights = model.get_weights()
        else:
            new_weights = None
        nw = g.comm.bcast(new_weights, root=0)
        model.set_weights(nw)

    model.reset_states()
    if g.task_index == 0:
        # TODO(KGF): this appears to prepend a \n, resulting in:
        # [2] loading from epoch 7
        #
        # 128/862 [===>..........................] - ETA: 2:20
        pbar = Progbar(len(shot_list))
    shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
                                       do_shuffle=False,
                                       equal_size=True)
    y_prime_global = []
    y_gold_global = []
    disruptive_global = []
    if g.task_index != 0:
        loader.verbose = False

    for (i, shot_sublist) in enumerate(shot_sublists):
        if i % g.num_workers == g.task_index:
            X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)

            # load data and fit on data
            y_p = model.predict(X, batch_size=conf['model']['pred_batch_size'])
            model.reset_states()
            y_p = loader.batch_output_to_array(y_p)
            y = loader.batch_output_to_array(y)

            # cut arrays back
            y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
            y = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y)]

            y_prime += y_p
            y_gold += y
            disruptive += disr
            # print_all('\nFinished with i = {}'.format(i))

        if (i % g.num_workers == g.num_workers - 1
                or i == len(shot_sublists) - 1):
            g.comm.Barrier()
            y_prime_global += concatenate_sublists(g.comm.allgather(y_prime))
            y_gold_global += concatenate_sublists(g.comm.allgather(y_gold))
            disruptive_global += concatenate_sublists(
                g.comm.allgather(disruptive))
            g.comm.Barrier()
            y_prime = []
            y_gold = []
            disruptive = []

        if g.task_index == 0:
            pbar.add(1.0 * len(shot_sublist))

    y_prime_global = y_prime_global[:len(shot_list)]
    y_gold_global = y_gold_global[:len(shot_list)]
    disruptive_global = disruptive_global[:len(shot_list)]
    loader.set_inference_mode(False)

    return y_prime_global, y_gold_global, disruptive_global
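# Hedged mpi4py sketch of the weight-broadcast fix above: rank 0 holds the
# loaded weights and every replica overwrites its own copy with the broadcast
# result, so all ranks predict with identical parameters. Run with something
# like `mpirun -n 4 python sketch.py`; the "weights" here are just a list.
from mpi4py import MPI

comm = MPI.COMM_WORLD
task_index = comm.Get_rank()

new_weights = [0.1, 0.2, 0.3] if task_index == 0 else None  # only root has real values
nw = comm.bcast(new_weights, root=0)                        # everyone receives root's copy
print('rank {} received {}'.format(task_index, nw))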