Example #1
    def __init__(self, conf, tasksconf, dataconf, modelconf, evaluatorconf,
                 expdir, init_filename, server, task_index):
        '''
        NnetTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            tasksconf: the config file for each task
            dataconf: the data configuration as a ConfigParser
            modelconf: the neural net model configuration
            evaluatorconf: the evaluator configuration used for evaluation;
                if None no evaluation will be done
            expdir: directory where the summaries will be written
            init_filename: filename of the network that should be used to
                initialize the model. Set to None if no network is
                available/wanted.
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        self.expdir = expdir
        self.server = server
        self.conf = conf
        self.tasksconf = tasksconf
        self.task_index = task_index
        self.init_filename = init_filename

        self.batch_size = int(conf['batch_size'])

        cluster = tf.train.ClusterSpec(server.server_def.cluster)

        #create the graph
        self.graph = tf.Graph()

        #3 model types for multi-task training: single one-to-one; single one-to-many; multiple one-to-one
        #single one-to-one: the whole model is shared across all tasks, only the loss function can differ
        #single one-to-many: each task has a separate output, so only part of the network is shared, e.g. everything but the output layer
        #multiple one-to-one: each task has its own network; the outputs can possibly be combined in a loss function
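        #purely illustrative example: a single one-to-many setup could share one
        #encoder between two tasks and give each its own output layer, while a
        #multiple one-to-one setup would give every task a fully separate network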

        #create the model
        modelfile = os.path.join(expdir, 'model', 'model.pkl')
        with open(modelfile, 'wb') as fid:
            self.model = model_factory.factory(
                modelconf.get('model', 'architecture'))(conf=modelconf)
            pickle.dump(self.model, fid)

        evaltype = evaluatorconf.get('evaluator', 'evaluator')

        #get the database configurations
        input_dataconfs = dict()
        target_dataconfs = dict()
        loss_computers = dict()
        nr_input_sections = dict()
        if evaltype != 'None':
            evaluators = dict()

        for task in self.conf['tasks'].split(' '):
            taskconf = self.tasksconf[task]

            #get the database configurations
            input_names = modelconf.get('io', 'inputs').split(' ')
            if input_names == ['']:
                input_names = []
            input_sections = [taskconf[i].split(' ') for i in input_names]
            nr_input_sections[task] = len(input_sections)
            task_input_dataconfs = []
            for sectionset in input_sections:
                task_input_dataconfs.append([])
                for section in sectionset:
                    task_input_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            input_dataconfs[task] = task_input_dataconfs

            output_names = taskconf['targets'].split(' ')
            if output_names == ['']:
                output_names = []
            target_sections = [taskconf[o].split(' ') for o in output_names]
            task_target_dataconfs = []
            for sectionset in target_sections:
                task_target_dataconfs.append([])
                for section in sectionset:
                    task_target_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            target_dataconfs[task] = task_target_dataconfs

            #create the loss computer
            loss_computer = loss_computer_factory.factory(
                taskconf['loss_type'])(self.batch_size)

            loss_computers[task] = loss_computer

            if evaltype != 'None':
                evaluator = evaluator_factory.factory(evaltype)(
                    conf=evaluatorconf,
                    dataconf=dataconf,
                    model=self.model,
                    task=task)

                evaluators[task] = evaluator

        if 'local' in cluster.as_dict():
            num_replicas = 1
            device = tf.DeviceSpec(job='local')
        else:
            #distributed training
            num_replicas = len(cluster.as_dict()['worker'])
            num_servers = len(cluster.as_dict()['ps'])
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                num_tasks=num_servers,
                load_fn=tf.contrib.training.byte_size_load_fn)
            device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                    ps_strategy=ps_strategy)
            chief_ps = tf.DeviceSpec(job='ps', task=0)
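            #note: replica_device_setter places the variables on the parameter
            #servers (balanced by variable byte size through the greedy strategy
            #above) while the remaining ops stay on the worker building the graph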

        self.is_chief = task_index == 0

        #define the variables and operations in the graph
        with self.graph.as_default():

            #create a local num_steps variable
            self.num_steps = tf.get_variable(
                name='num_steps',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            #a variable to hold the number of steps already taken
            self.global_step = tf.get_variable(
                name='global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            should_terminate = tf.get_variable(
                name='should_terminate',
                shape=[],
                dtype=tf.bool,
                initializer=tf.constant_initializer(False),
                trainable=False)

            self.terminate = should_terminate.assign(True).op

            #create a check whether training should stop
            self.should_stop = tf.logical_or(
                tf.greater_equal(self.global_step, self.num_steps),
                should_terminate)

            with tf.device(device):
                data_queues = dict()
                num_steps = []
                done_ops = []
                for task in self.conf['tasks'].split(' '):

                    #check if running in distributed mode
                    if 'local' in cluster.as_dict():

                        #get the filenames
                        data_queue_elements, _ = input_pipeline.get_filenames(
                            input_dataconfs[task] + target_dataconfs[task])

                        #create the data queue and queue runners (the inputs were already shuffled beforehand, so shuffle is set to False)
                        data_queue = tf.train.string_input_producer(
                            string_tensor=data_queue_elements,
                            shuffle=False,
                            seed=None,
                            capacity=self.batch_size * 2,
                            shared_name='data_queue_' + task)

                        data_queues[task] = data_queue

                        #compute the number of steps
                        if int(conf['numbatches_to_aggregate']) == 0:
                            task_num_steps = (int(conf['num_epochs']) *
                                              len(data_queue_elements) /
                                              self.batch_size)
                        else:
                            task_num_steps = (
                                int(conf['num_epochs']) *
                                len(data_queue_elements) /
                                (self.batch_size *
                                 int(conf['numbatches_to_aggregate'])))
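                        #illustrative example with hypothetical numbers: with
                        #num_epochs=10, 3200 filenames and batch_size=32, this
                        #gives 10*3200/32 = 1000 steps, further divided by
                        #numbatches_to_aggregate when gradients are aggregated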

                        #set the number of steps
                        num_steps.append(task_num_steps)
                        done_ops.append(tf.no_op())

                    else:
                        with tf.device(chief_ps):

                            #get the data queue
                            data_queue = tf.FIFOQueue(
                                capacity=self.batch_size * (num_replicas + 1),
                                shared_name='data_queue_' + task,
                                name='data_queue_' + task,
                                dtypes=[tf.string],
                                shapes=[[]])

                            data_queues[task] = data_queue

                            #get the number of steps from the parameter server
                            num_steps_queue = tf.FIFOQueue(
                                capacity=num_replicas,
                                dtypes=[tf.int32],
                                shared_name='num_steps_queue',
                                name='num_steps_queue',
                                shapes=[[]])

                            #set the number of steps
                            task_num_steps = num_steps_queue.dequeue()

                        #get the done queues
                        for i in range(num_servers):
                            with tf.device('job:ps/task:%d' % i):
                                done_queue = tf.FIFOQueue(
                                    capacity=num_replicas,
                                    dtypes=[tf.bool],
                                    shapes=[[]],
                                    shared_name='done_queue%d' % i,
                                    name='done_queue%d' % i)

                                done_ops.append(done_queue.enqueue(True))

                self.set_num_steps = self.num_steps.assign(min(num_steps)).op
                self.done = tf.group(*done_ops)

                #training part
                with tf.variable_scope('train'):

                    #a variable to scale the learning rate (used to reduce the
                    #learning rate in case validation performance drops)
                    learning_rate_fact = tf.get_variable(
                        name='learning_rate_fact',
                        shape=[],
                        initializer=tf.constant_initializer(1.0),
                        trainable=False)

                    #compute the learning rate with exponential decay and scale
                    #with the learning rate factor
                    self.learning_rate = (tf.train.exponential_decay(
                        learning_rate=float(conf['initial_learning_rate']),
                        global_step=self.global_step,
                        decay_steps=self.num_steps,
                        decay_rate=float(conf['learning_rate_decay'])) *
                                          learning_rate_fact)
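                    #the resulting schedule is effectively
                    #initial_learning_rate *
                    #learning_rate_decay**(global_step/num_steps) *
                    #learning_rate_fact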

                    #create the optimizer
                    optimizer = tf.train.AdamOptimizer(self.learning_rate)

                    self.total_loss = tf.get_variable(
                        name='total_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0),
                        trainable=False)

                    self.reset_loss = self.total_loss.assign(0.0)

                    loss = []

                    for task in self.conf['tasks'].split(' '):

                        with tf.variable_scope(task):

                            #create the input pipeline
                            data, seq_length = input_pipeline.input_pipeline(
                                data_queue=data_queues[task],
                                batch_size=self.batch_size,
                                numbuckets=int(conf['numbuckets']),
                                dataconfs=input_dataconfs[task] +
                                target_dataconfs[task])

                            inputs = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    data[:nr_input_sections[task]])
                            }
                            seq_length = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    seq_length[:nr_input_sections[task]])
                            }
                            targets = {
                                output_names[i]: d
                                for i, d in enumerate(
                                    data[nr_input_sections[task]:])
                            }
                            #target_seq_length = {
                            #output_names[i]: d
                            #for i, d in enumerate(seq_length[nr_input_sections[task]:])}
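                            #illustrative example with hypothetical names: with
                            #input_names == ['features'] and output_names == ['targets'],
                            #inputs becomes {'features': data[0]} and targets
                            #becomes {'targets': data[1]}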

                            #compute the training outputs of the model
                            logits = self.model(inputs=inputs,
                                                input_seq_length=seq_length,
                                                is_training=True)

                            #TODO: The proper way to exploit data parallelism is via the
                            #SyncReplicasOptimizer defined below. However, for some reason it hangs
                            #and I have not yet found a solution for it. For the moment the gradients
                            #are accumulated in a way that does not allow data parallelism, so there
                            #is no advantage in having multiple workers. (We also accumulate the loss)

                            #create an optimizer that aggregates gradients
                            #if int(conf['numbatches_to_aggregate']) > 0:
                            #optimizer = tf.train.SyncReplicasOptimizer(
                            #opt=optimizer,
                            #replicas_to_aggregate=int(
                            #conf['numbatches_to_aggregate'])#,
                            ##total_num_replicas=num_replicas
                            #)

                            #compute the loss
                            task_loss = loss_computers[task](targets, logits,
                                                             seq_length)

                            #append the task loss to the global loss
                            loss.append(task_loss)

                    #accumulate losses from tasks
                    with tf.variable_scope('accumulate_loss_from_tasks'):
                        loss = tf.reduce_mean(loss)

                    #accumulate losses from batches
                    self.acc_loss = self.total_loss.assign_add(loss)

                    ##compute the gradients
                    #grads_and_vars = optimizer.compute_gradients(self.loss)

                    #with tf.variable_scope('clip'):
                    #clip_value = float(conf['clip_grad_value'])
                    ##clip the gradients
                    #grads_and_vars = [(tf.clip_by_value(grad, -clip_value, clip_value), var)
                    #for grad, var in grads_and_vars]

                    self.params = tf.trainable_variables()

                    grads = [
                        tf.get_variable(param.op.name,
                                        param.get_shape().as_list(),
                                        initializer=tf.constant_initializer(0),
                                        trainable=False)
                        for param in self.params
                    ]

                    self.reset_grad = tf.variables_initializer(grads)

                    #compute the gradients
                    minibatch_grads_and_vars = optimizer.compute_gradients(
                        loss)

                    with tf.variable_scope('clip'):
                        clip_value = float(conf['clip_grad_value'])
                        #clip the gradients
                        minibatch_grads_and_vars = [
                            (tf.clip_by_value(grad, -clip_value,
                                              clip_value), var)
                            for grad, var in minibatch_grads_and_vars
                        ]

                    (minibatchgrads,
                     minibatchvars) = zip(*minibatch_grads_and_vars)

                    #update gradients by accumulating them
                    self.update_gradients = [
                        grad.assign_add(batchgrad)
                        for batchgrad, grad in zip(minibatchgrads, grads)
                    ]

                    #operation to apply the gradients
                    grads_and_vars = list(zip(grads, minibatchvars))
                    apply_gradients_op = optimizer.apply_gradients(
                        grads_and_vars=grads_and_vars,
                        global_step=self.global_step,
                        name='apply_gradients')

                    #all remaining operations in the UPDATE_OPS collection
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    #create an operation to apply the accumulated gradients
                    #and run all other update ops
                    self.update_op = tf.group(*([apply_gradients_op] +
                                                update_ops),
                                              name='update')
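                    #presumed usage per training step: run reset_grad and
                    #reset_loss, run update_gradients and acc_loss once per
                    #minibatch to accumulate, then run update_op to apply the
                    #accumulated gradients and the other update ops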

                if evaltype != 'None':

                    #validation part
                    with tf.variable_scope('validate'):

                        #create a variable to hold the validation loss
                        self.validation_loss = tf.get_variable(
                            name='validation_loss',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #create a variable to save the last step where the model
                        #was validated
                        validated_step = tf.get_variable(
                            name='validated_step',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(
                                -int(conf['valid_frequency'])),
                            trainable=False)

                        #a check if validation is due
                        self.should_validate = tf.greater_equal(
                            self.global_step - validated_step,
                            int(conf['valid_frequency']))

                        val_batch_loss = []
                        valbatches = []

                        for task in self.conf['tasks'].split(' '):

                            with tf.variable_scope(task):

                                task_val_batch_loss, task_valbatches, _, _ = evaluators[
                                    task].evaluate()
                                val_batch_loss.append(task_val_batch_loss)
                                valbatches.append(task_valbatches)

                        val_batch_loss = tf.reduce_mean(val_batch_loss)
                        self.valbatches = min(valbatches)

                        self.update_loss = self.validation_loss.assign(
                            self.validation_loss +
                            val_batch_loss  #/self.valbatches
                        ).op

                        #update the learning rate factor
                        self.half_lr = learning_rate_fact.assign(
                            learning_rate_fact / 2).op

                        #create an operation to update the validated step
                        self.update_validated_step = validated_step.assign(
                            self.global_step).op

                        #variable to hold the best validation loss so far
                        self.best_validation = tf.get_variable(
                            name='best_validation',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(1.79e+308),
                            trainable=False)

                        #op to update the best validation loss
                        self.update_best = self.best_validation.assign(
                            self.validation_loss).op

                        #a variable that holds the number of workers waiting at
                        #the validation point
                        waiting_workers = tf.get_variable(
                            name='waiting_workers',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #an operation to signal a waiting worker
                        self.waiting = waiting_workers.assign_add(1).op

                        #an operation to set the waiting workers to zero
                        self.reset_waiting = waiting_workers.initializer

                        #an operation to check if all workers are waiting
                        self.all_waiting = tf.equal(waiting_workers,
                                                    num_replicas - 1)

                        tf.summary.scalar('validation loss',
                                          self.validation_loss)

                else:
                    self.update_loss = None

                tf.summary.scalar('learning rate', self.learning_rate)

                #create a histogram for all trainable parameters
                for param in tf.trainable_variables():
                    tf.summary.histogram(param.name, param)

                #create the scaffold
                self.scaffold = tf.train.Scaffold()
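The constructor above reads its settings from the conf dictionary (the trainer config). The snippet below is a minimal sketch of what such a config could look like, listing only the keys that are actually looked up in the code; the values and task names are purely illustrative, and they are strings because the code converts them with int() and float().

#sketch of the trainer config read by the constructor above (key names taken
#from the lookups in the code, values and task names purely illustrative)
example_trainer_conf = {
    'batch_size': '32',
    'tasks': 'task1 task2',             #space-separated task names
    'num_epochs': '10',
    'numbatches_to_aggregate': '0',     #0 disables gradient aggregation
    'numbuckets': '1',
    'initial_learning_rate': '0.001',
    'learning_rate_decay': '0.98',
    'clip_grad_value': '5.0',
    'valid_frequency': '1000',          #steps between validations
}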
Example #2
    def __init__(self, conf, tasksconf, dataconf, modelconf, evaluatorconf,
                 expdir, init_filename, server, task_index):
        '''
        MultiTaskTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            tasksconf: the config file for each task
            dataconf: the data configuration as a ConfigParser
            modelconf: the neural net model configuration
            evaluatorconf: the evaluator configuration used for evaluation;
                if None no evaluation will be done
            expdir: directory where the summaries will be written
            init_filename: filename of the network that should be used to
                initialize the model. Set to None if no network is
                available/wanted.
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        self.expdir = expdir
        self.server = server
        self.conf = conf
        self.tasksconf = tasksconf
        self.task_index = task_index
        self.init_filename = init_filename

        self.batch_size = int(conf['batch_size'])

        cluster = tf.train.ClusterSpec(server.server_def.cluster)

        #create the graph
        self.graph = tf.Graph()

        #create the model
        modelfile = os.path.join(expdir, 'model', 'model.pkl')
        model_names = modelconf.get('hyper', 'model_names').split(' ')
        self.models = dict()
        with open(modelfile, 'wb') as fid:
            for model_name in model_names:
                self.models[model_name] = model_factory.factory(
                    modelconf.get(model_name, 'architecture'))(conf=dict(
                        modelconf.items(model_name)),
                                                               name=model_name)
            pickle.dump(self.models, fid)
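        #illustrative modelconf layout (hypothetical names): a [hyper] section
        #with model_names = encoder decoder, plus one section per model name
        #containing at least an 'architecture' option and that model's settings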

        evaltype = evaluatorconf.get('evaluator', 'evaluator')

        #define a trainer per training task
        self.task_trainers = []
        for task in self.conf['tasks'].split(' '):
            taskconf = self.tasksconf[task]

            task_trainer = task_trainer_script.TaskTrainer(
                task, conf, taskconf, self.models, modelconf, dataconf,
                evaluatorconf, self.batch_size)

            self.task_trainers.append(task_trainer)

        if 'local' in cluster.as_dict():
            num_replicas = 1
            device = tf.DeviceSpec(job='local')
        else:
            #distributed training
            num_replicas = len(cluster.as_dict()['worker'])
            num_servers = len(cluster.as_dict()['ps'])
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                num_tasks=num_servers,
                load_fn=tf.contrib.training.byte_size_load_fn)
            device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                    ps_strategy=ps_strategy)
            chief_ps = tf.DeviceSpec(job='ps', task=0)

        self.is_chief = task_index == 0

        #define the variables and operations in the graph
        with self.graph.as_default():

            #create a local num_steps variable
            self.num_steps = tf.get_variable(
                name='num_steps',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            #a variable to hold the number of steps already taken
            self.global_step = tf.get_variable(
                name='global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            should_terminate = tf.get_variable(
                name='should_terminate',
                shape=[],
                dtype=tf.bool,
                initializer=tf.constant_initializer(False),
                trainable=False)

            self.terminate = should_terminate.assign(True).op

            #create a check whether training should stop
            self.should_stop = tf.logical_or(
                tf.greater_equal(self.global_step, self.num_steps),
                should_terminate)

            with tf.device(device):
                num_steps = []
                done_ops = []

                #set the dataqueues for each trainer
                for task_trainer in self.task_trainers:

                    task_num_steps, task_done_ops = task_trainer.set_dataqueues(
                        cluster)

                    num_steps.append(task_num_steps)
                    done_ops += task_done_ops

                self.set_num_steps = self.num_steps.assign(min(num_steps)).op
                self.done = tf.group(*done_ops)

                #training part
                with tf.variable_scope('train'):

                    #a variable to scale the learning rate (used to reduce the
                    #learning rate in case validation performance drops)
                    learning_rate_fact = tf.get_variable(
                        name='learning_rate_fact',
                        shape=[],
                        initializer=tf.constant_initializer(1.0),
                        trainable=False)

                    #compute the learning rate with exponential decay and scale
                    #with the learning rate factor
                    self.learning_rate = (tf.train.exponential_decay(
                        learning_rate=float(conf['initial_learning_rate']),
                        global_step=self.global_step,
                        decay_steps=self.num_steps,
                        decay_rate=float(conf['learning_rate_decay'])) *
                                          learning_rate_fact)

                    #For each task, set the task specific training ops
                    for task_trainer in self.task_trainers:

                        task_trainer.train(self.learning_rate)

                    #Group ops over tasks
                    self.process_minibatch = tf.group(
                        *([
                            task_trainer.process_minibatch
                            for task_trainer in self.task_trainers
                        ]),
                        name='process_minibatch_all_tasks')

                    self.reset_grad_loss_norm = tf.group(
                        *([
                            task_trainer.reset_grad_loss_norm
                            for task_trainer in self.task_trainers
                        ]),
                        name='reset_grad_loss_norm_all_tasks')

                    tmp = []
                    for task_trainer in self.task_trainers:
                        tmp += task_trainer.normalize_gradients
                    self.normalize_gradients = tf.group(
                        *(tmp), name='normalize_gradients_all_tasks')

                    #accumulate losses from tasks
                    with tf.variable_scope('accumulate_losses_from_tasks'):
                        self.loss_all_tasks = [
                            task_trainer.normalized_loss
                            for task_trainer in self.task_trainers
                        ]
                        self.total_loss = tf.reduce_mean(self.loss_all_tasks,
                                                         name='acc_loss')

                    tmp = []
                    for task_trainer in self.task_trainers:
                        tmp.append(task_trainer.apply_gradients)

                    #all remaining operations in the UPDATE_OPS collection
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    #an op to increment the global step
                    global_step_inc = self.global_step.assign_add(1)

                    #create an operation to update the gradients, the batch_loss
                    #and do all other update ops
                    #self.update_op = tf.group(
                    #*(tmp + update_ops + [global_step_inc]),
                    #name='update')

                    self.other_update_op = tf.group(*(update_ops +
                                                      [global_step_inc]),
                                                    name='other_update')
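                    #presumed usage per training step: run process_minibatch for
                    #a number of minibatches, then normalize_gradients, apply the
                    #per-task gradients, run other_update_op and finally
                    #reset_grad_loss_norm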

                if evaltype != 'None':

                    #validation part
                    with tf.variable_scope('validate'):

                        #create a variable to save the last step where the model
                        #was validated
                        validated_step = tf.get_variable(
                            name='validated_step',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(
                                -int(conf['valid_frequency'])),
                            trainable=False)

                        #a check if validation is due
                        self.should_validate = tf.greater_equal(
                            self.global_step - validated_step,
                            int(conf['valid_frequency']))

                        #For each task, if requested, set the task specific validation ops
                        #The number of validation batches is the minimum number of validation
                        #batches over all tasks.
                        tasks_excluded_for_val = ['None']
                        if evaluatorconf.has_option('evaluator',
                                                    'tasks_excluded_for_val'):
                            tasks_excluded_for_val = evaluatorconf.get(
                                'evaluator',
                                'tasks_excluded_for_val').split(' ')
                        self.val_task_trainers = [
                            task_trainer for task_trainer in self.task_trainers
                            if task_trainer.task_name not in
                            tasks_excluded_for_val
                        ]

                        valbatches = []
                        for task_trainer in self.val_task_trainers:
                            valbatches.append(
                                task_trainer.evaluate_evaluator())
                        self.valbatches = min(valbatches)

                        #Group ops over tasks
                        self.process_val_batch = tf.group(*([
                            task_trainer.process_val_batch
                            for task_trainer in self.val_task_trainers
                        ]))

                        self.reset_val_loss_norm = tf.group(*([
                            task_trainer.reset_val_loss_norm
                            for task_trainer in self.val_task_trainers
                        ]))

                        self.val_loss_all_tasks = []
                        for task_trainer in self.val_task_trainers:
                            self.val_loss_all_tasks.append(
                                task_trainer.val_loss_normalized)
                        self.validation_loss = tf.reduce_mean(
                            self.val_loss_all_tasks)

                        #update the learning rate factor
                        self.half_lr = learning_rate_fact.assign(
                            learning_rate_fact / 2).op

                        #create an operation to update the validated step
                        self.update_validated_step = validated_step.assign(
                            self.global_step).op

                        #variable to hold the best validation loss so far
                        self.best_validation_all_tasks = [
                            tf.get_variable(
                                name='best_validation_task_%i' % ind,
                                shape=[],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(1.79e+308),
                                trainable=False)
                            for ind in range(len(self.val_task_trainers))
                        ]

                        #op to update the best validation loss
                        self.update_best_all_tasks = [
                            best_val_task.assign(self.val_loss_all_tasks[ind])
                            for ind, best_val_task in enumerate(
                                self.best_validation_all_tasks)
                        ]

                        #a variable that holds the number of workers waiting at
                        #the validation point
                        waiting_workers = tf.get_variable(
                            name='waiting_workers',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #an operation to signal a waiting worker
                        self.waiting = waiting_workers.assign_add(1).op

                        #an operation to set the waiting workers to zero
                        self.reset_waiting = waiting_workers.initializer

                        #an operation to check if all workers are waiting
                        self.all_waiting = tf.equal(waiting_workers,
                                                    num_replicas - 1)

                        tf.summary.scalar('validation loss',
                                          self.validation_loss)

                else:
                    self.process_val_batch = None

                tf.summary.scalar('learning rate', self.learning_rate)

                #create a histogram for all trainable parameters
                for param in tf.trainable_variables():
                    tf.summary.histogram(param.name, param)

                #create the scaffold
                self.scaffold = tf.train.Scaffold()
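Example #2 delegates all task-specific graph construction to task_trainer_script.TaskTrainer, which is not shown here. The stub below is only a sketch of the interface that the constructor above assumes, reconstructed from the attributes and methods it calls; the class name and method bodies are placeholders, not the actual implementation.

#interface sketch (hypothetical name), inferred from the calls made above
class TaskTrainerInterface(object):

    def __init__(self, task_name, trainerconf, taskconf, models, modelconf,
                 dataconf, evaluatorconf, batch_size):
        #used by the multi-task trainer to filter tasks_excluded_for_val
        self.task_name = task_name

    def set_dataqueues(self, cluster):
        '''create the data queues, returns (task_num_steps, done_ops)'''
        raise NotImplementedError

    def train(self, learning_rate):
        '''build the task-specific training ops that the multi-task trainer
        groups afterwards: process_minibatch, reset_grad_loss_norm,
        normalize_gradients, normalized_loss and apply_gradients'''
        raise NotImplementedError

    def evaluate_evaluator(self):
        '''build the validation ops (process_val_batch, reset_val_loss_norm,
        val_loss_normalized), returns the number of validation batches'''
        raise NotImplementedError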