def test_multiple_device_multiple_loss_gradients_with_multiple_optimiser(self):
    """Collect gradients from several devices, two losses, two optimisers.

    Builds an identical toy network on ``num_devices`` worker scopes,
    computes an L2 and an L1 reconstruction loss per device, and checks
    that the collector averages gradients per-optimiser with matching
    variable shapes.
    """
    num_devices = 3
    collector = GradientsCollector(n_devices=num_devices)
    for device_id in range(num_devices):
        with tf.name_scope('worker_%d' % device_id) as scope:
            image = tf.ones([2, 32, 32, 32, 4], dtype=tf.float32)
            net_out = get_test_network()(image, is_training=True)
            l2_loss = tf.reduce_mean(tf.square(net_out - image))
            l1_loss = tf.reduce_mean(tf.abs(net_out - image))
            optimiser = {
                'opt': tf.train.GradientDescentOptimizer(0.1),
                'opt_1': tf.train.GradientDescentOptimizer(0.1),
            }
            grads = {
                'opt': optimiser['opt'].compute_gradients(l2_loss),
                'opt_1': optimiser['opt_1'].compute_gradients(l1_loss),
            }
            collector.add_to_collection(grads)
    # exactly one gradient dict should have been collected per device
    self.assertAllClose(len(collector._gradients), num_devices)
    # the collector is full -- a further sample must be rejected
    with self.assertRaisesRegexp(AssertionError, ""):
        collector.add_to_collection(grads)
    averaged = collector.gradients
    # averaged output keeps one entry per optimiser key
    self.assertAllClose(len(collector._gradients[0]), len(averaged))
    # averaged gradient tensors keep the per-device gradient shapes
    self.assertAllClose(
        collector._gradients[0]['opt'][0][0][0].shape.as_list(),
        averaged['opt'][0][0][0].shape.as_list())
    self.assertAllClose(
        collector._gradients[0]['opt_1'][0][0][0].shape.as_list(),
        averaged['opt_1'][0][0][0].shape.as_list())
def create_graph(application, num_gpus=1, num_threads=1, is_training_action=False):
    """
    Create a TF graph based on self.app properties and engine parameters.

    :return:
    """
    graph = tf.Graph()
    n_devices = max(num_gpus, 1)
    main_device = device_string(num_gpus, 0, False, is_training_action)
    outputs_collector = OutputsCollector(n_devices=n_devices)
    gradients_collector = GradientsCollector(n_devices=n_devices)

    # start constructing the graph, handling training and inference cases
    with graph.as_default(), tf.device(main_device):

        # initialise sampler on the main device
        with tf.name_scope('Sampler'):
            application.initialise_sampler()
            for sampler in traverse_nested(application.get_sampler()):
                sampler.set_num_threads(num_threads)

        # initialise network; device placement of variables happens
        # below, per worker scope
        application.initialise_network()
        application.add_validation_flag()

        # data parallelism: replicate the model on each device and
        # collect its outputs/gradients
        for gpu_id in range(n_devices):
            worker_device = device_string(
                num_gpus, gpu_id, True, is_training_action)
            with tf.name_scope('worker_{}'.format(gpu_id)), \
                    tf.device(worker_device):
                application.connect_data_and_network(
                    outputs_collector, gradients_collector)

        with tf.name_scope('MergeOutputs'):
            outputs_collector.finalise_output_op()

    application.outputs_collector = outputs_collector
    application.gradients_collector = gradients_collector
    GRAPH_CREATED.send(application, iter_msg=None)
    return graph
def initialise_application(self, workflow_param, data_param):
    """
    This function receives all parameters from user config file,
    create an instance of application.

    :param workflow_param: a dictionary of user parameters,
        keys correspond to sections in the config file
    :param data_param: a dictionary of input image parameters,
        keys correspond to data properties to be used by image_reader
    :return:
    """
    # unpack per-section parameter namespaces; a non-dict input raises
    # AttributeError on .get, which is logged and re-raised
    try:
        system_param = workflow_param.get('SYSTEM', None)
        net_param = workflow_param.get('NETWORK', None)
        train_param = workflow_param.get('TRAINING', None)
        infer_param = workflow_param.get('INFERENCE', None)
        app_param = workflow_param.get('CUSTOM', None)
    except AttributeError:
        tf.logging.fatal('parameters should be dictionaries')
        raise

    assert os.path.exists(system_param.model_dir), \
        'Model folder not exists {}'.format(system_param.model_dir)
    self.is_training = (system_param.action == "train")
    # hardware-related parameters
    # (inference is forced to a single thread and at most one GPU)
    self.num_threads = max(system_param.num_threads, 1) \
        if self.is_training else 1
    self.num_gpus = system_param.num_gpus \
        if self.is_training else min(system_param.num_gpus, 1)
    set_cuda_device(system_param.cuda_devices)

    # set output TF model folders
    self.model_dir = touch_folder(
        os.path.join(system_param.model_dir, 'models'))
    self.session_prefix = os.path.join(self.model_dir, FILE_PREFIX)

    if self.is_training:
        assert train_param, 'training parameters not specified'
        summary_root = os.path.join(system_param.model_dir, 'logs')
        # a fresh summary sub-folder only when starting from iteration 0
        self.summary_dir = get_latest_subfolder(
            summary_root,
            create_new=train_param.starting_iter == 0)

        # training iterations-related parameters
        self.initial_iter = train_param.starting_iter
        self.final_iter = train_param.max_iter
        self.save_every_n = train_param.save_every_n
        self.tensorboard_every_n = train_param.tensorboard_every_n
        self.max_checkpoints = train_param.max_checkpoints
        # gradients are only collected when training
        self.gradients_collector = GradientsCollector(
            n_devices=max(self.num_gpus, 1))
        action_param = train_param
    else:
        assert infer_param, 'inference parameters not specified'
        self.initial_iter = infer_param.inference_iter
        action_param = infer_param

    self.outputs_collector = OutputsCollector(
        n_devices=max(self.num_gpus, 1))

    # create an application instance
    assert app_param, 'application specific param. not specified'
    app_module = ApplicationDriver._create_app(app_param.name)
    self.app = app_module(net_param, action_param, self.is_training)

    # initialise data input
    self.app.initialise_dataset_loader(data_param, app_param)

    # pylint: disable=not-context-manager
    with self.graph.as_default(), tf.name_scope('Sampler'):
        self.app.initialise_sampler()
def initialise_application(self, workflow_param, data_param):
    """
    This function receives all parameters from user config file,
    create an instance of application.

    :param workflow_param: a dictionary of user parameters,
        keys correspond to sections in the config file
    :param data_param: a dictionary of input image parameters,
        keys correspond to data properties to be used by image_reader
    :return:
    """
    # unpack per-section parameter namespaces; a non-dict input raises
    # AttributeError on .get, which is logged and re-raised
    try:
        system_param = workflow_param.get('SYSTEM', None)
        net_param = workflow_param.get('NETWORK', None)
        train_param = workflow_param.get('TRAINING', None)
        infer_param = workflow_param.get('INFERENCE', None)
        app_param = workflow_param.get('CUSTOM', None)
    except AttributeError:
        tf.logging.fatal('parameters should be dictionaries')
        raise

    assert os.path.exists(system_param.model_dir), \
        'Model folder not exists {}'.format(system_param.model_dir)
    self.is_training = (system_param.action == "train")
    # hardware-related parameters
    # (inference is forced to a single thread and at most one GPU)
    self.num_threads = max(system_param.num_threads, 1) \
        if self.is_training else 1
    self.num_gpus = system_param.num_gpus \
        if self.is_training else min(system_param.num_gpus, 1)
    set_cuda_device(system_param.cuda_devices)

    # set output TF model folders
    self.model_dir = touch_folder(
        os.path.join(system_param.model_dir, 'models'))
    self.session_prefix = os.path.join(self.model_dir, FILE_PREFIX)

    if self.is_training:
        assert train_param, 'training parameters not specified'
        summary_root = os.path.join(system_param.model_dir, 'logs')
        # a fresh summary sub-folder only when starting from iteration 0
        self.summary_dir = get_latest_subfolder(
            summary_root,
            create_new=train_param.starting_iter == 0)

        self.initial_iter = train_param.starting_iter
        # never schedule fewer iterations than the starting iteration
        self.final_iter = max(train_param.max_iter, self.initial_iter)
        self.save_every_n = train_param.save_every_n
        self.tensorboard_every_n = train_param.tensorboard_every_n
        # keep at least the pre-existing checkpoint budget
        self.max_checkpoints = \
            max(train_param.max_checkpoints, self.max_checkpoints)
        # gradients are only collected when training
        self.gradients_collector = GradientsCollector(
            n_devices=max(self.num_gpus, 1))
        self.validation_every_n = train_param.validation_every_n
        if self.validation_every_n > 0:
            self.validation_max_iter = max(self.validation_max_iter,
                                           train_param.validation_max_iter)
        action_param = train_param
    else:
        assert infer_param, 'inference parameters not specified'
        self.initial_iter = infer_param.inference_iter
        action_param = infer_param

    self.outputs_collector = OutputsCollector(
        n_devices=max(self.num_gpus, 1))

    # create an application instance
    assert app_param, 'application specific param. not specified'
    app_module = ApplicationDriver._create_app(app_param.name)
    self.app = app_module(net_param, action_param, self.is_training)

    # initialise data input
    data_partitioner = ImageSetsPartitioner()
    # clear the cached file lists
    data_partitioner.reset()
    # a new train/validation/inference split is generated only when
    # training from scratch, no split file exists yet, and at least one
    # exclusion fraction is requested
    do_new_partition = \
        self.is_training and self.initial_iter == 0 and \
        (not os.path.isfile(system_param.dataset_split_file)) and \
        (train_param.exclude_fraction_for_validation > 0 or
         train_param.exclude_fraction_for_inference > 0)
    data_fractions = None
    if do_new_partition:
        # refuse to run validation when no validation fraction is set aside
        assert train_param.exclude_fraction_for_validation > 0 or \
            self.validation_every_n <= 0, \
            'validation_every_n is set to {}, ' \
            'but train/validation splitting not available,\nplease ' \
            'check "exclude_fraction_for_validation" in the config ' \
            'file (current config value: {}).'.format(
                self.validation_every_n,
                train_param.exclude_fraction_for_validation)
        data_fractions = (train_param.exclude_fraction_for_validation,
                          train_param.exclude_fraction_for_inference)

    if data_param:
        data_partitioner.initialise(
            data_param=data_param,
            new_partition=do_new_partition,
            ratios=data_fractions,
            data_split_file=system_param.dataset_split_file)

    # validation requested: the (possibly pre-existing) partition must
    # actually contain a validation subset
    if data_param and self.is_training and self.validation_every_n > 0:
        assert data_partitioner.has_validation, \
            'validation_every_n is set to {}, ' \
            'but train/validation splitting not available.\nPlease ' \
            'check dataset partition list {} ' \
            '(remove file to generate a new dataset partition). ' \
            'Or set validation_every_n to -1.'.format(
                self.validation_every_n, system_param.dataset_split_file)

    # initialise readers
    self.app.initialise_dataset_loader(
        data_param, app_param, data_partitioner)
    self._data_partitioner = data_partitioner

    # pylint: disable=not-context-manager
    with self.graph.as_default(), tf.name_scope('Sampler'):
        self.app.initialise_sampler()