def __init__(self, session, parallel_models, optimize_op, train_set=None, eval_set=None, variables=None, lr=0.001, device_type='gpu', save_dir="test"):
    """Set up a multi-replica learner: sum each replica's gradients on a
    single device and build an Adam update op over them.

    NOTE(review): the `optimize_op` argument is forwarded to the base class
    but then shadowed by the freshly built Adam op below — confirm the base
    class does not depend on the op originally passed in.
    NOTE(review): `variables` defaults to None yet is handed straight to
    `adam_updates`; presumably callers always pass it — verify.
    """
    super().__init__(session, parallel_models, optimize_op, train_set, eval_set, variables)
    self.lr = lr
    self.save_dir = save_dir
    # One gradient list per replica, in replica order (nr_model of them).
    per_model_grads = [self.parallel_models[k].grads for k in range(self.nr_model)]
    # Accumulate every other replica's gradients into the first list,
    # placing the add ops on the chosen device's slot 0.
    with tf.device('/' + device_type + ':0'):
        for other in per_model_grads[1:]:
            for idx, grad in enumerate(other):
                per_model_grads[0][idx] += grad
    self.aggregated_grads = per_model_grads[0]
    self.optimize_op = adam_updates(variables, self.aggregated_grads, lr=self.lr)
def construct_models(self, model_cls, model_opt, learning_rate, trainable_params=None, eval_keys=('total loss',)):
    """Build one model replica per GPU, aggregate their gradients on GPU 0,
    and wire up the Monitor and Adam train step.

    Args:
        model_cls: model class; instances expose `build_graph` plus the loss
            attributes referenced in `metric_attrs` below.
        model_opt: keyword options forwarded to `build_graph`.
        learning_rate: Adam learning rate for `self.train_step`.
        trainable_params: optional spec for `get_trainable_variables`; when
            None all trainable variables are optimized.
        eval_keys: metric names to monitor. Unknown names are silently
            ignored (same as the original). Fix: the default used to be a
            mutable list (`['total loss']`) — a shared-mutable-default
            pitfall; it is now an immutable tuple, which is behavior-
            compatible because the argument is only used for membership
            tests.
    """
    # One replica per GPU; tf.make_template shares variables across them.
    self.models = [model_cls(counters={}) for _ in range(self.nr_gpu)]
    template = tf.make_template('model', model_cls.build_graph)
    for i in range(self.nr_gpu):
        with tf.device('/gpu:%d' % i):
            template(self.models[i], **model_opt)
    if trainable_params is None:
        self.params = tf.trainable_variables()
    else:
        self.params = get_trainable_variables(trainable_params)
    # Per-GPU gradients of each replica's loss, colocated with its forward ops.
    grads = []
    for i in range(self.nr_gpu):
        with tf.device('/gpu:%d' % i):
            grads.append(
                tf.gradients(self.models[i].loss, self.params,
                             colocate_gradients_with_ops=True))
    # Sum every replica's gradients into grads[0] on GPU 0.
    with tf.device('/gpu:0'):
        for i in range(1, self.nr_gpu):
            for j in range(len(grads[0])):
                grads[0][j] += grads[i][j]
    # Monitored metrics, averaged over replicas. The attribute lookup is
    # lazy (getattr inside the `if`), so a model only needs the attributes
    # for the keys actually requested — matching the original if-chain.
    metric_attrs = (
        ('total loss', 'loss'),
        ('nll loss', 'loss_nll'),
        ('reg loss', 'loss_reg'),
        ('bits per dim', 'bits_per_dim'),
        ('mi', 'mi'),
    )
    mdict = {}
    for key, attr in metric_attrs:
        if key in eval_keys:
            mdict[key] = tf.add_n(
                [getattr(model, attr) for model in self.models]) / self.nr_gpu
    self.monitor = Monitor(dict=mdict, config_str="",
                           log_file_path=self.save_dir + "/logfile")
    self.train_step = adam_updates(self.params, grads[0], lr=learning_rate)
    # self.saver = tf.train.Saver()
if True: all_params = tf.trainable_variables( ) #get_trainable_variables(["conv_encoder", "conv_decoder", "conv_pixel_cnn"]) grads = [] for i in range(args.nr_gpu): with tf.device('/gpu:%d' % i): grads.append( tf.gradients(models[i].loss, all_params, colocate_gradients_with_ops=True)) with tf.device('/gpu:0'): for i in range(1, args.nr_gpu): for j in range(len(grads[0])): grads[0][j] += grads[i][j] train_step = adam_updates(all_params, grads[0], lr=args.learning_rate) learner = Learner(session=None, parallel_models=models, optimize_op=train_step, train_set=train_set, eval_set=val_set, variables=tf.trainable_variables()) # def make_feed_dict(data, is_training=True, dropout_p=0.5): # data = np.rint(data) # ds = np.split(data, args.nr_gpu) # feed_dict = {is_trainings[i]: is_training for i in range(args.nr_gpu)} # feed_dict.update({dropout_ps[i]: dropout_p for i in range(args.nr_gpu)}) # feed_dict.update({ xs[i]:ds[i] for i in range(args.nr_gpu) }) # return feed_dict