def get_model_fn(features, labels, mode, params, cosmoflow_config):
    """Estimator model_fn: forward pass, loss, and per-mode EstimatorSpec.

    Args:
        features: input tensor batch fed to the model.
        labels: target tensor batch.
        mode: a tf.estimator.ModeKeys value (TRAIN and EVAL are supported).
        params: Estimator params dict; "learning_rate" is read in TRAIN mode.
        cosmoflow_config: parsed config with 'model', 'train' and
            'ipu_config' sections.

    Returns:
        A tf.estimator.EstimatorSpec for TRAIN or EVAL.

    Raises:
        NotImplementedError: for an unsupported loss name or mode
            (e.g. PREDICT).
    """
    # Forward pass; the training flag follows the Estimator mode.
    net = get_model(**cosmoflow_config['model'])
    outputs = net(features, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Only the MSE loss is implemented.
    loss_name = cosmoflow_config['train']['loss']
    if loss_name != "mse":
        raise NotImplementedError("loss: %s" % loss_name)
    loss = tf.losses.mean_squared_error(labels=labels, predictions=outputs)

    if mode == tf.estimator.ModeKeys.EVAL:
        metrics = {
            "mae": tf.metrics.mean_absolute_error(labels=labels,
                                                  predictions=outputs),
        }
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        sgd = tf.train.GradientDescentOptimizer(params["learning_rate"])
        # More than one IPU means replicated graphs: wrap the optimizer so
        # gradients are reduced across replicas before being applied.
        if cosmoflow_config['ipu_config']['num_ipus'] > 1:
            sgd = CrossReplicaOptimizer(sgd)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=sgd.minimize(loss=loss))

    raise NotImplementedError(mode)
def graph_builder(opts, observed=None, ground_truth=None, learning_rate=0.001,
                  mode=util.Modes.TRAIN):
    """Build the MLP forward pass, loss, metric and (in TRAIN mode) the
    update op.

    Returns a 3-tuple ``(loss, rmse_metric, train_op)`` where ``loss`` is the
    *unscaled* mean absolute difference and ``train_op`` is ``None`` in VALID
    mode. Note: modes other than TRAIN/VALID fall through and return None
    implicitly.
    """
    # Build the neural network
    predictions = MLPModel(opts, mode=mode)(observed)
    # Loss: the working loss is multiplied by opts.loss_scaling (the scale is
    # divided back out of the gradients and the returned loss below) and cast
    # to the configured compute dtype.
    loss = opts.loss_scaling * tf.cast(tf.losses.absolute_difference(
        ground_truth, predictions, reduction=tf.losses.Reduction.MEAN),
        dtype=getattr(tf, opts.dtypes[0]))
    # Error metric
    rmse_metric = util.exp_rmspe(ground_truth, predictions)
    if mode == util.Modes.TRAIN:
        # Training
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
        # Wrap in a CrossReplica if we're replicating across multiple IPUs
        if opts.replication_factor > 1:
            optimizer = CrossReplicaOptimizer(optimizer)
        # Batch norm variable update dependency
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Op to calculate every variable gradient
            grads = tf.gradients(loss, tf.trainable_variables())
            grads = list(zip(grads, tf.trainable_variables()))

            # Loss scaling: divide the scale factor back out of each gradient.
            grads = [(grad / opts.loss_scaling, var) for grad, var in grads]

            # Apply weight_decay directly to gradients, but only to variables
            # tagged 'l2tag' that are kernels (biases/norm params are exempt).
            if opts.weight_decay != 0:
                grads = [(grad + (opts.weight_decay * var), var)
                         if 'l2tag' in var.name and 'kernel' in var.name
                         else (grad, var) for grad, var in grads]

            # clip gradients elementwise to [-1, 1]
            if opts.gradient_clipping:
                grads = [(tf.clip_by_value(grad, -1., 1.), var)
                         for grad, var in grads]

            # Op to update all variables according to their gradient
            apply_grads = optimizer.apply_gradients(grads_and_vars=grads)
        # Report the unscaled loss so values are comparable across scalings.
        return loss / opts.loss_scaling, rmse_metric, apply_grads
    elif mode == util.Modes.VALID:
        return loss / opts.loss_scaling, rmse_metric, None
def get_optimiser(self):
    """Construct the optimiser for the current step.

    Builds ``self.optimiser_type`` with the current learning rate and a copy
    of ``self.optimiser_kwargs`` (so the stored kwargs are never mutated).
    A ``'dtype'`` kwarg, if present, is resolved to the experiment's dtype.
    With more than one replica the optimiser is wrapped in
    ``CrossReplicaOptimizer`` so updates are reduced across replicas.
    """
    lr = self.get_current_learning_rate()
    kwargs = self.optimiser_kwargs.copy()
    # 'dtype' in the stored kwargs is a placeholder to be filled in here.
    if 'dtype' in kwargs:
        kwargs['dtype'] = self.experiment.dtype
    optimiser = self.optimiser_type(lr, **kwargs)
    # Single-replica runs need no cross-replica reduction.
    return optimiser if self.n_replicas == 1 else CrossReplicaOptimizer(optimiser)
def body(loss, features, labels):
    """One on-device training step: forward pass, MSE loss, SGD update,
    and enqueue of the step loss onto the outfeed queue.

    Args:
        loss: loop-carried loss value (overwritten each iteration).
        features: input batch from the infeed.
        labels: target batch from the infeed.

    Returns:
        (loss, enqueue_op) — the enqueue is sequenced after the train op.

    Raises:
        NotImplementedError: if the configured loss is not "mse".
    """
    with tf.variable_scope("MainGraph"):
        net = get_model(**cosmoflow_config['model'])
        predictions = net(features, training=True)
        # Only the MSE loss is implemented.
        loss_name = cosmoflow_config['train']['loss']
        if loss_name != "mse":
            raise NotImplementedError("loss: %s" % loss_name)
        loss = tf.losses.mean_squared_error(labels=labels,
                                            predictions=predictions)
        sgd = tf.train.GradientDescentOptimizer(
            cosmoflow_config['optimizer']['lr'])
        # Replicated graphs (more than one IPU) need cross-replica
        # gradient reduction before the update is applied.
        if cosmoflow_config['ipu_config']['num_ipus'] > 1:
            sgd = CrossReplicaOptimizer(sgd)
        train_op = sgd.minimize(loss=loss)
        # Only enqueue the loss once the parameter update has run.
        with tf.control_dependencies([train_op]):
            return loss, outfeed_queue.enqueue(loss)
def build_train_op(previous_loss, *infeed_data):
    """Construct loss and optimizer.

    Loop body for an on-device training loop: builds the policy, computes a
    policy-gradient-style loss, applies one SGD step, and returns the running
    sum of per-iteration losses.
    """
    with ipu_scope("/device:IPU:0"):
        action_prob = create_policy(*infeed_data)
        # Loss weights the policy output by infeed_data[-2]
        # (presumably returns/advantages — confirm against the infeed spec).
        loss = tf.reduce_sum(action_prob * infeed_data[-2])
        opt = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        # Optionally accumulate gradients over several mini-batches before
        # they are applied.
        if args.accumulate_grad:
            opt = GradientAccumulationOptimizer(
                opt, num_mini_batches=args.num_mini_batches)
        # Reduce gradients across replicas before the update.
        opt = CrossReplicaOptimizer(opt)
        train_op = opt.minimize(loss)
        # Sequence the returned loss after the update so the accumulated
        # value reflects a completed step.
        with tf.control_dependencies([train_op]):
            loss = tf.identity(loss)
        # Accumulate: the caller threads previous_loss through the loop.
        return previous_loss + loss