def __init__(self, shared_value, final_value, epochs_to_saturation):
    assert_is_instance(shared_value,
                       theano.tensor.sharedvar.SharedVariable)
    assert_is_subdtype(shared_value.dtype, numpy.floating)

    # final_value must be a scalar iff shared_value is a scalar.
    assert_equal(shared_value.ndim == 0, numpy.isscalar(final_value))

    if numpy.isscalar(final_value):
        assert_floating(final_value)
    else:
        assert_is_subdtype(final_value.dtype, numpy.floating)
        assert_equal(final_value.shape, shared_value.get_value().shape)

    assert_integer(epochs_to_saturation)
    assert_greater(epochs_to_saturation, 0)

    self.shared_value = shared_value

    cast = numpy.cast[shared_value.dtype]
    self._final_value = cast(final_value)
    self._epochs_to_saturation = epochs_to_saturation

    # Set once training starts.
    self._num_epochs_seen = None
    self._initial_value = None
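# A minimal sketch (an assumption, not code from the class above) of the
# linear schedule this callback presumably applies each epoch: the shared
# value moves from its initial value toward _final_value, saturating after
# _epochs_to_saturation epochs. The function name and arguments here are
# hypothetical.
def interpolated_value(initial_value, final_value, epochs_to_saturation,
                       num_epochs_seen):
    # Fraction of the way to saturation, clipped to [0.0, 1.0].
    alpha = min(float(num_epochs_seen) / epochs_to_saturation, 1.0)
    return (1.0 - alpha) * initial_value + alpha * final_value

# e.g. interpolating 1.0 -> 0.1 over 10 epochs yields 0.55 at epoch 5.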
def main():
    args = parse_args()

    images = load_npy_file(args.images)
    labels = load_npy_file(args.labels)

    num_examples = labels.shape[0]

    images_fmt = DenseFormat(axes=('b', '0', '1', 'c'),
                             shape=((-1, ) + images.shape[1:]),
                             dtype=images.dtype)

    labels_fmt = DenseFormat(axes=('b', 'f'),
                             shape=(-1, labels.shape[1]),
                             dtype=labels.dtype)

    print("Allocating output file.")

    # num_examples + 1: row 0 is reserved for a blank example.
    output_memmap = make_memmap_file(args.output,
                                     num_examples + 1,
                                     ['images', 'labels'],
                                     [images_fmt, labels_fmt])

    print("Copying {} images and labels.".format(num_examples))

    assert_equal(images_fmt.axes.index('b'), 0)
    assert_equal(labels_fmt.axes, ('b', 'f'))
    assert_is_subdtype(labels_fmt.dtype, numpy.signedinteger)

    # The blank example: an all-zero image, labeled with a category one
    # past the last real category, 0 in the second label field, and -1
    # in all remaining fields.
    blank_image = numpy.zeros(images.shape[1:], dtype=images.dtype)

    max_category = labels[:, 0].max()
    blank_label = numpy.empty(labels.shape[1], dtype=labels.dtype)
    blank_label[0] = max_category + 1
    blank_label[1] = 0
    blank_label[2:] = -1

    output_memmap['images'][0, ...] = blank_image
    output_memmap['labels'][0, ...] = blank_label

    output_memmap['images'][1:, ...] = images
    output_memmap['labels'][1:, ...] = labels

    print("Wrote output to {}".format(args.output))
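# A self-contained numpy sketch (toy labels, illustrative shapes) of the
# blank-label convention above: category gets one past the max real
# category, the second field is 0, and the remaining fields are -1.
import numpy

toy_labels = numpy.array([[0, 3, 7, 1],
                          [4, 1, 2, 0]], dtype='int32')
blank = numpy.empty(toy_labels.shape[1], dtype=toy_labels.dtype)
blank[0] = toy_labels[:, 0].max() + 1   # unused category marks "blank"
blank[1] = 0
blank[2:] = -1
assert (blank == numpy.array([5, 0, -1, -1], dtype='int32')).all()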
def __init__(self,
             parameter,
             gradient,  # see (*) below
             learning_rate):
    # (*): We pass in the gradient, rather than the cost, since there are
    # different ways to generate the gradient expression, and we want to
    # allow the user to choose different ones, rather than generating the
    # gradient here ourselves. In particular, the 'consider_constant'
    # argument to theano.gradient.grad() could be of interest to the user.
    # (It's a list of symbols to consider constant, and thus not
    # backpropagate through to their inputs.)
    '''
    Parameters
    ----------
    parameter: A theano symbol
      A parameter being optimized by an Sgd trainer.

    gradient: A theano symbol
      The gradient of the loss function w.r.t. the above parameter.

    learning_rate: float
      The initial value of the learning rate.
    '''

    #
    # sanity-check args
    #

    assert_is_instance(parameter, theano.tensor.sharedvar.SharedVariable)
    assert_is_instance(gradient, theano.gof.Variable)
    assert_equal(parameter.broadcastable, gradient.broadcastable,
                 "If an Op's .grad() method is buggy, it can return "
                 "broadcast masks.")
    assert_is_subdtype(gradient.dtype, numpy.floating)
    assert_greater_equal(learning_rate, 0)

    floatX = theano.config.floatX

    if str(gradient.dtype) != str(floatX):
        gradient = theano.tensor.cast(gradient, floatX)

    #
    # define updates, set members
    #

    def concat(str0, str1):
        '''
        Like str0 + str1, except returns None if either is None.
        '''
        if str0 is None or str1 is None:
            return None
        else:
            return str0 + str1

    def make_shared_floatX(numeric_var, name, **kwargs):
        return theano.shared(numpy.asarray(numeric_var, dtype=floatX),
                             name=name,
                             **kwargs)

    self.learning_rate = make_shared_floatX(
        learning_rate,
        concat(parameter.name, ' learning rate'))

    step = -self.learning_rate * gradient

    self.averaged_param = make_shared_floatX(
        parameter.get_value(),
        concat(parameter.name, ' average'),
        broadcastable=parameter.broadcastable)

    self.parameter = parameter
    self.iteration_number = make_shared_floatX(1.0, 'iteration counter')

    self.parameter_temp = make_shared_floatX(
        0 * parameter.get_value(),
        concat(parameter.name, ' temp'),
        broadcastable=parameter.broadcastable)

    assert_equal(parameter.broadcastable, step.broadcastable)

    new_parameter = parameter + step
    new_parameter.name = concat('new ', parameter.name)

    # Running average of the iterates:
    # avg_n = ((n - 1) / n) * avg_{n-1} + (1 / n) * x_n.
    new_averaged_param = (
        ((self.iteration_number - 1.0) / self.iteration_number) *
        self.averaged_param +
        (1.0 / self.iteration_number) * new_parameter)

    # Increment the counter symbolically; calling .get_value() here would
    # bake the initial value into the graph as a constant.
    new_iteration_number = self.iteration_number + 1.0

    updates = OrderedDict([(self.parameter, new_parameter),
                           (self.averaged_param, new_averaged_param),
                           (self.iteration_number, new_iteration_number)])

    super(SgdParameterUpdater, self).__init__(updates)
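# A hedged usage sketch: wiring the updater above into a compiled theano
# training step. It assumes SgdParameterUpdater is in scope and that the
# base class stores the OrderedDict passed to its __init__ as a `.updates`
# attribute; the toy loss is illustrative.
import numpy
import theano
import theano.tensor as T

x = T.vector('x')
w = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='w')
loss = T.sqr(w - x).sum()

updater = SgdParameterUpdater(w, theano.gradient.grad(loss, w), 0.01)
train = theano.function([x], loss, updates=updater.updates)
# Each call to train() applies one SGD step and refreshes the running
# parameter average.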
def __init__(self,
             parameter,
             gradient,
             gradient_at_old_params,
             learning_rate,
             momentum,
             method,
             input_iterator,
             input_iterator_full,
             use_nesterov):
    #
    # sanity-check args
    #

    assert_is_instance(parameter, theano.tensor.sharedvar.SharedVariable)
    assert_is_instance(gradient, theano.gof.Variable)
    assert_is_instance(gradient_at_old_params, theano.gof.Variable)
    assert_equal(parameter.broadcastable, gradient.broadcastable,
                 "If an Op's .grad() method is buggy, it can return "
                 "broadcast masks.")
    assert_is_subdtype(gradient.dtype, numpy.floating)
    assert_is_subdtype(gradient_at_old_params.dtype, numpy.floating)
    assert_greater_equal(learning_rate, 0)
    assert_greater_equal(momentum, 0)
    assert_is_instance(use_nesterov, bool)

    floatX = theano.config.floatX

    if str(gradient.dtype) != str(floatX):
        gradient = theano.tensor.cast(gradient, floatX)

    #
    # define updates, set members
    #

    def concat(str0, str1):
        '''
        Like str0 + str1, except returns None if either is None.
        '''
        if str0 is None or str1 is None:
            return None
        else:
            return str0 + str1

    def make_shared_floatX(numeric_var, name, **kwargs):
        return theano.shared(numpy.asarray(numeric_var, dtype=floatX),
                             name=name,
                             **kwargs)

    self.learning_rate = make_shared_floatX(
        learning_rate,
        concat(parameter.name, ' learning rate'))

    self.momentum = make_shared_floatX(momentum,
                                       concat(parameter.name, ' momentum'))

    self._velocity = make_shared_floatX(
        0.0 * parameter.get_value(),
        concat(parameter.name, ' velocity'),
        broadcastable=parameter.broadcastable)

    self.full_gradient = make_shared_floatX(
        0.0 * parameter.get_value(),
        concat(parameter.name, ' full gradient'),
        broadcastable=parameter.broadcastable)

    self.method = method

    # S2GD_on is 1.0 when the update should include the variance-reduction
    # term (full gradient minus gradient at the old parameters), and 0.0
    # for plain SGD-style updates.
    if self.method in ('SGD', 'S2GD_plus'):
        multiplier = 0.0
    elif self.method in ('S2GD', 'S2GD_rolling'):
        multiplier = 1.0
    else:
        raise ValueError('Please enter a valid method: "SGD", "S2GD", '
                         '"S2GD_plus", or "S2GD_rolling"')

    self.S2GD_on = make_shared_floatX(numeric_var=multiplier,
                                      name='use_S2GD')

    if self.method == 'S2GD_rolling':
        # Keep a rolling estimate of the full gradient: swap out this
        # minibatch's gradient at the old parameters, swap in its gradient
        # at the current ones.
        total_size_dataset = float(
            input_iterator.dataset.tensors[0].shape[0])
        batch_size = float(input_iterator.batch_size)

        updated_full_gradient = (
            gradient * batch_size +
            self.full_gradient * total_size_dataset -
            gradient_at_old_params * batch_size) / total_size_dataset

        new_velocity = (self.momentum * self._velocity -
                        self.learning_rate * updated_full_gradient)
    else:
        new_velocity = (
            self.momentum * self._velocity -
            self.learning_rate *
            (gradient +
             self.S2GD_on * (self.full_gradient -
                             gradient_at_old_params)))

    new_velocity.name = concat('new ', self._velocity.name)

    assert_equal(str(new_velocity.dtype), str(floatX))
    assert_equal(self._velocity.broadcastable, new_velocity.broadcastable)

    step = (self.momentum * new_velocity - self.learning_rate * gradient
            if use_nesterov
            else new_velocity)

    assert_equal(parameter.broadcastable, step.broadcastable)

    new_parameter = parameter + step
    new_parameter.name = concat('new ', parameter.name)

    if self.method == 'S2GD_rolling':
        updates = OrderedDict([(parameter, new_parameter),
                               (self._velocity, new_velocity),
                               (self.full_gradient,
                                updated_full_gradient)])
    else:
        updates = OrderedDict([(parameter, new_parameter),
                               (self._velocity, new_velocity)])

    # Accumulate minibatch gradients into the full-gradient estimate;
    # after `steps` batches this equals the average gradient over the
    # whole dataset. Use float division so the average isn't truncated.
    total_size_dataset = input_iterator_full.dataset.tensors[0].shape[0]
    batch_size = input_iterator_full.batch_size
    steps = float(total_size_dataset) / batch_size

    self.full_gradient_updates = OrderedDict(
        [(self.full_gradient, self.full_gradient + (gradient / steps))])

    super(SemiSgdParameterUpdater, self).__init__(updates)
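# A toy numpy sketch (scalar "gradients", illustrative values) of the
# variance-reduced direction the non-rolling branch uses:
# g_i + (full_gradient - g_i_at_old_params). When the stored full gradient
# is exact and the parameters have not moved, the correction term cancels
# the minibatch noise and the direction equals the full gradient.
import numpy

g_i = numpy.array(0.9)            # current minibatch gradient
g_i_old = numpy.array(0.9)        # same minibatch, at the old parameters
full_gradient = numpy.array(0.5)  # stored average over the whole dataset

direction = g_i + (full_gradient - g_i_old)
assert direction == full_gradient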
def __init__(self, image_node, yaml_dict, numpy_rng, theano_rng):
    '''
    Parameters
    ----------
    image_node: InputNode

    yaml_dict: dict
    '''
    super(IdAndCameraDirModel, self).__init__()

    #
    # Build the model nodes, and initialize their weights.
    #

    # preprocessing layers
    shared_layers = []
    shared_layers.append(RgbToGray(image_node))
    shared_layers.append(Lcn(shared_layers[-1]))

    assert_is_subdtype(image_node.output_format.dtype, numpy.floating)

    use_dropout = yaml_dict['hyperparams']['use_dropout']

    def get_num_classes(yaml_dict):
        fg_path = yaml_dict['datasets']['training']['fg_path']
        dataset = MemmapDataset(os.path.join(data_path, fg_path))
        label_to_id = NorbLabelToObjectIdConverter(dataset.tensors[1])
        return label_to_id.num_unique_ids

    add_conv_layers(shared_layers[-1],
                    yaml_dict['model']['shared_layers']['conv'],
                    use_dropout,
                    numpy_rng,
                    theano_rng,
                    shared_layers)

    add_affine_layers(shared_layers[-1],
                      yaml_dict['model']['shared_layers']['affine'],
                      use_dropout,
                      numpy_rng,
                      theano_rng,
                      shared_layers)

    id_layers = []
    add_classifier_mlp(shared_layers[-1],
                       yaml_dict['model']['id_layers'],
                       get_num_classes(yaml_dict),
                       use_dropout,
                       numpy_rng,
                       theano_rng,
                       id_layers)

    cam_dir_layers = []
    add_regressor_mlp(shared_layers[-1],
                      yaml_dict['model']['cam_dir_layers'],
                      3,
                      use_dropout,
                      numpy_rng,
                      theano_rng,
                      cam_dir_layers)

    self.input_node = image_node
    self.shared_layers = shared_layers
    self.id_layers = id_layers
    self.cam_dir_layers = cam_dir_layers
def __init__(self,
             small_conv_model,
             image_node,
             yaml_dict,
             numpy_rng,
             theano_rng):
    assert_is_instance(small_conv_model, IdAndCameraDirModel)
    assert_equal(image_node.output_format.axes, ('b', '0', '1', 'c'))

    # preprocessing layers
    self.shared_layers = []
    self.shared_layers.append(RgbToGray(image_node))
    self.shared_layers.append(Lcn(self.shared_layers[-1]))

    assert_is_subdtype(image_node.output_format.dtype, numpy.floating)

    use_dropout = yaml_dict['hyperparams']['use_dropout']

    def get_num_classes(yaml_dict):
        fg_path = yaml_dict['datasets']['training']['fg_path']
        dataset = MemmapDataset(os.path.join(data_path, fg_path))
        label_to_id = NorbLabelToObjectIdConverter(dataset.tensors[1])
        return label_to_id.num_unique_ids

    add_conv_layers(self.shared_layers[-1],
                    yaml_dict['model']['shared_layers']['conv'],
                    use_dropout,
                    numpy_rng,
                    theano_rng,
                    self.shared_layers)

    def get_first_affine_layer_filter_shape(small_conv_model):
        # The small model's first affine layer sits just after its conv
        # stack, i.e. at the index this model's layer list has now
        # reached. Its input feature map's spatial shape becomes the
        # filter shape of the convolutional replacement.
        first_affine_layer = \
            small_conv_model.shared_layers[len(self.shared_layers)]
        assert_is_instance(first_affine_layer, AffineLayer)
        assert_is_instance(first_affine_layer.inputs[0], Conv2dLayer)
        assert_equal(first_affine_layer.inputs[0].output_format.axes,
                     ('b', 'c', '0', '1'))
        return first_affine_layer.inputs[0].output_format.shape[2:]

    first_filter_shape = \
        get_first_affine_layer_filter_shape(small_conv_model)
    assert_equal(first_filter_shape, (2, 2))

    add_affine_layers_conv(self.shared_layers[-1],
                           yaml_dict['model']['shared_layers']['affine'],
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           first_filter_shape=first_filter_shape,
                           output_list=self.shared_layers)

    self.id_layers = []
    add_classifier_mlp_conv(self.shared_layers[-1],
                            yaml_dict['model']['id_layers'],
                            get_num_classes(yaml_dict),
                            use_dropout,
                            numpy_rng,
                            theano_rng,
                            self.id_layers)

    self.cam_dir_layers = []
    add_regressor_mlp_conv(self.shared_layers[-1],
                           yaml_dict['model']['cam_dir_layers'],
                           3,
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.cam_dir_layers)

    self.input_node = image_node
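# A small numpy sketch of why add_affine_layers_conv needs the (2, 2)
# filter shape above: a dense layer applied to a flattened 2x2 feature map
# is equivalent to a convolution whose kernel covers the full 2x2 map, so
# the same weights can slide over larger inputs. Shapes here are toy
# assumptions.
import numpy

features = numpy.arange(4.0).reshape(2, 2)   # one-channel 2x2 conv output
dense_w = numpy.arange(4.0)                  # dense weights over 4 inputs
dense_out = dense_w.dot(features.ravel())

conv_kernel = dense_w.reshape(2, 2)          # same weights as a 2x2 kernel
conv_out = (conv_kernel * features).sum()    # single valid-mode position
assert dense_out == conv_out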
def __init__(self, image_node, yaml_dict, numpy_rng, theano_rng):
    '''
    Parameters
    ----------
    image_node: InputNode

    yaml_dict: dict
    '''
    super(IdPoseLightingModel, self).__init__()

    self.input_node = image_node

    # preprocessing layers
    self.shared_layers = []
    self.shared_layers.append(RgbToGray(image_node))
    self.shared_layers.append(Lcn(self.shared_layers[-1]))

    assert_is_subdtype(image_node.output_format.dtype, numpy.floating)

    use_dropout = yaml_dict['hyperparams']['use_dropout']

    add_conv_layers(self.shared_layers[-1],
                    yaml_dict['model']['shared_layers']['conv'],
                    use_dropout,
                    numpy_rng,
                    theano_rng,
                    self.shared_layers)

    add_affine_layers_conv(self.shared_layers[-1],
                           yaml_dict['model']['shared_layers']['affine'],
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.shared_layers)

    def get_num_classes(yaml_dict):
        fg_path = yaml_dict['datasets']['training']['fg_path']
        dataset = MemmapDataset(os.path.join(data_path, fg_path))
        label_to_id = NorbLabelToObjectIdConverter(dataset.tensors[1])
        return label_to_id.num_unique_ids

    self.id_layers = []
    add_classifier_mlp_conv(self.shared_layers[-1],
                            yaml_dict['model']['id_layers'],
                            get_num_classes(yaml_dict),
                            use_dropout,
                            numpy_rng,
                            theano_rng,
                            self.id_layers)

    self.cam_dir_layers = []
    add_regressor_mlp_conv(self.shared_layers[-1],
                           yaml_dict['model']['cam_dir_layers'],
                           3,
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.cam_dir_layers)

    def get_num_lightings(yaml_dict):
        '''
        Returns the number of non-blank lighting values.
        '''
        fg_path = yaml_dict['datasets']['training']['fg_path']
        dataset = MemmapDataset(os.path.join(data_path, fg_path))
        lighting_labels = dataset.tensors[1][:, 4]

        assert_equal(lighting_labels[0], -1)
        assert_array_compare(numpy.greater_equal, lighting_labels[1:], 0)
        assert_array_compare(numpy.less, lighting_labels[1:], 4)

        num_valid_lighting_values = len(frozenset(lighting_labels[1:]))
        assert_equal(num_valid_lighting_values, 4)
        return num_valid_lighting_values

    self.lighting_layers = []
    add_classifier_mlp_conv(self.shared_layers[-1],
                            yaml_dict['model']['lighting_layers'],
                            get_num_lightings(yaml_dict),
                            use_dropout,
                            numpy_rng,
                            theano_rng,
                            self.lighting_layers)

    self.rc_shift_layers = []
    add_regressor_mlp_conv(self.shared_layers[-1],
                           yaml_dict['model']['rc_shift_layers'],
                           2,
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.rc_shift_layers)

    self.scale_layers = []
    add_regressor_mlp_conv(self.shared_layers[-1],
                           yaml_dict['model']['scale_layers'],
                           1,
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.scale_layers)

    self.roll_layers = []
    add_regressor_mlp_conv(self.shared_layers[-1],
                           yaml_dict['model']['roll_layers'],
                           1,
                           use_dropout,
                           numpy_rng,
                           theano_rng,
                           self.roll_layers)
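# A toy illustration (assumed label layout, mirroring the asserts in
# get_num_lightings above): row 0 is the blank example with lighting -1,
# and the real examples use exactly the lighting values 0..3.
import numpy

lighting = numpy.array([-1, 0, 1, 2, 3, 2, 0])
assert lighting[0] == -1
assert (lighting[1:] >= 0).all() and (lighting[1:] < 4).all()
assert len(frozenset(lighting[1:])) == 4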
def __init__(self,
             parameter,
             gradient,  # see (*) below
             learning_rate,
             momentum,
             use_nesterov):
    # (*): We pass in the gradient, rather than the cost, since there are
    # different ways to generate the gradient expression, and we want to
    # allow the user to choose different ones, rather than generating the
    # gradient here ourselves. In particular, the 'consider_constant'
    # argument to theano.gradient.grad() could be of interest to the user.
    # (It's a list of symbols to consider constant, and thus not
    # backpropagate through.)
    '''
    Parameters
    ----------
    parameter: A theano symbol
      A parameter being optimized by an Sgd trainer.

    gradient: A theano symbol
      The gradient of the loss function w.r.t. the above parameter.

    learning_rate: float
      The initial value of the learning rate.

    momentum: float
      A parameter affecting how smeared the update direction is over
      multiple batches. Use 0.0 for momentum-less SGD.

    use_nesterov: bool
      If true, use Nesterov momentum. (See "Advances in Optimizing
      Recurrent Networks", Yoshua Bengio, et al.)
    '''

    #
    # sanity-check args
    #

    assert_is_instance(parameter, theano.tensor.sharedvar.SharedVariable)
    assert_is_instance(gradient, theano.gof.Variable)
    assert_equal(parameter.broadcastable, gradient.broadcastable,
                 "If an Op's .grad() method is buggy, it can return "
                 "broadcast masks.")
    assert_is_subdtype(gradient.dtype, numpy.floating)
    assert_greater_equal(learning_rate, 0)
    assert_greater_equal(momentum, 0)
    assert_is_instance(use_nesterov, bool)

    floatX = theano.config.floatX

    if str(gradient.dtype) != str(floatX):
        gradient = theano.tensor.cast(gradient, floatX)

    #
    # define updates, set members
    #

    def concat(str0, str1):
        '''
        Like str0 + str1, except returns None if either is None.
        '''
        if str0 is None or str1 is None:
            return None
        else:
            return str0 + str1

    def make_shared_floatX(numeric_var, name, **kwargs):
        return theano.shared(numpy.asarray(numeric_var, dtype=floatX),
                             name=name,
                             **kwargs)

    self.learning_rate = make_shared_floatX(
        learning_rate,
        concat(parameter.name, ' learning rate'))

    self.momentum = make_shared_floatX(momentum,
                                       concat(parameter.name, ' momentum'))

    decay_rate = 0.1
    self.decay_rate = make_shared_floatX(
        decay_rate,
        concat(parameter.name, ' decay rate'))

    self._velocity = make_shared_floatX(
        0.0 * parameter.get_value(),
        concat(parameter.name, ' velocity'),
        broadcastable=parameter.broadcastable)

    self.mean_square = make_shared_floatX(
        0.0 * parameter.get_value(),
        concat(parameter.name, ' MeanSquare'),
        broadcastable=parameter.broadcastable)

    # Running mean of the squared gradient, as in RMSProp.
    new_mean_square = (self.decay_rate * self.mean_square +
                       (1 - self.decay_rate) *
                       theano.tensor.sqr(gradient))
    new_mean_square.name = concat('new ', self.mean_square.name)

    # Scale the gradient by its root-mean-square, with a stabilizing
    # constant (0.6) added to the denominator.
    new_velocity = (self.momentum * self._velocity -
                    self.learning_rate *
                    (gradient /
                     (theano.tensor.sqrt(new_mean_square) + 0.6)))
    new_velocity.name = concat('new ', self._velocity.name)

    assert_equal(str(new_velocity.dtype), str(floatX))
    assert_equal(self._velocity.broadcastable, new_velocity.broadcastable)

    step = (self.momentum * new_velocity - self.learning_rate * gradient
            if use_nesterov
            else new_velocity)

    assert_equal(parameter.broadcastable, step.broadcastable)

    new_parameter = parameter + step
    new_parameter.name = concat('new ', parameter.name)

    self.updates = OrderedDict([(parameter, new_parameter),
                                (self._velocity, new_velocity),
                                (self.mean_square, new_mean_square)])
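# A toy numpy sketch (illustrative values only) of the recurrence above:
# the running mean square tracks the squared gradient, so the effective
# step shrinks where gradients stay large. Mirrors decay_rate = 0.1 and
# the 0.6 denominator constant from the updater.
import numpy

decay_rate, learning_rate, gradient = 0.1, 0.01, 2.0
mean_square = 0.0
for _ in range(5):
    mean_square = decay_rate * mean_square + (1 - decay_rate) * gradient ** 2
step = -learning_rate * gradient / (numpy.sqrt(mean_square) + 0.6)
# mean_square converges to gradient**2 == 4.0, so step approaches
# -0.01 * 2.0 / (2.0 + 0.6) ~= -0.0077.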