Example #1
    def add_channel(self,
                    name,
                    ipt,
                    val,
                    dataset=None,
                    prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per new batch of
            data drawn *from dataset* before the channel value is
            computed. If two channels provide a prereq with exactly
            the same id, that prereq will only be called once.
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt
        """
        if six.PY3:
            numeric = (float, int)
        else:
            numeric = (float, int, long)  # noqa

        if isinstance(val, numeric):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt, )

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and dataset_source is not None
                        and (not isinstance(dataset_source, tuple)
                             or len(dataset_source) == 1)
                        and 'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2
                      and dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s" %
                                     (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt, )
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(
                ValueError("The dataset specified is not one of the " +
                           "monitor's datasets"))

        if ((self.on_channel_conflict
             not in ('error', 'copy_history', 'overwrite'))):
            raise ValueError("on_channel_conflict should be either 'error'" +
                             "'copy_history', or 'overwrite'")

        if name in self.channels and self.on_channel_conflict == 'error':
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        elif ((name in self.channels
               and self.on_channel_conflict == 'copy_history')):
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs,
                                                 self.channels[name])
        elif ((name not in self.channels
               or self.on_channel_conflict == 'overwrite')):
            self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                                 dataset, prereqs)
        self._dirty = True
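
For context, a minimal usage sketch (hedged: `model`, `monitor`, and `train_set` are assumed to exist and are not part of the listing). It registers a channel tracking the mean of the input batch, passing an explicit data_specs so the default-inference warning above is never triggered:

# Illustrative only: `monitor` is a Monitor, `train_set` a Dataset.
input_space = model.get_input_space()
X = input_space.make_theano_batch(name='X')
monitor.add_channel(name='mean_input',
                    ipt=X,
                    val=X.mean(),
                    dataset=train_set,
                    data_specs=(input_space, 'features'))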
Example #2
    def setup(self,
              dataset,
              cost,
              batch_size,
              num_batches=None,
              extra_costs=None,
              mode='sequential',
              obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the cost's `expr` method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            The value of each cost will appear under the corresponding
            channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(
            space.make_theano_batch(name='monitor_%s' % source,
                                    batch_size=None)
            for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name], cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a (channel, prereqs) pair; this is not supported.
            channels[name] = (model_channels[name], nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # The channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
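
A hedged sketch of how this method is typically driven (the dataset names and sizes are illustrative, not taken from the listing). With a dict of datasets, every channel is replicated once per dataset, prefixed with the dataset name:

# Illustrative only: `model`, `cost`, `train_set`, and `valid_set` are assumed to exist.
monitor = Monitor.get_monitor(model)
monitor.setup(dataset={'train': train_set, 'valid': valid_set},
              cost=cost,
              batch_size=100,
              num_batches=10,
              mode='sequential')
# Produces channels such as 'train_objective' and 'valid_objective'.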
Example #3
    def setup(self, model, dataset):
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any([
                np.any(np.isnan(param.get_value()))
                for param in model.get_params()
        ]):
            nan_params = [
                param for param in model.get_params()
                if np.any(np.isnan(param.get_value()))
            ]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError(
                                "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                            )
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to track the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = list(self.monitoring_dataset.keys())[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(
                self.learning_rule.get_updates(learning_rate, grads,
                                               lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(
                dict(
                    safe_zip(params, [
                        param - learning_rate * lr_scalers.get(param, 1.) *
                        grads[param] for param in params
                    ])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params
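
The flatten/nest round trip used above can be shown in isolation. A small sketch with made-up spaces (the dimensions and source names are assumptions, not from the listing):

# Illustrative only: a nested (space, source) pair with two components.
space = CompositeSpace([VectorSpace(dim=784), VectorSpace(dim=10)])
source = ('features', 'targets')
mapping = DataSpecsMapping((space, source))
flat_spaces = mapping.flatten(space, return_tuple=True)
flat_sources = mapping.flatten(source, return_tuple=True)
# One Theano variable per unique (space, source) pair...
args = tuple(sp.make_theano_batch(name='%s_batch' % src)
             for sp, src in safe_zip(flat_spaces, flat_sources))
# ...then nested back into the structure the cost's methods expect.
nested_args = mapping.nest(args)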
Example #4
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels are compiled as part of the same theano function
        so that the theano optimizations can eliminate subexpressions
        that are shared between multiple channels.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = []
        for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size):
            it.append(
                d.iterator(mode=i,
                           num_batches=n,
                           batch_size=b,
                           data_specs=self._flat_data_specs,
                           return_tuple=True))
        self.num_examples = [i.num_examples for i in it]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel needs any data, so there is no need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                val = T.cast(
                    channel.val * T.cast(batch_size, 'float64') /
                    cur_num_examples, config.floatX)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(
                    function(theano_args,
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])
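
The per-channel accumulation above (channel.val * batch_size / cur_num_examples, summed over batches) is an example-weighted mean. A small NumPy sketch of the same arithmetic with made-up numbers:

import numpy as np

# Two batches of unequal size; each per-batch value is that batch's mean.
batch_vals = [0.5, 0.8]
batch_sizes = [64, 36]
num_examples = float(sum(batch_sizes))

running = 0.0
for val, size in zip(batch_vals, batch_sizes):
    running += val * size / num_examples  # mirrors the shared-variable update
# running == (0.5 * 64 + 0.8 * 36) / 100 == 0.608, the mean over all 100 examples.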
Example #5
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify the model based on
        the dataset.

        Parameters
        ----------
        model : object
            Python object representing the model to train, loosely
            implementing the interface of models.model.Model.

        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self._synchronize_batch_size(model)

        self.model = model

        self.monitor = Monitor.get_monitor(model)

        if self.monitoring_dataset is not None:
            # Get the data specifications needed by the model
            space, source = model.get_monitoring_data_specs()

            # Create Theano variables for each of the individual components
            # of that data. Usually, it will be X for inputs and Y for targets.
            # First, we need to find these components, and put them in a tuple
            mapping = DataSpecsMapping((space, source))
            space_tuple = mapping.flatten(space, return_tuple=True)
            source_tuple = mapping.flatten(source, return_tuple=True)
            # Then, build a flat tuple of these Theano variables
            ipt = tuple(
                sp.make_theano_batch(name='monitor_%s' % src)
                for (sp, src) in safe_zip(space_tuple, source_tuple))
            # Finally, organize them back into a structure expected by the
            # monitoring channels of the model
            nested_ipt = mapping.nest(ipt)

            channels = model.get_monitoring_channels(nested_ipt)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]

                if (self.monitoring_batch_size is None
                        and self.monitoring_batches == -1):
                    self.monitoring_batch_size = self.batch_size
                    self.monitoring_batches = self.batches_per_iter
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.monitoring_batch_size,
                                         num_batches=self.monitoring_batches)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=nested_ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs,
                                             data_specs=(space, source))

        self.first = True
        self.bSetup = True
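
For reference, a hedged sketch of the return shape the loop above expects from model.get_monitoring_channels (channel names and expressions are illustrative). Each value may be either a Theano expression or an (expression, prereqs) pair, which the loop unpacks:

# Illustrative only; a real implementation lives on a pylearn2 Model subclass.
from collections import OrderedDict

def get_monitoring_channels(self, data):
    X = data  # structure follows get_monitoring_data_specs(); may be (X, Y)
    channels = OrderedDict()
    channels['mean_activation'] = X.mean()
    # Optionally an (expression, prereqs) pair:
    # channels['misclass'] = (misclass_expr, [some_prereq_callable])
    return channels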
Example #6
    def train(self, dataset):
        """
        Runs one epoch of SGD training on the specified dataset.

        Parameters
        ----------
        dataset : Dataset
        """

        if not hasattr(self, 'd_func'):
            raise Exception("train called without first calling setup")

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.first = False
        rng = self.rng
        if not is_stochastic(self.train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)

        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with SGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=self.train_iteration_mode,
                                    batch_size=self.batch_size,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng,
                                    num_batches=self.batches_per_iter)

        on_load_batch = self.on_load_batch
        i = 0
        for batch in iterator:
            for callback in on_load_batch:
                callback(*batch)
            if self.train_generator and i == self.discriminator_steps:
                self.g_func(*batch)
                i = 0
            else:
                self.d_func(*batch)
                i += 1
            # iterator might return a smaller batch if dataset size
            # isn't divisible by batch_size
            # Note: if data_specs[0] is a NullSpace, there is no way to know
            # how many examples would actually have been in the batch,
            # since it was empty, so actual_batch_size would be reported as 0.
            actual_batch_size = flat_data_specs[0].np_batch_size(batch)
            self.monitor.report_batch(actual_batch_size)
            for callback in self.update_callbacks:
                callback(self)

        # Make sure none of the parameters have bad values
        for param in self.params:
            value = param.get_value(borrow=True)
            if np.any(np.isnan(value)) or np.any(np.isinf(value)):
                raise Exception("NaN in " + param.name)

        self.train_generator = not self.train_generator
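
A minimal hypothetical driver for the two methods above (in practice pylearn2's Train loop handles termination and monitoring; `algorithm`, `model`, `train_set`, and `num_epochs` are assumptions):

# Illustrative only: one setup call, then repeated epochs of train().
algorithm.setup(model, train_set)
for epoch in range(num_epochs):   # a made-up stopping rule
    algorithm.train(train_set)    # alternates d_func / g_func batches as shown above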
Example #7
    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any([
                np.any(np.isnan(param.get_value()))
                for param in model.get_params()
        ]):
            nan_params = [
                param for param in model.get_params()
                if np.any(np.isnan(param.get_value()))
            ]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # Check that force_batch_size and the batch size are compatible
        if getattr(model, "force_batch_size", False) and \
           any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for
               dataset in self.monitoring_dataset.values()) and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        # Set up monitor to track the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            if (self.monitoring_batch_size is None
                    and self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.monitoring_batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            dataset_name = list(self.monitoring_dataset.keys())[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=None,
                                     val=learning_rate,
                                     data_specs=(NullSpace(), ''),
                                     dataset=monitoring_dataset)

            if self.learning_rule:
                self.learning_rule.add_channels_to_monitor(
                    self.monitor, monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i
        self.params = params

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(
                str(type(self.cost)) + ".get_gradients returned " +
                "something with" + str(type(grads)) + "as its " +
                "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        assert len(updates.keys()) == 0

        def get_func(learn_discriminator,
                     learn_generator,
                     dont_you_fucking_dare_touch_the_generator=False):

            updates = OrderedDict()

            assert (learn_discriminator or learn_generator
                    ) and not (learn_discriminator and learn_generator)

            if learn_discriminator:
                cur_params = model.discriminator.get_params()
            else:
                cur_params = model.generator.get_params()

            def check():
                for param in params:
                    if param not in cur_params:
                        assert param not in updates

            cur_grads = OrderedDict()
            for param in cur_params:
                cur_grads[param] = grads[param]

            for param in grads:
                if grads[param].name is None and cost_value is not None:
                    grads[param].name = ('grad(%(costname)s, %(paramname)s)' %
                                         {
                                             'costname': cost_value.name,
                                             'paramname': param.name
                                         })
                assert grads[param].dtype == param.dtype

            cur_lr_scalers = OrderedDict()
            for param in cur_params:
                if param in lr_scalers:
                    lr_scaler = lr_scalers[param]
                    cur_lr_scalers[param] = lr_scaler

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = param.name
                if param_name is None:
                    param_name = 'anon_param'
                lr = learning_rate.get_value() * cur_lr_scalers.get(param, 1.)
                log.info('\t' + param_name + ': ' + str(lr))

            updates.update(
                self.learning_rule.get_updates(learning_rate, cur_grads,
                                               cur_lr_scalers))
            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()
            model.modify_updates(updates)
            check()
            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                         update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                         update.name)

            check()

            if dont_you_fucking_dare_touch_the_generator:
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                return function(theano_args,
                                updates=updates,
                                name='sgd_update',
                                on_unused_input='ignore',
                                mode=self.theano_function_mode)

        self.d_func = get_func(1,
                               0,
                               dont_you_fucking_dare_touch_the_generator=True)
        self.g_func = get_func(0, 1)
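
get_func above hands cur_grads and cur_lr_scalers to self.learning_rule.get_updates. A hedged stand-in showing the contract that call relies on (plain SGD, matching the fixed-learning-rate fallback used in the other examples; the class name is made up):

from collections import OrderedDict

class PlainSGDRule(object):
    """Illustrative stand-in for a pylearn2 LearningRule."""

    def get_updates(self, learning_rate, grads, lr_scalers=None):
        # Map each parameter to its updated expression; real rules (Momentum,
        # AdaDelta, RMSProp, ...) also add updates for their own shared state.
        lr_scalers = lr_scalers or {}
        updates = OrderedDict()
        for param, grad in grads.items():
            updates[param] = (param -
                              learning_rate * lr_scalers.get(param, 1.) * grad)
        return updates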
Example #8
    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify the model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train, loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but " +
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of functions f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # This closure captures "mapping" and "f", while
        # enabling the "*data" syntax.
        def capture(f, mapping=mapping):
            new_f = lambda *args: f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               obj_prereqs=obj_prereqs,
                               cost_monitoring_args=fixed_var_descr.fixed_vars)

            # TODO : Why is this commented?
            '''
            channels = model.get_monitoring_channels(theano_args)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, theano_args, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                    mode="sequential",
                                    batch_size=self.batch_size,
                                    num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=cost_value,
                        dataset = monitoring_dataset, prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             data_specs=data_specs,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)
                '''

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which need neither inputs nor data.
        if self.monitoring_dataset is not None:
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=None,
                val=self.optimizer.ave_step_size,
                data_specs=(NullSpace(), ''),
                dataset=list(self.monitoring_dataset.values())[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=None,
                val=self.optimizer.ave_grad_size,
                data_specs=(NullSpace(), ''),
                dataset=list(self.monitoring_dataset.values())[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=None,
                val=self.optimizer.ave_grad_mult,
                data_specs=(NullSpace(), ''),
                dataset=list(self.monitoring_dataset.values())[0])

        self.first = True
        self.bSetup = True
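
The local capture helper above bridges two calling conventions: the monitor invokes each prereq as prereq(*batch), while the wrapped callback receives a single flat tuple built by mapping.flatten. A tiny hedged sketch of the same pattern with a dummy callback (all names illustrative):

# Illustrative only: `mapping` is a DataSpecsMapping as built in setup().
def capture(f, mapping=mapping):
    return lambda *args: f(mapping.flatten(args, return_tuple=True))

def report_shapes(flat_data):  # dummy stand-in for an on_load_batch callback
    print([getattr(d, 'shape', None) for d in flat_data])

obj_prereqs = [capture(report_shapes)]
# The monitor later calls each prereq as prereq(*batch) for every batch it draws.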
Example #9
    def setup(self, model, dataset):
        """
        Compiles the theano functions needed for the train method.

        Parameters
        ----------
        model : a Model instance
        dataset : Dataset
        """
        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if contains_inf(param.get_value())
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any(
            [contains_nan(param.get_value()) for param in model.get_params()]):
            nan_params = [
                param for param in model.get_params()
                if contains_nan(param.get_value())
            ]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        self._synchronize_batch_size(model)
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        # Check that force_batch_size and the batch size are compatible
        has_force_batch_size = getattr(model, "force_batch_size", False)
        train_dataset_is_uneven = \
            dataset.get_num_examples() % self.batch_size != 0

        has_monitoring_datasets = bool(self.monitoring_dataset)

        if has_monitoring_datasets:
            monitoring_datasets_are_uneven = \
                any(d.get_num_examples() % self.batch_size
                    != 0 for d in self.monitoring_dataset.values())
        else:
            monitoring_datasets_are_uneven = False  # irrelevant: no monitoring datasets

        if has_force_batch_size and train_dataset_is_uneven and \
           not has_uniform_batch_size(self.train_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set train_iteration_mode (and "
                             "maybe monitor_iteration_mode) to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        if has_force_batch_size and has_monitoring_datasets and \
           monitoring_datasets_are_uneven and \
           not has_uniform_batch_size(self.monitor_iteration_mode):

            raise ValueError("Dataset size is not a multiple of batch size."
                             "You should set monitor_iteration_mode to "
                             "even_sequential, even_shuffled_sequential or "
                             "even_batchwise_shuffled_sequential")

        data_specs = self.cost.get_data_specs(self.model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)

        if cost_value is not None and cost_value.name is None:
            # Concatenate the name of all tensors in theano_args !?
            cost_value.name = 'objective'

        learning_rate = self.learning_rate
        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        grads, updates = self.cost.get_gradients(model, nested_args,
                                                 **fixed_var_descr.fixed_vars)
        if not isinstance(grads, OrderedDict):
            raise TypeError(
                str(type(self.cost)) + ".get_gradients returned " +
                "something with" + str(type(grads)) + "as its " +
                "first member. Expected OrderedDict.")

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })
            assert grads[param].dtype == param.dtype

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +
                                 str(key) +
                                 " which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

        if self.learning_rule:
            updates.update(
                self.learning_rule.get_updates(learning_rate, grads,
                                               lr_scalers))
        else:
            # Use standard SGD updates with fixed learning rate.
            updates.update(
                dict(
                    safe_zip(params, [
                        param - learning_rate * lr_scalers.get(param, 1.) *
                        grads[param] for param in params
                    ])))

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.modify_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if contains_inf(update_val):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if contains_nan(update_val):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        # Set up monitor to track the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost.
        # We have to do that after learning_rule.get_updates has been
        # called, since it may have an effect on
        # learning_rule.add_channels_to_monitor (that is currently the case
        # for AdaDelta and RMSProp).
        self._setup_monitor()

        with log_timing(log, 'Compiling sgd_update'):
            self.sgd_update = function(theano_args,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params
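
Once setup has compiled self.sgd_update, one pass over the data reduces to calling it per batch. A hypothetical sketch (the iterator arguments mirror the train method shown earlier; flat_data_specs stands for the flattened specs built in setup):

# Illustrative only: `algorithm` is the SGD instance after setup(model, dataset).
iterator = dataset.iterator(mode='sequential',
                            batch_size=algorithm.batch_size,
                            data_specs=flat_data_specs,
                            return_tuple=True)
for batch in iterator:
    algorithm.sgd_update(*batch)  # one parameter update per batch
    algorithm.monitor.report_batch(flat_data_specs[0].np_batch_size(batch))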