Example #1
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        i_t = self.i + 1.
        fix1 = 1. - (1. - self.b1) ** i_t
        fix2 = 1. - (1. - self.b2) ** i_t
        lr_t = self.learning_rate * (T.sqrt(fix2) / fix1)

        for param in grads.keys():
            m = theano.shared(param.get_value() * 0.)
            self.parameters.append(m)
            v = theano.shared(param.get_value() * 0.)
            self.parameters.append(v)

            b1t = 1. - (1. - self.b1) * self.lmbda**(i_t - 1)
            m_t = b1t * grads[param] + (1. - b1t) * m
            v_t = self.b2 * T.sqr(grads[param]) + (1. - self.b2) * v
            g_t = m_t / (T.sqrt(v_t) + self.epsilon)
            p_t = param - (lr_t * g_t)

            updates[m] = m_t
            updates[v] = v_t
            updates[param] = p_t
        updates[self.i] = i_t

        return updates
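
For reference, the update above is the Adam rule with the bias correction folded into the learning rate, written in the "one minus beta" convention (b1 and b2 stand in for 1-beta1 and 1-beta2; the lmbda schedule on b1t is omitted here). A minimal numpy sketch of a single step, with hypothetical default values:

import numpy as np

def adam_step(param, grad, m, v, i, lr=1e-3, b1=0.1, b2=0.001, eps=1e-8):
    # i is the 1-based step count; fix1/fix2 undo the bias of the running averages
    fix1 = 1. - (1. - b1) ** i
    fix2 = 1. - (1. - b2) ** i
    lr_t = lr * np.sqrt(fix2) / fix1
    m = b1 * grad + (1. - b1) * m       # first-moment running average
    v = b2 * grad ** 2 + (1. - b2) * v  # second-moment running average
    param = param - lr_t * m / (np.sqrt(v) + eps)
    return param, m, v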
Example #2
    def get_gradients(self, model, data, ** kwargs):
        #print 'get_gradients'
        pos_v = data
        #pos_h = model.sample_h_given_v(pos_v)[-1]
        #chain_start = pos_v
        #h_samples = pos_h
        #print 'v_samples', v_samples.ndim
        [act_hids, hid_mfs, hid_samples, act_vis, vis_mfs, vis_samples], scan_updates = theano.scan(
            fn=model.gibbs_vhv, sequences=None,
            outputs_info=[None, None, None, None, None, pos_v],
            non_sequences=None, n_steps=self.k)
        neg_v = vis_samples[-1]
        #neg_h = hid_samples[-1]
        
        cost = -(- model.free_energy(pos_v).mean() + model.free_energy(neg_v).mean())

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs = 'ignore', consider_constant=[pos_v, neg_v])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        
        updates.update(scan_updates) # add scan_updates

        return gradients, updates
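
For reference, the doubly negated cost above is the standard contrastive-divergence objective; the same value can be written more directly as:

        cost = model.free_energy(pos_v).mean() - model.free_energy(neg_v).mean()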
Example #3
    def get_monitoring_channels(self, model, X, Y=None, ** kwargs):
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " +
                             "costs in the sum are supervised costs")

        rval = OrderedDict()

        for i, cost in enumerate(self.costs):
            try:
                rval.update(cost.get_monitoring_channels(model, X, Y, **kwargs))
            except TypeError:
                print 'SumOfCosts.get_monitoring_channels encountered TypeError while calling ' \
                        + str(type(cost))+'.get_monitoring_channels'
                raise

            Y_to_pass = Y
            if not cost.supervised:
                Y_to_pass = None

            value = cost(model, X, Y_to_pass, ** kwargs)
            if value is not None:
                name = ''
                if hasattr(value, 'name') and value.name is not None:
                    name = '_' + value.name
                rval['term_'+str(i)+name] = value

        return rval
Example #4
    def get_params(self):
        """
        This returns the list of theano shared variables that will be trained by the :class:`Optimizer`.
        These parameters are used in the gradient.

        This includes all of the parameters in every model in the Prototype, without duplication.

        Returns
        -------
        dict(str: SharedVariable)
            Dictionary of {string_name: theano shared variables} to be trained with an :class:`Optimizer`.
            These are the parameters to be trained.
        """
        params = OrderedDict()
        model_index = 0
        for model in self.models:
            if isinstance(model, Model):
                model_params = model.get_params()
                # append the parameters only if they aren't already in the list!
                for name, param in model_params.items():
                    if param not in list(params.values()):
                        name = model._classname + '_%d_' % model_index + name
                        params[name] = param
                model_index += 1
        return params
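
For reference, the de-duplication loop above keeps a shared parameter only once even when several models reference it. A small self-contained sketch of the same pattern, with placeholder objects standing in for theano shared variables (names here are illustrative, not the Prototype API):

from collections import OrderedDict

shared_W = object()                      # stands in for a theano shared variable
models = [('mlp', {'W': shared_W, 'b': object()}),
          ('rbm', {'W': shared_W})]      # both models reference the same W

params = OrderedDict()
for idx, (classname, model_params) in enumerate(models):
    for name, param in model_params.items():
        if param not in list(params.values()):
            params[classname + '_%d_' % idx + name] = param

assert list(params.values()).count(shared_W) == 1   # W kept only once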
Example #5
    def get_gradients(self, model, data, ** kwargs):
        indiv_results = []
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)
        for cost, cost_data in safe_zip(self.costs, nested_data):
            result = cost.get_gradients(model, cost_data, ** kwargs)
            indiv_results.append(result)

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, packed in zip(self.coeffs, indiv_results):
            g, u = packed
            for param in g:
                if param not in params:
                    raise ValueError("A shared variable (" +
                                     str(param) +
                                     ") that is not a parameter appeared "
                                     "a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
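
For reference, the inner loop above forms a weighted sum of the per-cost gradient dictionaries. A self-contained sketch of the same accumulation with plain numpy arrays keyed by hypothetical parameter names (not the SumOfCosts API):

from collections import OrderedDict
import numpy as np

per_cost_grads = [{'W': np.ones(3)}, {'W': 2. * np.ones(3)}]   # hypothetical gradients
coeffs = [0.5, 1.0]

grads = OrderedDict()
for coeff, g in zip(coeffs, per_cost_grads):
    for param, value in g.items():
        v = coeff * value
        grads[param] = grads[param] + v if param in grads else v

assert np.allclose(grads['W'], 2.5)   # 0.5 * 1 + 1.0 * 2 per element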
Example #6
    def __init__(self, valid=None, invalid=None, valid_equivalent=None):
        '''
        Check if variables can be expressed without using variables in invalid.

        valid_equivalent provides a dictionary mapping some invalid
        variables to valid ones that can be used instead.
        '''

        if valid is None:
            valid = []
        if invalid is None:
            invalid = []
        if valid_equivalent is None:
            valid_equivalent = OrderedDict()

        # Nodes that are valid to have in the graph computing outputs
        self.valid = set(valid)

        # Nodes that are NOT valid to have in the graph computing outputs
        self.invalid = set(invalid)

        # Mapping from invalid variables to equivalent valid ones.
        self.valid_equivalent = valid_equivalent.copy()
        self.valid.update(valid_equivalent.values())
        self.invalid.update(valid_equivalent.keys())
Example #7
    def get_gradients(self, model, data, ** kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
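
For reference, the clipping above rescales every gradient by max_magnitude / ||g|| once the joint L2 norm exceeds max_magnitude, and falls back to 0.1 * param when the norm is not finite. The finite case, as a plain numpy sketch (illustrative helper, not the original class):

import numpy as np

def clip_by_global_norm(grads, max_magnitude):
    # grads: list of numpy arrays belonging to one update step
    norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    scale = max_magnitude / norm if norm >= max_magnitude else 1.
    return [g * scale for g in grads]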
Example #8
    def get_gradients(self, model, data, ** kwargs):
        #print 'get_gradients'
        chain_start = theano.shared(numpy.zeros(shape=(self.chain_num, model.n_vis), dtype=theano.config.floatX), name='chain_start', borrow=True)
        
        [act_hids, hid_mfs, hid_samples, act_vis, vis_mfs, vis_samples], scan_updates = theano.scan(
            fn=model.gibbs_vhv, sequences=None,
            outputs_info=[None, None, None, None, None, chain_start],
            non_sequences=None, n_steps=self.k)
    
        chain_end = vis_samples[-1]
        scan_updates[chain_start] = chain_end
        
        pos_v = data 
        
        cost = -(- model.free_energy(pos_v).mean() + model.free_energy(chain_end).mean())

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs = 'ignore', consider_constant=[pos_v, chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        
        updates.update(scan_updates)  # manually added

        return gradients, updates
Example #9
    def get_gradients(self, model, data, ** kwargs):
        #print 'get_gradients'
        chain_start = theano.shared(numpy.zeros(shape=(self.chain_num, model.n_vis)), name=None, borrow=True)
        v_samples = chain_start
        
        for i in xrange(self.k):
            v_samples = model.gibbs_vhv(v_samples)[-1]
        chain_end = v_samples
        #print 'chain_end', chain_end.ndim
        chain_updates = {}
        chain_updates[chain_start] = chain_end
        
        pos_v = data
        #neg_v = self.get_neg_v(model)
        
        cost = -(- model.free_energy(pos_v).mean() + model.free_energy(chain_end).mean())

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs = 'ignore', consider_constant=[chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        
        updates.update(chain_updates)  # manually added

        return gradients, updates
Example #10
    def orderings(self):
        """
        Return dict d s.t. d[node] is a list of nodes that must be evaluated
        before node itself can be evaluated.

        This is used primarily by the destroy_handler feature to ensure that
        all clients of any destroyed inputs have already computed their
        outputs.

        :note: This only calls the orderings() function on all features. It
               does not take care of computing the dependencies by itself.

        """
        ords = OrderedDict()
        assert isinstance(self._features, list)
        for feature in self._features:
            if hasattr(feature, 'orderings'):
                orderings = feature.orderings(self)
                if not isinstance(orderings, OrderedDict):
                    raise TypeError("Non-deterministic return value from " +
                                    str(feature.orderings) +
                                    ". Nondeterministic object is " +
                                    str(orderings))
                for node, prereqs in orderings.items():
                    if not isinstance(prereqs, (list, OrderedSet)):
                        raise TypeError(
                            "prereqs must be a type with a "
                            "deterministic iteration order, or toposort "
                            "will be non-deterministic.")
                    ords.setdefault(node, []).extend(prereqs)
        # eliminate duplicate prereqs
        for (node, prereqs) in ords.items():
            ords[node] = list(OrderedSet(prereqs))
        return ords
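
For reference, the setdefault/extend pattern followed by the OrderedSet pass is an order-preserving merge with duplicate removal. A tiny sketch, using OrderedDict.fromkeys in place of the OrderedSet used above:

from collections import OrderedDict

ords = OrderedDict()
for node, prereqs in [('c', ['a']), ('c', ['b', 'a'])]:   # two features constrain node 'c'
    ords.setdefault(node, []).extend(prereqs)
for node, prereqs in ords.items():
    ords[node] = list(OrderedDict.fromkeys(prereqs))       # drop duplicates, keep order
assert ords['c'] == ['a', 'b']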
Example #11
    def get_monitoring_channels(self, model, data, **kwargs):
        self.get_data_specs(model)[0].validate(data)
        rval = OrderedDict()
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        for i, cost in enumerate(self.costs):
            cost_data = nested_data[i]
            try:
                channels = cost.get_monitoring_channels(model, cost_data, **kwargs)
                rval.update(channels)
            except TypeError:
                print (
                    "SumOfCosts.get_monitoring_channels encountered "
                    "TypeError while calling " + str(type(cost)) + ".get_monitoring_channels"
                )
                raise

            value = cost.expr(model, cost_data, **kwargs)
            if value is not None:
                name = ""
                if hasattr(value, "name") and value.name is not None:
                    name = "_" + value.name
                rval["term_" + str(i) + name] = value

        return rval
Example #12
    def get_monitoring_channels(self, model, data, ** kwargs):
        self.get_data_specs(model)[0].validate(data)
        rval = OrderedDict()
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        for i, cost in enumerate(self.costs):
            cost_data = nested_data[i]
            try:
                channels = cost.get_monitoring_channels(model, cost_data,
                                                        **kwargs)
                rval.update(channels)
            except TypeError:
                logger.error('SumOfCosts.get_monitoring_channels encountered '
                             'TypeError while calling {0}'
                             '.get_monitoring_channels'.format(type(cost)))
                raise

            value = cost.expr(model, cost_data, ** kwargs)
            if value is not None:
                name = ''
                if hasattr(value, 'name') and value.name is not None:
                    name = '_' + value.name
                rval['term_' + str(i) + name] = value

        return rval
Example #13
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):

        W, = self.transformer.get_params()

        assert W.ndim == 4

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=(1, 2, 3)))

        rval = OrderedDict([
                           ('kernel_norms_min', row_norms.min()),
                           ('kernel_norms_mean', row_norms.mean()),
                           ('kernel_norms_max', row_norms.max()),
                           ])

        orval = super(CorrMMElemwise, self).get_monitoring_channels_from_state(state,
                                                                            targets)

        rval.update(orval)

        cst = self.cost
        orval = self.nonlin.get_monitoring_channels_from_state(state,
                                                               targets,
                                                               cost_fn=cst)

        rval.update(orval)

        return rval
Example #14
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # mean_square_grad := E[g^2]_{t-1}
            mean_square_grad = theano.shared(
                theano._asarray(param.get_value() * 0., dtype=theano.config.floatX),
                name='mean_square_grad_' + param.name, borrow=False)
            self.parameters.append(mean_square_grad)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = theano.shared(
                theano._asarray(param.get_value() * 0., dtype=theano.config.floatX),
                name='mean_square_dx_' + param.name, borrow=False)
            self.parameters.append(mean_square_dx)

            # Accumulate gradient
            new_mean_squared_grad = self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grads[param])

            # Compute update
            rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * grads[param]

            # Accumulate updates
            new_mean_square_dx = self.decay * mean_square_dx + (1 - self.decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
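
For reference, a plain numpy sketch of a single AdaDelta step following the same decay/epsilon bookkeeping as above (illustrative helper, not the optimizer class):

import numpy as np

def adadelta_step(param, grad, ms_grad, ms_dx, decay=0.95, eps=1e-6):
    ms_grad = decay * ms_grad + (1 - decay) * grad ** 2       # E[g^2]
    dx = -np.sqrt(ms_dx + eps) / np.sqrt(ms_grad + eps) * grad
    ms_dx = decay * ms_dx + (1 - decay) * dx ** 2             # E[(dx)^2]
    return param + dx, ms_grad, ms_dx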
Example #15
    class OrderedSet(object):
        """
        An implementation of OrderedSet based on the keys of
        an OrderedDict.
        """
        def __init__(self, iterable=None):
            self.data = OrderedDict()
            if iterable is not None:
                self.update(iterable)

        def update(self, container):
            check_deterministic(container)
            for elem in container:
                self.add(elem)

        def add(self, key):
            self.data[key] = None

        def __len__(self):
            return len(self.data)

        def __contains__(self, key):
            return key in self.data

        def discard(self, key):
            if key in self.data:
                del self.data[key]

        def remove(self, key):
            if key in self.data:
                del self.data[key]
            else:
                raise KeyError(key)

        def __iter__(self):
            return self.data.__iter__()

        def __reversed__(self):
            return self.data.__reversed__()

        def pop(self, last=True):
            raise NotImplementedError()

        def __eq__(self, other):
            # Note that we implement only the comparison to another
            # `OrderedSet`, and not to a regular `set`, because otherwise we
            # could have a non-symmetric equality relation like:
            #       my_ordered_set == my_set and my_set != my_ordered_set
            if isinstance(other, OrderedSet):
                return len(self) == len(other) and list(self) == list(other)
            elif isinstance(other, set):
                # Raise exception to avoid confusion.
                raise TypeError(
                        'Cannot compare an `OrderedSet` to a `set` because '
                        'this comparison cannot be made symmetric: please '
                        'manually cast your `OrderedSet` into `set` before '
                        'performing this comparison.')
            else:
                return NotImplemented
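
A short usage sketch of the OrderedSet class above (assuming the class is in scope and that check_deterministic accepts lists):

s = OrderedSet(['W', 'b'])
s.add('W')              # duplicates are ignored, insertion order is kept
s.update(['gamma'])
assert list(s) == ['W', 'b', 'gamma']
assert 'b' in s and len(s) == 3
s.discard('missing')    # no error, unlike remove()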
Example #16
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=train_collapsed.values(), updates=updates)
    f2 = theano.function(inputs=[], outputs=valid_collapsed.values(), updates=updates)
    log.debug('done')

    t1=time.time()

    for epoch in range(10):
        t=time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- '+make_time_units_string(time.time()-t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME "+make_time_units_string(time.time()-t1))
Example #17
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=NotImplementedError):

        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
                            ('row_norms_min'  , row_norms.min()),
                            ('row_norms_mean' , row_norms.mean()),
                            ('row_norms_max'  , row_norms.max()),
                            ('col_norms_min'  , col_norms.min()),
                            ('col_norms_mean' , col_norms.mean()),
                            ('col_norms_max'  , col_norms.max()),
                            ('class_row_norms_min'  , row_norms_class.min()),
                            ('class_row_norms_mean' , row_norms_class.mean()),
                            ('class_row_norms_max'  , row_norms_class.max()),
                            ('class_col_norms_min'  , col_norms_class.min()),
                            ('class_col_norms_mean' , col_norms_class.mean()),
                            ('class_col_norms_max'  , col_norms_class.max()),
                            ])


        if (state_below is not None) or (state is not None):
            if state is None:

                #for value in get_debug_values(state_below):
                    #print 'value is'+ value
                state = self.fprop(state_below, targets)
            #print state
            probclass, probcluster = state
            mx = probclass.max(axis=1)
            rval.update(OrderedDict([('mean_max_class',mx.mean()),
                                     ('max_max_class' , mx.max()),
                                     ('min_max_class' , mx.min())
                                    ]))
            if targets is not None:
                rval['nll'] = self.cost(Y=targets,Y_hat=(probclass,probcluster))
                rval['perplexity'] = 10 ** (rval['nll']/np.log(10).astype('float32'))
                rval['entropy'] = rval['nll']/np.log(2).astype('float32')
        return rval
Example #18
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            decreased_learning_rate = T.cast(
                self.learning_rate / (1 + self.decrease_constant * self.current_iteration),
                dtype=theano.config.floatX)
            updates[param] = param - decreased_learning_rate * grads[param]

        updates[self.current_iteration] = self.current_iteration + 1

        return updates
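
For reference, the schedule above is the classic 1 / (1 + c * t) decay. With hypothetical values learning_rate = 0.1 and decrease_constant = 1e-3, the effective rate falls off like this:

learning_rate, decrease_constant = 0.1, 1e-3
for t in (0, 1000, 10000):
    print(t, learning_rate / (1 + decrease_constant * t))
# 0 -> 0.1, 1000 -> 0.05, 10000 -> ~0.00909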
Example #19
    def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
        """
        Provides the updates for learning with gradient descent + momentum.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.
        """
        gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                             name='%s_grad' % p.name)
                             for p, g in grads.iteritems()})

        gsup = [(gs, g) for gs, g in zip(gshared.values(), grads.values())]
        get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
        gnorm = get_norms(grads.values())
        pnorm = get_norms(grads.keys())
        f_grad_shared = theano.function(inp,
                                        [cost, errors, gnorm, pnorm],
                                        updates=gsup)
        updates = OrderedDict()

        for param, grad in gshared.items():
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert grad.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = learning_rate * lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * grad

            inc = updates[vel]
            if self.nesterov_momentum:
                inc = self.momentum * inc - scaled_lr * grad

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        f_update = theano.function([learning_rate],
                                   [],
                                   updates=updates,
                                   on_unused_input='ignore')

        return f_grad_shared, f_update
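
For reference, the velocity and parameter updates above correspond to classical momentum, with an extra Nesterov correction applied to the increment. A plain numpy sketch (illustrative helper, not the get_funcs API):

import numpy as np

def momentum_step(param, grad, vel, lr=0.01, momentum=0.9, nesterov=False):
    vel = momentum * vel - lr * grad             # velocity update
    inc = momentum * vel - lr * grad if nesterov else vel
    return param + inc, vel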
Example #20
    def get_gradients(self, model, data, **kwargs):
        cost = self._cost(model, data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs="ignore", consider_constant=[self.sampler.particles])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        sampler_updates = self.sampler.updates()
        updates.update(sampler_updates)
        return gradients, updates
Example #21
    def on_attach(self, fgraph):
        """
        When attaching to a new fgraph, check that
            1) This DestroyHandler wasn't already attached to some fgraph
               (its data structures are only set up to serve one)
            2) The FunctionGraph doesn't already have a DestroyHandler.
               This would result in it validating everything twice, causing
               compilation to be slower.

        Give the FunctionGraph instance:
            1) A new method "destroyers(var)"
                TODO: what does this do exactly?
            2) A new attribute, "destroy_handler"
        TODO: WRITEME: what does this do besides the checks?
        """

        ####### Do the checking ###########
        already_there = False
        if self.fgraph is fgraph:
            already_there = True
        if self.fgraph is not None:
            raise Exception(
                "A DestroyHandler instance can only serve one"
                " FunctionGraph. (Matthew 6:24)")
        for attr in ('destroyers', 'destroy_handler'):
            if hasattr(fgraph, attr):
                already_there = True

        if already_there:
            # FunctionGraph.attach_feature catches AlreadyThere and cancels the attachment
            raise toolbox.AlreadyThere(
                "DestroyHandler feature is already present"
                " or in conflict with another plugin.")

        ####### Annotate the FunctionGraph ############
        self.unpickle(fgraph)
        fgraph.destroy_handler = self

        self.fgraph = fgraph
        self.destroyers = OrderedSet()  # set of Apply instances with non-null destroy_map
        self.view_i = OrderedDict()  # variable -> variable used in calculation
        self.view_o = OrderedDict()  # variable -> set of variables that use this one as a direct input
        # clients: how many times does an apply use a given variable
        self.clients = OrderedDict()  # variable -> apply -> ninputs
        self.stale_droot = True

        self.debug_all_apps = OrderedSet()
        if self.do_imports_on_attach:
            toolbox.Bookkeeper.on_attach(self, fgraph)
Example #22
    def __init__(self,
                 recurrent=[],
                 recurrent_dim=[],
                 self_recurrent=1,
                 clip_gradient = True,
                 clip_bound = 5,
                 init_U=InitCell('ortho'),
                 **kwargs):

        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()

        if self_recurrent:
            self.recurrent[self.name] = self.nout

        recurrent_dim = tolist(recurrent_dim)

        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None

        self.clip_gradient = clip_gradient
        self.clip_bound = clip_bound
        self.init_U = init_U
Example #23
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None
Example #24
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been trained at
        all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
            WRITEME
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()
Example #25
 def __init__(self,
              parent=[],
              parent_dim=[],
              nout=None,
              init_W=InitCell('randn'),
              init_b=InitCell('zeros'),
              cons=0.,
              name=None,
              lr_scaler=None,
              **kwargs):
     super(StemCell, self).__init__(**kwargs)
     if name is None:
         name = self.__class__.__name__.lower()
     self.name = name
     self.nout = nout
     self.init_W = init_W
     self.init_b = init_b
     self.cons = cons
     self.parent = OrderedDict()
     parent_dim = tolist(parent_dim)
     for i, par in enumerate(tolist(parent)):
         if len(parent_dim) != 0 and len(parent) != 0:
             if len(parent) != len(parent_dim):
                 raise AssertionError("You probably had a mistake providing,\
                                       write number of values. It will end,\
                                       up with a model containing a bug.")
             self.parent[par] = parent_dim[i]
         else:
             self.parent[par] = None
     self.params = OrderedDict()
     self.lr_scaler = lr_scaler
Example #26
 def __init__(self, dim, layer_name, irange, indices=None,
              init_bias=0., svd=True, nonlinearity=tensor.tanh):
     self.rnn_friendly = True
     self._scan_updates = OrderedDict()
     self.__dict__.update(locals())
     del self.self
     super(Recurrent, self).__init__()
Example #27
    def build_train_fn(self,):
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]
        
        self.gparams = T.grad(self.costs[0],self.params,consider_constant=self.consider_constant)
        if not self.momentum:
            print 'Building SGD optimization graph without momentum'
            updates = OrderedDict((i, i - self.lr_theano*j) for i, j in zip(self.params, self.gparams))
        else:
            print 'Building SGD optimization graph with momentum'
            updates = OrderedDict()
            for param,param_mom,gparam in zip(self.params,self.params_mom,self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs,self.costs)
        if self.updates_old:
            self.updates_old = copy.copy(self.updates_old)  # copy first, to avoid mutating the model's dict if the updates dict belongs to the model class (very unlikely case)
            self.updates_old.update(updates)
        else:
            self.updates_old = OrderedDict()
            self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)
Example #28
    def get_gradients(self, model, data, ** kwargs):

        cost_cd, cost_ci = model.cost_from_X(data)
        params_dict = model.get_params()
        params = list(params_dict)

        zero_grads = []
        if self.zero_ci_grad_for_cd:
            # how to get this in a less explicit way, i.e. using only the dict?
            assert model.layers[-1].M in params_dict
            assert model.layers[-1].m in params_dict
            zero_grads = [model.layers[-1].M, model.layers[-1].m]

        grads_cd = T.grad(cost_cd, params, disconnected_inputs = 'ignore', consider_constant=zero_grads)
        grads_ci = T.grad(cost_ci, params, disconnected_inputs = 'ignore')

        gradients_cd = OrderedDict(izip(params, grads_cd))
        gradients_ci = OrderedDict(izip(params, grads_ci))

        indiv_results = []
        indiv_results.append((gradients_cd, OrderedDict()))
        indiv_results.append((gradients_ci, OrderedDict()))

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, packed in zip([self.coeff_cd, self.coeff_ci], indiv_results):
            g, u = packed
            for param in g:
                if param not in params:
                    raise ValueError("A shared variable ("+str(param)+") that is not a parameter appeared in a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
Example #29
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 name,
                 parent=[],
                 parent_dim=[],
                 nout=None,
                 init_W=InitCell('randn'),
                 init_b=InitCell('zeros'),
                 cons=0.,
                 use_bias=1,
                 lr_scaler=None,
                 **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError("You probably had a mistake providing,\
                                          write number of values. It will end,\
                                          up with a model containing a bug.")
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.lr_scaler = lr_scaler
        self.use_bias = use_bias

    def fprop(self):
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def initialize(self):

        params = OrderedDict()

        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            params[W_name] = self.init_W.get(W_shape)

        if self.use_bias:
            params['b_'+self.name] = self.init_b.get(self.nout)

        return params
Example #30
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):

        # channels that do not require state information
#         if self.no_affine:
#             rval = OrderedDict()
#
#         W = self.W
# 
#         assert W.ndim == 2
# 
#         sq_W = T.sqr(W)
# 
#         row_norms = T.sqrt(sq_W.sum(axis=1))
#         col_norms = T.sqrt(sq_W.sum(axis=0))
# 
#         rval = OrderedDict([('row_norms_min',  row_norms.min()),
#                             ('row_norms_mean', row_norms.mean()),
#                             ('row_norms_max',  row_norms.max()),
#                             ('col_norms_min',  col_norms.min()),
#                             ('col_norms_mean', col_norms.mean()),
#                             ('col_norms_max',  col_norms.max()), ])

        rval = OrderedDict()
        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(OrderedDict([
                                ('mean_max_class', mx.mean()),
                                ('max_max_class', mx.max()),
                                ('min_max_class', mx.min())]))

            if targets is not None:
                y_hat = self.target_convert(T.argmax(state, axis=1))
                #Assume target is in [0,1] as binary one-hot
                y = self.target_convert(T.argmax(targets, axis=1))
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)

        return rval
Example #31
    def __init__(self,
                 dataset,
                 model=None,
                 epochs=10,
                 batch_size=100,
                 min_batch_size=1,
                 save_freq=None,
                 stop_threshold=None,
                 stop_patience=None,
                 learning_rate=1e-6,
                 lr_decay=None,
                 lr_decay_factor=None,
                 decay=0.95,
                 max_scaling=1e5,
                 grad_clip=None,
                 hard_clip=False):
        """
        Initialize RMSProp.

        Parameters
        ----------
        dataset : Dataset
            The Dataset to use when training the Model.
        model : Model
            The Model to train. Needed if the Optimizer isn't being passed to a Model's .train() method.
        epochs : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_decay_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        decay : float, optional
            Decay constant similar to that used in AdaDelta and Momentum methods.
        max_scaling : float, optional
            Restrict the RMSProp gradient scaling coefficient to values
            below `max_scaling`.
        grad_clip : float, optional
            Whether to clip gradients. This will clip with a maximum of grad_clip or the parameter norm.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        # need to call the Optimizer constructor
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(RMSProp, self).__init__(**initial_parameters)

        assert max_scaling > 0., "Max_scaling needs to be > 0."
        self.max_scaling = max_scaling
        self.epsilon = 1. / self.max_scaling
        self.decay = decay
        self.mean_square_grads = OrderedDict()
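
The constructor above only stores the decay constant and epsilon (= 1 / max_scaling); the update rule itself is not shown. Presumably it is a standard RMSProp step along these lines, sketched here in numpy with the docstring's default learning rate (a hedged sketch, not the opendeep implementation):

import numpy as np

def rmsprop_step(param, grad, ms_grad, lr=1e-6, decay=0.95, max_scaling=1e5):
    eps = 1. / max_scaling                        # as in the constructor above
    ms_grad = decay * ms_grad + (1 - decay) * grad ** 2
    scaling = np.maximum(np.sqrt(ms_grad), eps)   # caps the 1/rms coefficient at max_scaling
    return param - lr * grad / scaling, ms_grad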
Example #32
    def __init__(self, nh, nc, ne, de, cs, em, init, featdim):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size 
        '''
        # parameters of the model

        self.featdim = featdim

        tmp_emb = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))
        if init:
            for row in xrange(ne + 1):
                if em[row] is not None:
                    tmp_emb[row] = em[row]

        self.emb = theano.shared(tmp_emb.astype(
            theano.config.floatX))  # add one for PADDING at the end

        # weights for LSTM
        n_in = de * cs
        print "de,cs", de, cs
        # print  "n_i",n_i
        n_hidden = n_i = n_c = n_o = n_f = nh
        n_y = nc
        print "n_y", n_y
        print "n_hidden, n_i, n_c, n_o,nh", n_hidden, n_i, n_c, n_o, nh

        self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_i)).astype(dtype))
        self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_i)).astype(dtype))
        self.W_ci = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_i)).astype(dtype))
        self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5,
                                                           size=n_i)))
        self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_f)).astype(dtype))
        self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_f)).astype(dtype))
        self.W_cf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_f)).astype(dtype))
        self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f)))
        self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_c)).astype(dtype))
        self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_c)).astype(dtype))
        self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype))
        self.W_xo = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_o)).astype(dtype))
        self.W_ho = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_o)).astype(dtype))
        self.W_co = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_o)).astype(dtype))
        self.b_o = theano.shared(numpy.cast[dtype](uniform(-0.5, .5,
                                                           size=n_o)))
        self.W_hy = theano.shared(
            0.2 * uniform(-1.0, 1.0, (n_hidden + featdim, n_y)).astype(dtype))
        self.b_y = theano.shared(numpy.zeros(n_y, dtype=dtype))

        self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype))
        self.h0 = T.tanh(self.c0)

        # bundle weights
        self.params = [self.emb, self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf, \
                       self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_xo, self.W_ho, \
                       self.W_co, self.b_o, self.W_hy, self.b_y, self.c0]
        self.names  = ['embeddings', 'W_xi', 'W_hi', 'W_ci', 'b_i', 'W_xf', 'W_hf', 'W_cf', 'b_f', \
                       'W_xc', 'W_hc', 'b_c', 'W_xo', 'W_ho', 'W_co', 'b_o', 'W_hy', 'b_y', 'c0']

        # as many columns as the context window size, as many lines as words in the sentence
        idxs = T.imatrix()
        # print idxs.shape()
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        # print type(x), x.shape(), "details of x"
        f = T.matrix('f')
        f = f.reshape((idxs.shape[0], featdim))
        # print type(f), f.shape(), "details of f"
        y = T.iscalar('y')  # label

        # print type(y), y.shape(), "details of y"

        def recurrence(x_t, feat_t, h_tm1, c_tm1):
            i_t = sigma(
                theano.dot(x_t, self.W_xi) + theano.dot(h_tm1, self.W_hi) +
                theano.dot(c_tm1, self.W_ci) + self.b_i)
            f_t = sigma(
                theano.dot(x_t, self.W_xf) + theano.dot(h_tm1, self.W_hf) +
                theano.dot(c_tm1, self.W_cf) + self.b_f)
            c_t = f_t * c_tm1 + i_t * T.tanh(
                theano.dot(x_t, self.W_xc) + theano.dot(h_tm1, self.W_hc) +
                self.b_c)
            o_t = sigma(
                theano.dot(x_t, self.W_xo) + theano.dot(h_tm1, self.W_ho) +
                theano.dot(c_t, self.W_co) + self.b_o)
            h_t = o_t * T.tanh(c_t)

            if self.featdim > 0:
                all_t = T.concatenate([h_t, feat_t])
            else:
                all_t = h_t

            # print "all_t", type(all_t), T.shape(all_t)
            s_t = softmax(theano.dot(all_t, self.W_hy) + self.b_y)
            # print T.shape(h_t), T.shape(c_t), T.shape(s_t)
            return [h_t, c_t, s_t]

        # Initialization occurs in outputs_info
        # scan gives -- result, updates
        [h, _, s], _ = theano.scan(fn=recurrence,
                                   sequences=[x, f],
                                   outputs_info=[self.h0, self.c0, None],
                                   n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.mean(T.log(p_y_given_x_lastword)[y])
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs, f], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, f, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function(
            inputs=[],
            updates={
                self.emb:
                self.emb / T.sqrt(
                    (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
            })
Example #33
def main():
    var = theano.shared(T.zeros(shape=(88, 100),
                                dtype=theano.config.floatX).eval(),
                        name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean',
                           mean,
                           train=True,
                           valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)

    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1])
                                   for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2])
                                  for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1])
                                   for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2])
                                  for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[],
                        outputs=train_collapsed.values(),
                        updates=updates)
    f2 = theano.function(inputs=[],
                         outputs=valid_collapsed.values(),
                         updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
Example #34
    def __init__(self,
                 model,
                 dataset,
                 iterator_class=SequentialIterator,
                 config=None,
                 defaults=_defaults,
                 rng=None,
                 n_epoch=None,
                 batch_size=None,
                 minimum_batch_size=None,
                 save_frequency=None,
                 early_stop_threshold=None,
                 early_stop_length=None,
                 learning_rate=None,
                 lr_decay=None,
                 lr_factor=None,
                 momentum=None,
                 momentum_decay=None,
                 momentum_factor=None,
                 nesterov_momentum=None,
                 flag_para_load=None):
        # superclass init
        super(SGD, self).__init__(config=config, defaults=defaults)
        # config and defaults are now combined in self.args! yay!

        self.model = model
        self.dataset = dataset
        self.iterator = iterator_class

        # Training epochs - how many times to iterate over the whole dataset
        self.n_epoch = n_epoch or self.args.get('n_epoch')

        # Dataset iteration batch sizes - number of examples in each calculation
        self.batch_size = batch_size or self.args.get('batch_size')
        self.minimum_batch_size = minimum_batch_size or self.args.get(
            'minimum_batch_size')

        # Number of epochs between saving model parameters
        self.save_frequency = save_frequency or self.args.get('save_frequency')

        # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
        self.early_stop_threshold = early_stop_threshold or self.args.get(
            'early_stop_threshold')
        self.early_stop_length = early_stop_length or self.args.get(
            'early_stop_length')

        # Learning rate - how drastic of a step do the parameters change
        lr = learning_rate or self.args.get('learning_rate')
        self.learning_rate = sharedX(lr, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay or self.args.get('lr_decay'):
            self.learning_rate_decay = get_decay_function(
                lr_decay or self.args.get('lr_decay'), self.learning_rate,
                self.learning_rate.get_value(), lr_factor
                or self.args.get('lr_factor'))

        # Momentum - smoothing over the parameter changes (see Hinton)
        self.momentum = sharedX(momentum or self.args.get('momentum'),
                                'momentum')
        if self.args.get('momentum_decay'):
            self.momentum_decay = get_decay_function(
                momentum_decay or self.args.get('momentum_decay'),
                self.momentum, self.momentum.get_value(), momentum_factor
                or self.args.get('momentum_factor'))
        self.nesterov_momentum = nesterov_momentum or self.args.get(
            'nesterov_momentum')

        # RNG for working on random iterator
        if rng is None:
            random.seed(123)
            self.rng = random
        else:
            self.rng = rng

        self.params = self.model.get_params()

        # Now create the training cost function for the model to use while training - update parameters
        log.info("%s params: %s", str(type(self.model)), str(self.params))
        # gradient!
        gradient = grad(self.model.get_train_cost(), self.params)
        grads = OrderedDict(zip(self.params, gradient))

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(grads)

        # Combine the updates from the model also if applicable
        train_updates = model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn function for model %s...',
                 str(type(self.model)))
        t = time.time()
        self.f_learn = function(inputs=model.get_inputs(),
                                updates=train_updates,
                                outputs=self.model.get_train_cost(),
                                name='f_learn')
        log.info('f_learn compilation took %s',
                 make_time_units_string(time.time() - t))

        # Determine if this function is unsupervised or not by looking at the number of inputs to the f_learn function.
        # If there is only one input, it is unsupervised, otherwise, it is supervised.
        # This workaround was provided by Pascal Lamblin on the theano-users google group
        num_inputs = len(
            [i for i in self.f_learn.maker.inputs if not i.shared])
        if num_inputs == 1:
            log.debug("Model is unsupervised: 1 input to f_learn.")
            self.unsupervised = True
        elif num_inputs == 2:
            log.debug("Model is supervised: 2 inputs to f_learn.")
            self.unsupervised = False
        else:
            log.error(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised.",
                str(type(self.model)), str(num_inputs))
            raise AssertionError(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised."
                % (str(type(self.model)), str(num_inputs)))

        # grab the function(s) to use to monitor different model values during training
        self.monitors = self.model.get_monitors()
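
A minimal standalone sketch of the input-counting workaround above, assuming a working Theano install; the function and variable names here are illustrative only. Shared variables also appear in maker.inputs but carry a truthy .shared flag, so filtering them out leaves just the data inputs the caller must supply.

import theano
import theano.tensor as T

x = T.matrix('x')
w = theano.shared(2.0, name='w')         # a trainable parameter
f = theano.function([x], (x * w).sum())  # stand-in for f_learn

# count only the non-shared inputs, exactly as done above
num_inputs = len([i for i in f.maker.inputs if not i.shared])
print num_inputs  # -> 1, so this function would be treated as unsupervised
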
Example #35
0
    def __init__(self, nh, nc, ne, de, cs, decay):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size 
        decay :: decay rate for the RMSProp gradient cache (adaptive learning rate)
        '''
        # parameters of the model
        # weights for GRU
        n_in = de * cs
        n_hidden = n_i = n_c = n_f = nh
        n_y = nc
        # forward pass
        self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_i)).astype(dtype))
        self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_i)).astype(dtype))
        self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5,
                                                           size=n_i)))
        self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_f)).astype(dtype))
        self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_f)).astype(dtype))
        self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f)))
        self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_c)).astype(dtype))
        self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_c)).astype(dtype))
        self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype))

        self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype))
        self.h0 = T.tanh(self.c0)
        self.W_hy = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_y)).astype(dtype))
        self.b_y = theano.shared(numpy.zeros(n_y, dtype=dtype))
        '''
        # backward pass
        self.bW_xi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_i)).astype(dtype))
        self.bW_hi = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_i)).astype(dtype))
        self.bb_i = theano.shared(numpy.cast[dtype](uniform(-0.5,.5,size = n_i)))
        self.bW_xf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_f)).astype(dtype))
        self.bW_hf = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_f)).astype(dtype))
        self.bb_f = theano.shared(numpy.cast[dtype](uniform(0, 1.,size = n_f)))
        self.bW_xc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_in, n_c)).astype(dtype))
        self.bW_hc = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_c)).astype(dtype))
        self.bb_c = theano.shared(numpy.zeros(n_c, dtype=dtype))

        self.bc0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype))
        self.bh0 = T.tanh(self.bc0)
        self.bW_hy = theano.shared(0.2 * uniform(-1.0, 1.0, (n_hidden, n_y)).astype(dtype))
        '''

        # bundle weights
        self.params = [self.W_xi, self.W_hi, self.b_i, self.W_xf, self.W_hf, \
                       self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_hy, self.b_y]

        self.names  = ['W_xi', 'W_hi', 'b_i', 'W_xf', 'W_hf', 'b_f', \
                       'W_xc', 'W_hc', 'b_c', 'W_hy', 'b_y']  # one name per entry in self.params
        # RMSProp cache: one running average of squared gradients per parameter
        self.allcache = [
            theano.shared(W.get_value() * numpy.asarray(0., dtype=dtype))
            for W in self.params
        ]

        # input context vectors in a batch
        embs = T.ftensor3('embs')
        mask = T.ivector('mask')
        idxs = T.itensor3(
        )  # as many columns as context window size/lines as words in the sentence
        x, _ = theano.scan(lambda idx, emb: emb[idx].reshape(
            (idx.shape[0], de * cs)),
                           sequences=[idxs, embs])
        y = T.imatrix('y')

        def recurrence(x_t, h_tm1):
            i_t = sigma(
                theano.dot(x_t, self.W_xi) + theano.dot(h_tm1, self.W_hi) +
                self.b_i)
            f_t = sigma(
                theano.dot(x_t, self.W_xf) + theano.dot(h_tm1, self.W_hf) +
                self.b_f)
            c_t = T.tanh(
                theano.dot(x_t, self.W_xc) +
                theano.dot(h_tm1 * f_t, self.W_hc) + self.b_c)
            h_t = (T.ones_like(i_t) - i_t) * h_tm1 + i_t * c_t

            s_t = softmax(theano.dot(h_t, self.W_hy) + self.b_y)[0]

            return [h_t, s_t]

        '''    
        def brecurrence(x_t, feat_t, h_tm1, c_tm1):
            i_t = sigma(theano.dot(x_t, self.bW_xi) + theano.dot(h_tm1, self.bW_hi) + self.bb_i)
            f_t = sigma(theano.dot(x_t, self.bW_xf) + theano.dot(h_tm1, self.bW_hf) + self.bb_f)
            c_t = T.tanh(theano.dot(x_t, self.bW_xc) + theano.dot(h_tm1 * f_t, self.bW_hc) + self.bb_c)
            h_t = (T.ones_like(i_t) - i_t) * h_tm1 + i_t * c_t
            return [h_t, c_t]
        '''

        # loss for each sentence, m is mask
        def sent_model(x_sent, m, y_sent):
            [h, s], _ = theano.scan(fn=recurrence,
                                    sequences=[x_sent],
                                    outputs_info=[self.h0, None])
            max_y, _ = theano.scan(lambda v, l: T.log(v)[l],
                                   sequences=[s[:m], y_sent[:m]])
            nll = -T.mean(max_y)
            return nll

        # prediction for each sentence, m is mask
        def pred_model(x_sent, m):
            [h, s], _ = theano.scan(fn=recurrence,
                                    sequences=[x_sent],
                                    outputs_info=[self.h0, None])
            y_pred = T.argmax(s, axis=1)
            return y_pred

        nll_all, _ = theano.scan(fn=sent_model, sequences=[x, mask, y])
        nll_all = T.mean(nll_all)
        y_pred, _ = theano.scan(fn=pred_model, sequences=[x, mask])

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        gradients = T.grad(nll_all, self.params)

        # rmsprop
        allcache = [
            decay * cacheW + (1 - decay) * gradient**2
            for cacheW, gradient in zip(self.allcache, gradients)
        ]
        updates = OrderedDict([( p, p-lr*g/T.sqrt(cache+1e-6) ) for p, g, cache in zip( self.params , gradients, allcache)] \
                                + [(w, new_w) for w, new_w in zip(self.allcache, allcache)])
        # gradients for input context vectors
        emb_update = T.grad(nll_all, embs)

        # theano functions
        self.predict = theano.function(inputs=[idxs, embs, mask],
                                       outputs=y_pred,
                                       allow_input_downcast=True)
        self.train = theano.function(inputs=[idxs, embs, y, lr, mask],
                                     outputs=nll_all,
                                     updates=updates,
                                     allow_input_downcast=True)

        #self.normalize = theano.function(inputs=[], updates={self.emb: self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0,'x')})
        #self.update_emb = theano.function(inputs=[new, idxs], updates={self.emb[idxs]: theano.shared(new[idxs].get_value())})

        #add returning gradients for embedding
        self.grad = theano.function(inputs=[idxs, embs, y, mask],
                                    outputs=emb_update,
                                    allow_input_downcast=True)
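
For reference, a tiny standalone numpy sketch (variable names are illustrative, not part of the class above) of the RMSProp rule that the updates dictionary encodes: keep a decayed running average of squared gradients and divide each step by its square root.

import numpy as np

decay, lr = 0.9, 0.01
param = np.array([1.0, -2.0])
cache = np.zeros_like(param)             # plays the role of self.allcache
for _ in range(3):
    grad = 2.0 * param                   # gradient of sum(param ** 2)
    cache = decay * cache + (1. - decay) * grad ** 2
    param = param - lr * grad / np.sqrt(cache + 1e-6)
print param
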
Example #36
0
def build_updates(cost,
                  params,
                  clip_c=0,
                  clip_idx=None,
                  shrink_grad=None,
                  choice=None):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    def apply_clip(g):
        g2 = 0.
        g2 += (g**2).sum()
        new_grad = T.switch(g2 > (clip_c**2), g / T.sqrt(g2) * clip_c, g)
        return new_grad

    if clip_c > 0. and clip_idx is not None:
        for idx in clip_idx:
            grads[idx] = apply_clip(grads[idx])
    if shrink_grad is not None:
        for idx in shrink_grad:
            grads[idx] *= 0.001

    def get_updates_adadelta(grads, params, decay=0.95):
        decay = constantX(decay)
        print 'build updates with adadelta'
        for param, grad in zip(params, grads):
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(
                numpy.zeros(param.get_value().shape, dtype=floatX))
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(
                numpy.zeros(param.get_value().shape, dtype=floatX))
            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = \
                    decay * mean_square_grad +\
                    (1. - decay) * T.sqr(grad)
            # Compute update
            epsilon = constantX(1e-7)
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = -rms_dx_tm1 / rms_grad_t * grad

            # Accumulate updates
            new_mean_square_dx = \
                    decay * mean_square_dx + \
                    (1. - decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

    def get_updates_grads_momentum(gparams, params, lr=0.1, momentum=0.5):
        print 'building updates with momentum'
        # build momentum
        gparams_mom = []
        for param in params:
            gparam_mom = theano.shared(
                numpy.zeros(param.get_value(borrow=True).shape, dtype=floatX))
            gparams_mom.append(gparam_mom)

        for gparam, gparam_mom, param in zip(gparams, gparams_mom, params):
            inc = momentum * gparam_mom - (constantX(1) -
                                           momentum) * lr * gparam
            updates[gparam_mom] = inc
            updates[param] = param + inc

    def get_updates_rmsprop(grads, params, lr=0.1, decay=0.95):
        for param, grad in zip(params, grads):
            mean_square_grad = sharedX(
                numpy.zeros(param.get_value().shape, dtype=floatX))
            new_mean_squared_grad = (decay * mean_square_grad +
                                     (1. - decay) * T.sqr(grad))
            rms_grad_t = T.sqrt(new_mean_squared_grad + constantX(1e-7))  # epsilon avoids division by zero
            delta_x_t = constantX(-1) * lr * grad / rms_grad_t
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

    get_updates_adadelta(grads, params)
    #get_updates_grads_momentum(grads, params)
    #get_updates_rmsprop(grads, params)
    return updates
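
A standalone numpy sketch (illustrative names, not part of build_updates) of the AdaDelta step that get_updates_adadelta constructs above: two running averages, E[g^2] and E[(dx)^2], combine into a step size that needs no explicit learning rate.

import numpy as np

decay, eps = 0.95, 1e-7
param = np.array([1.0, -2.0])
mean_sq_grad = np.zeros_like(param)      # E[g^2]
mean_sq_dx = np.zeros_like(param)        # E[(dx)^2]
for _ in range(3):
    grad = 2.0 * param                   # gradient of sum(param ** 2)
    mean_sq_grad = decay * mean_sq_grad + (1. - decay) * grad ** 2
    dx = -np.sqrt(mean_sq_dx + eps) / np.sqrt(mean_sq_grad + eps) * grad
    mean_sq_dx = decay * mean_sq_dx + (1. - decay) * dx ** 2
    param = param + dx
print param
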
Example #37
0
import numpy as np
from theano.compat.python2x import OrderedDict
from kdl_template import *

# random state so script is deterministic
random_state = np.random.RandomState(1999)
# home of the computational graph
graph = OrderedDict()

# minibatch size
minibatch_size = 20
# number of input units
n_in = 5
# number of hidden units
n_hid = 10
# number of output units
n_out = 5

# Generate sinewaves offset in phase
n_timesteps = 50
d1 = 3 * np.arange(n_timesteps) / (2 * np.pi)
d2 = 3 * np.arange(n_in) / (2 * np.pi)
all_sines = np.sin(np.array([d1] * n_in).T + d2)
all_sines = all_sines[:, None, :]
all_sines = np.concatenate([all_sines] * minibatch_size, axis=1)

# Setup dataset and initial hidden vector of zeros
X = all_sines[:-1].astype(theano.config.floatX)
y = all_sines[1:].astype(theano.config.floatX)
X_mask = np.ones_like(X[:, :, 0])
y_mask = np.ones_like(y[:, :, 0])
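
As a quick sanity check (these two lines simply continue the script above), the arrays have one fewer timestep than n_timesteps because of the one-step shift between X and y:

print X.shape, y.shape            # (49, 20, 5) (49, 20, 5)
print X_mask.shape, y_mask.shape  # (49, 20) (49, 20)
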
Example #38
0
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and the
        channel definitions have changed since last we called it, or if the
        theano functions are unavailable for any other reason (first time they
        are needed after construction or deserialization, etc.)

        All channels are compiled as part of the same theano function so that
        the theano optimizations can eliminate subexpressions that are shared
        between multiple channels.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        theano_args = self._flat_data_specs[0].make_theano_batch(
            ['monitoring_%s' % s for s in self._flat_data_specs[1]])

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                         data_specs=self._flat_data_specs,
                         return_tuple=True) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                          self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel needs any data, so there is no need to
                # average results, and we will call the accum functions
                # only once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                val = (channel.val * T.cast(batch_size, config.floatX) /
                       cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(
                    function(theano_args,
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])
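
A small sketch in plain Python (illustrative numbers only) of the accumulation rule each accum function applies above: every batch adds channel_value * batch_size / num_examples to the channel's shared counter, so after iterating over the whole dataset the counter holds the example-weighted mean of the channel.

num_examples = 10.
batches = [(0.5, 4), (0.2, 4), (0.8, 2)]  # (channel value on batch, batch size)
val_shared = 0.0                          # zeroed by begin_record_entry
for val, batch_size in batches:
    val_shared += val * batch_size / num_examples
print val_shared                          # 0.44, the example-weighted mean
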
Example #39
0
    def __init__(self, network):
        self.trial_num = 0
        batch_size = network.parameters.batch_size
        num_iterations = network.parameters.num_iterations
        keep_spikes = network.parameters.keep_spikes
        norm_infer = network.parameters.norm_infer
        firing_decay = network.parameters.firing_decay
        #firing_decay = False
        time_data = network.parameters.time_data
        X = network.X
        updates = OrderedDict()
        for layer in range(network.n_layers):
            M = network.parameters.M[layer]
            Q = network.Q[layer]
            theta = network.theta[layer]
            W = network.W[layer]
            Y = T.alloc(0., batch_size, M)
            if time_data and self.trial_num != 0:
                Ys = network.Ys_tm1[layer]
                aas = network.aas_tm1[layer]
            else:
                Ys = T.zeros_like(Y)
                aas = T.zeros_like(Y)
            if keep_spikes:
                spike_train = T.alloc(0., batch_size, M, num_iterations)

            Q_norm = (Q * Q).sum(axis=0, keepdims=True)

            B = X.dot(Q)
            Th = theta.dimshuffle('x', 0)

            eta = .1

            for tt in xrange(num_iterations):
                if norm_infer:
                    Ys = (1. - eta * Q_norm) * Ys + eta * (B - aas.dot(W))
                elif firing_decay:
                    Ys = (1. - eta) * Ys + eta * (B - Y.dot(W))
                else:
                    Ys = (1. - eta) * Ys + eta * (B - aas.dot(W))

                aas = 0. * aas
                # This resets the current activity of the time step to 0's
                aas = T.switch(Ys > Th, 1., aas)

                # If the activity of a given neuron is above the threshold, set it to 1 a.k.a. fire.

                if keep_spikes:
                    spike_train = T.set_subtensor(spike_train[:, :, tt], aas)

                Y += aas
                # Update total activity
                Ys = T.switch(Ys > Th, 0., Ys)

            # Setting input of next layer to spikes of current one
            X = Y
            updates[network.Y[layer]] = Y

            if keep_spikes:
                if time_data:
                    updates[network.spike_train_tm1[
                        layer]] = network.spike_train[layer]
                updates[network.spike_train[layer]] = spike_train
            if time_data:
                updates[network.Ys_tm1[layer]] = Ys
                updates[network.aas_tm1[layer]] = aas

        self.f = theano.function([], [], updates=updates)
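
A tiny numpy sketch of one inner-loop step above (names are illustrative and the lateral-inhibition term aas.dot(W) is dropped for brevity): leak the membrane potentials toward the feed-forward drive, fire where they cross the threshold, accumulate the spikes, then reset the units that fired.

import numpy as np

eta = .1
Ys = np.array([0.4, 1.2, 0.1])   # membrane potentials
B = np.array([1.0, 1.0, 1.0])    # feed-forward drive, i.e. X.dot(Q)
Th = 1.0                         # firing threshold

Ys = (1. - eta) * Ys + eta * B   # integrate (inhibition omitted here)
aas = np.where(Ys > Th, 1., 0.)  # fire where the threshold is crossed
Y = aas.copy()                   # accumulate total activity
Ys = np.where(Ys > Th, 0., Ys)   # reset potentials of the units that fired
print Y, Ys
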
Example #40
0
    def _build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227,
                                                      227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((
            self.batch_size,
            96,
            27,
            27,
        ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13),
                         convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation':
            'rectifier',  # type of activation function to use for output
            'weights_init':
            'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std':
            0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer6.get_noise_switch()

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)

        # Add this layer's parameters!
        self.params += fc_layer7.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer7.get_noise_switch()

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                                   fc_layer7.get_outputs()),
                                      output_size=1000,
                                      **softmax_config)

        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()
        self.targets = softmax_layer8.get_targets()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8.negative_log_likelihood()
        cost = softmax_layer8.negative_log_likelihood()
        errors = softmax_layer8.errors()
        train_errors = softmax_layer8.errors()

        self.monitors = OrderedDict([('cost', cost), ('errors', errors),
                                     ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_run...")
        # use the actual argmax from the classification
        self.f_run = function(inputs=[self.x],
                              outputs=softmax_layer8.get_argmax_prediction())
        log.debug("compilation took %s",
                  make_time_units_string(time.time() - t))
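
A quick piece of arithmetic behind the 9216 used for fc_layer6 above (assuming the 227x227 crops configured for layer 1): the fifth convpool layer ends with 256 feature maps of spatial size 6x6, and flattening them gives the fully-connected input size.

print 256 * 6 * 6  # -> 9216, the flattened size fed into fc_layer6
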
Example #41
0
    def orderings(self, fgraph):
        """Return orderings induced by destructive operations.

        Raise InconsistencyError when
        a) attempting to destroy an indestructible variable, or
        b) attempting to destroy a value multiple times, or
        c) an Apply destroys (illegally) one of its own inputs by aliasing

        """
        rval = OrderedDict()

        if self.destroyers:
            # BUILD DATA STRUCTURES
            # CHECK for multiple destructions during construction of variables

            droot, impact, __ignore = self.refresh_droot_impact()

            # check for destruction of constants
            illegal_destroy = [r for r in droot if \
                    getattr(r.tag, 'indestructible', False) or \
                    isinstance(r, graph.Constant)]
            if illegal_destroy:
                raise InconsistencyError(
                    "Attempting to destroy indestructible variables: %s" %
                    illegal_destroy)

            # add destroyed variable clients as computational dependencies
            for app in self.destroyers:
                # for each destroyed input...
                for output_idx, input_idx_list in app.op.destroy_map.items():
                    destroyed_idx = input_idx_list[0]
                    destroyed_variable = app.inputs[destroyed_idx]
                    root = droot[destroyed_variable]
                    root_impact = impact[root]
                    # we generally want to put all clients of things which depend on root
                    # as pre-requisites of app.
                    # But, app is itself one such client!
                    # App will always be a client of the node we're destroying
                    # (destroyed_variable), but the tricky thing is when it is also a client of
                    # *another variable* viewing on the root.  Generally this is illegal (e.g.,
                    # add_inplace(x, x.T)).  In some special cases though, the in-place op will
                    # actually be able to work properly with multiple destroyed inputs (e.g.,
                    # add_inplace(x, x)).  An Op that can still work in this case should declare
                    # so via the 'destroyhandler_tolerate_same' attribute or
                    # 'destroyhandler_tolerate_aliased' attribute.
                    #
                    # destroyhandler_tolerate_same should be a list of pairs of the form
                    # [(idx0, idx1), (idx0, idx2), ...]
                    # The first element of each pair is the input index of a destroyed
                    # variable.
                    # The second element of each pair is the index of a different input where
                    # we will permit exactly the same variable to appear.
                    # For example, add_inplace.tolerate_same might be [(0,1)] if the destroyed
                    # input is also allowed to appear as the second argument.
                    #
                    # destroyhandler_tolerate_aliased is the same sort of list of
                    # pairs.
                    # op.destroyhandler_tolerate_aliased = [(idx0, idx1)] tells the
                    # destroyhandler to IGNORE an aliasing between a destroyed
                    # input idx0 and another input idx1.
                    # This is generally a bad idea, but it is safe in some
                    # cases, such as
                    # - the op reads from the aliased idx1 before modifying idx0
                    # - the idx0 and idx1 are guaranteed not to overlap (e.g.
                    #   they are pointed at different rows of a matrix).
                    #

                    # CHECK FOR INPUT ALIASING
                    # OPT: pre-compute this on import
                    tolerate_same = getattr(app.op,
                                            'destroyhandler_tolerate_same', [])
                    assert isinstance(tolerate_same, list)
                    tolerated = OrderedSet(idx1 for idx0, idx1 in tolerate_same
                                           if idx0 == destroyed_idx)
                    tolerated.add(destroyed_idx)
                    tolerate_aliased = getattr(
                        app.op, 'destroyhandler_tolerate_aliased', [])
                    assert isinstance(tolerate_aliased, list)
                    ignored = OrderedSet(idx1
                                         for idx0, idx1 in tolerate_aliased
                                         if idx0 == destroyed_idx)
                    # print 'tolerated', tolerated
                    # print 'ignored', ignored
                    for i, input in enumerate(app.inputs):
                        if i in ignored:
                            continue
                        if input in root_impact \
                                and (i not in tolerated or input is not destroyed_variable):
                            raise InconsistencyError(
                                "Input aliasing: %s (%i, %i)" %
                                (app, destroyed_idx, i))

                    # add the rule: app must be preceded by all other Apply instances that
                    # depend on destroyed_input
                    root_clients = OrderedSet()
                    for r in root_impact:
                        assert not [
                            a for a, c in self.clients[r].items() if not c
                        ]
                        root_clients.update(
                            [a for a, c in self.clients[r].items() if c])
                    root_clients.remove(app)
                    if root_clients:
                        rval[app] = root_clients

        return rval
Example #42
0
class DestroyHandler(toolbox.Bookkeeper):
    """
    The DestroyHandler class detects when a graph is impossible to evaluate
    because of aliasing and destructive operations.

    Several data structures are used to do this.

    An Op can use its view_map property to declare that an output may be
    aliased to an input. If that output is destroyed, the input is also
    considered to be destroyed. The view_maps of several Ops can feed into
    one another and form a directed graph. The consequence of destroying any
    variable in such a graph is that all variables in the graph must be
    considered to be destroyed, because they could all be referring to the
    same underlying storage.

    In the current implementation, that graph is a tree, and the root of that
    tree is called the foundation.

    TODO: why "in the current implementation" ? is there another implementation
          planned?
    TODO: why is the graph a tree? isn't it possible that one variable could
          be aliased to many variables? for example, don't switch and ifelse
          have to do this?

    The original DestroyHandler (if 0'ed out above) computed several data
    structures from scratch each time it was asked to validate the graph.
    Because this happens potentially thousands of times and each graph to
    validate is extremely similar to the previous one, computing the
    data structures from scratch repeatedly was wasteful and resulted in
    high compile times for large graphs.

    This implementation computes the data structures once at initialization
    and then incrementally updates them.

    It is a work in progress. The following data structures have been
    converted to use the incremental strategy:
        <none>

    The following data structures remain to be converted:
        <unknown>
    """
    pickle_rm_attr = ["destroyers"]

    def __init__(self, do_imports_on_attach=True):
        self.fgraph = None
        self.do_imports_on_attach = do_imports_on_attach
        """maps every variable in the graph to its "foundation" (deepest
        ancestor in view chain)
        TODO: change name to var_to_vroot"""
        self.droot = OrderedDict()
        """maps a variable to all variables that are indirect or direct views of it
         (including itself)
         essentially the inverse of droot
        TODO: do all variables appear in this dict, or only those that are foundations?
        TODO: do only destroyed variables go in here? one old docstring said so
        TODO: rename to x_to_views after reverse engineering what x is"""
        self.impact = OrderedDict()
        """if a var is destroyed, then this dict will map
        droot[var] to the apply node that destroyed var
        TODO: rename to vroot_to_destroyer"""
        self.root_destroyer = OrderedDict()

    def on_attach(self, fgraph):
        """
        When attaching to a new fgraph, check that
            1) This DestroyHandler wasn't already attached to some fgraph
               (its data structures are only set up to serve one)
            2) The FunctionGraph doesn't already have a DestroyHandler.
               This would result in it validating everything twice, causing
               compilation to be slower.

        Give the FunctionGraph instance:
            1) A new method "destroyers(var)"
                TODO: what does this do exactly?
            2) A new attribute, "destroy_handler"
        TODO: WRITEME: what does this do besides the checks?
        """

        ####### Do the checking ###########
        already_there = False
        if self.fgraph is fgraph:
            already_there = True
        if self.fgraph is not None:
            raise Exception("A DestroyHandler instance can only serve one"
                            " FunctionGraph. (Matthew 6:24)")
        for attr in ('destroyers', 'destroy_handler'):
            if hasattr(fgraph, attr):
                already_there = True

        if already_there:
            # FunctionGraph.attach_feature catches AlreadyThere and cancels the attachment
            raise toolbox.AlreadyThere(
                "DestroyHandler feature is already present"
                " or in conflict with another plugin.")

        ####### Annotate the FunctionGraph ############
        self.unpickle(fgraph)
        fgraph.destroy_handler = self

        self.fgraph = fgraph
        self.destroyers = OrderedSet(
        )  # set of Apply instances with non-null destroy_map
        self.view_i = OrderedDict()  # variable -> variable used in calculation
        self.view_o = OrderedDict(
        )  # variable -> set of variables that use this one as a direct input
        # clients: how many times does an apply use a given variable
        self.clients = OrderedDict()  # variable -> apply -> ninputs
        self.stale_droot = True

        self.debug_all_apps = OrderedSet()
        if self.do_imports_on_attach:
            toolbox.Bookkeeper.on_attach(self, fgraph)

    def unpickle(self, fgraph):
        def get_destroyers_of(r):
            droot, impact, root_destroyer = self.refresh_droot_impact()
            try:
                return [root_destroyer[droot[r]]]
            except Exception:
                return []

        fgraph.destroyers = get_destroyers_of

    def refresh_droot_impact(self):
        """
        Makes sure self.droot, self.impact, and self.root_destroyer are
        up to date, and returns them.
        (see docstrings for these properties above)
        """
        if self.stale_droot:
            droot = OrderedDict(
            )  # destroyed view + nonview variables -> foundation
            impact = OrderedDict(
            )  # destroyed nonview variable -> it + all views of it
            root_destroyer = OrderedDict()  # root -> destroyer apply

            for app in self.destroyers:
                for output_idx, input_idx_list in app.op.destroy_map.items():
                    if len(input_idx_list) != 1:
                        raise NotImplementedError()
                    input_idx = input_idx_list[0]
                    input = app.inputs[input_idx]
                    input_root = getroot(input, self.view_i)
                    if input_root in droot:
                        raise InconsistencyError("Multiple destroyers of %s" %
                                                 input_root)
                    droot[input_root] = input_root
                    root_destroyer[input_root] = app
                    input_impact = get_impact(input_root, self.view_o)
                    for v in input_impact:
                        assert v not in droot
                        droot[v] = input_root

                    impact[input_root] = input_impact
                    impact[input_root].add(input_root)
            self.droot, self.impact, self.root_destroyer = droot, impact, root_destroyer
            self.stale_droot = False
        return self.droot, self.impact, self.root_destroyer

    def on_detach(self, fgraph):
        if fgraph is not self.fgraph:
            raise Exception("detaching wrong fgraph", fgraph)
        del self.destroyers
        del self.view_i
        del self.view_o
        del self.clients
        del self.stale_droot
        assert self.fgraph.destroy_handler is self
        delattr(self.fgraph, 'destroyers')
        delattr(self.fgraph, 'destroy_handler')
        self.fgraph = None

    def on_import(self, fgraph, app, reason):
        """Add Apply instance to set which must be computed"""

        if app in self.debug_all_apps:
            raise ProtocolError("double import")
        self.debug_all_apps.add(app)
        # print 'DH IMPORT', app, id(app), id(self), len(self.debug_all_apps)

        # If it's a destructive op, add it to our watch list
        if getattr(app.op, 'destroy_map', OrderedDict()):
            self.destroyers.add(app)

        # add this symbol to the forward and backward maps
        for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                         OrderedDict()).items():
            if len(i_idx_list) > 1:
                raise NotImplementedError(
                    'destroying this output invalidates multiple inputs',
                    (app.op))
            o = app.outputs[o_idx]
            i = app.inputs[i_idx_list[0]]
            self.view_i[o] = i
            self.view_o.setdefault(i, OrderedSet()).add(o)

        # update self.clients
        for i, input in enumerate(app.inputs):
            self.clients.setdefault(input, OrderedDict()).setdefault(app, 0)
            self.clients[input][app] += 1

        for i, output in enumerate(app.outputs):
            self.clients.setdefault(output, OrderedDict())

        self.stale_droot = True

    def on_prune(self, fgraph, app, reason):
        """Remove Apply instance from set which must be computed"""
        if app not in self.debug_all_apps:
            raise ProtocolError("prune without import")
        self.debug_all_apps.remove(app)

        # UPDATE self.clients
        for i, input in enumerate(OrderedSet(app.inputs)):
            del self.clients[input][app]

        if getattr(app.op, 'destroy_map', OrderedDict()):
            self.destroyers.remove(app)

        # Note: leaving empty client dictionaries in the struct.
        # Why? It's a pain to remove them. I think they aren't doing any harm, they will be
        # deleted on_detach().

        # UPDATE self.view_i, self.view_o
        for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                         OrderedDict()).items():
            if len(i_idx_list) > 1:
                # destroying this output invalidates multiple inputs
                raise NotImplementedError()
            o = app.outputs[o_idx]
            i = app.inputs[i_idx_list[0]]

            del self.view_i[o]

            self.view_o[i].remove(o)
            if not self.view_o[i]:
                del self.view_o[i]

        self.stale_droot = True

    def on_change_input(self, fgraph, app, i, old_r, new_r, reason):
        """app.inputs[i] changed from old_r to new_r """
        if app == 'output':
            # app == 'output' is special key that means FunctionGraph is redefining which nodes are being
            # considered 'outputs' of the graph.
            pass
        else:
            if app not in self.debug_all_apps:
                raise ProtocolError("change without import")

            # UPDATE self.clients
            self.clients[old_r][app] -= 1
            if self.clients[old_r][app] == 0:
                del self.clients[old_r][app]

            self.clients.setdefault(new_r, OrderedDict()).setdefault(app, 0)
            self.clients[new_r][app] += 1

            # UPDATE self.view_i, self.view_o
            for o_idx, i_idx_list in getattr(app.op, 'view_map',
                                             OrderedDict()).items():
                if len(i_idx_list) > 1:
                    # destroying this output invalidates multiple inputs
                    raise NotImplementedError()
                i_idx = i_idx_list[0]
                output = app.outputs[o_idx]
                if i_idx == i:
                    if app.inputs[i_idx] is not new_r:
                        raise ProtocolError("wrong new_r on change")

                    self.view_i[output] = new_r

                    self.view_o[old_r].remove(output)
                    if not self.view_o[old_r]:
                        del self.view_o[old_r]

                    self.view_o.setdefault(new_r, OrderedSet()).add(output)

        self.stale_droot = True

    def validate(self, fgraph):
        """Return None

        Raise InconsistencyError when
        a) orderings() raises an error
        b) orderings cannot be topologically sorted.

        """

        if self.destroyers:
            ords = self.orderings(fgraph)

            if _contains_cycle(fgraph, ords):
                raise InconsistencyError("Dependency graph contains cycles")
        else:
            # James's Conjecture:
            # If there are no destructive ops, then there can be no cycles.

            # FB: This isn't always true. It can happen that an
            # optimization introduces a node that depends on itself. This
            # is very rare and should not happen in general. It will be
            # caught later, though the error will be far from the source. But
            # using this conjecture should speed up compilation most of
            # the time. The user should not create such dependencies unless
            # they mess too much with the internals.
            pass
        return True

    def orderings(self, fgraph):
        """Return orderings induced by destructive operations.

        Raise InconsistencyError when
        a) attempting to destroy an indestructible variable, or
        b) attempting to destroy a value multiple times, or
        c) an Apply destroys (illegally) one of its own inputs by aliasing

        """
        rval = OrderedDict()

        if self.destroyers:
            # BUILD DATA STRUCTURES
            # CHECK for multiple destructions during construction of variables

            droot, impact, __ignore = self.refresh_droot_impact()

            # check for destruction of constants
            illegal_destroy = [r for r in droot if \
                    getattr(r.tag, 'indestructible', False) or \
                    isinstance(r, graph.Constant)]
            if illegal_destroy:
                raise InconsistencyError(
                    "Attempting to destroy indestructible variables: %s" %
                    illegal_destroy)

            # add destroyed variable clients as computational dependencies
            for app in self.destroyers:
                # for each destroyed input...
                for output_idx, input_idx_list in app.op.destroy_map.items():
                    destroyed_idx = input_idx_list[0]
                    destroyed_variable = app.inputs[destroyed_idx]
                    root = droot[destroyed_variable]
                    root_impact = impact[root]
                    # we generally want to put all clients of things which depend on root
                    # as pre-requisites of app.
                    # But, app is itself one such client!
                    # App will always be a client of the node we're destroying
                    # (destroyed_variable), but the tricky thing is when it is also a client of
                    # *another variable* viewing on the root.  Generally this is illegal (e.g.,
                    # add_inplace(x, x.T)).  In some special cases though, the in-place op will
                    # actually be able to work properly with multiple destroyed inputs (e.g.,
                    # add_inplace(x, x)).  An Op that can still work in this case should declare
                    # so via the 'destroyhandler_tolerate_same' attribute or
                    # 'destroyhandler_tolerate_aliased' attribute.
                    #
                    # destroyhandler_tolerate_same should be a list of pairs of the form
                    # [(idx0, idx1), (idx0, idx2), ...]
                    # The first element of each pair is the input index of a destroyed
                    # variable.
                    # The second element of each pair is the index of a different input where
                    # we will permit exactly the same variable to appear.
                    # For example, add_inplace.tolerate_same might be [(0,1)] if the destroyed
                    # input is also allowed to appear as the second argument.
                    #
                    # destroyhandler_tolerate_aliased is the same sort of list of
                    # pairs.
                    # op.destroyhandler_tolerate_aliased = [(idx0, idx1)] tells the
                    # destroyhandler to IGNORE an aliasing between a destroyed
                    # input idx0 and another input idx1.
                    # This is generally a bad idea, but it is safe in some
                    # cases, such as
                    # - the op reads from the aliased idx1 before modifying idx0
                    # - the idx0 and idx1 are guaranteed not to overlap (e.g.
                    #   they are pointed at different rows of a matrix).
                    #

                    # CHECK FOR INPUT ALIASING
                    # OPT: pre-compute this on import
                    tolerate_same = getattr(app.op,
                                            'destroyhandler_tolerate_same', [])
                    assert isinstance(tolerate_same, list)
                    tolerated = OrderedSet(idx1 for idx0, idx1 in tolerate_same
                                           if idx0 == destroyed_idx)
                    tolerated.add(destroyed_idx)
                    tolerate_aliased = getattr(
                        app.op, 'destroyhandler_tolerate_aliased', [])
                    assert isinstance(tolerate_aliased, list)
                    ignored = OrderedSet(idx1
                                         for idx0, idx1 in tolerate_aliased
                                         if idx0 == destroyed_idx)
                    # print 'tolerated', tolerated
                    # print 'ignored', ignored
                    for i, input in enumerate(app.inputs):
                        if i in ignored:
                            continue
                        if input in root_impact \
                                and (i not in tolerated or input is not destroyed_variable):
                            raise InconsistencyError(
                                "Input aliasing: %s (%i, %i)" %
                                (app, destroyed_idx, i))

                    # add the rule: app must be preceded by all other Apply instances that
                    # depend on destroyed_input
                    root_clients = OrderedSet()
                    for r in root_impact:
                        assert not [
                            a for a, c in self.clients[r].items() if not c
                        ]
                        root_clients.update(
                            [a for a, c in self.clients[r].items() if c])
                    root_clients.remove(app)
                    if root_clients:
                        rval[app] = root_clients

        return rval
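
A minimal sketch (hypothetical stand-in classes, not real theano Ops) of the two declarations the DestroyHandler docstring above describes: view_map says an output aliases an input, and destroy_map says an output overwrites an input in place. on_import looks them up with getattr, exactly as in the code.

class TransposeView(object):    # stand-in for an Op whose output views its input
    view_map = {0: [0]}         # output 0 is a view of input 0

class AddInplace(object):       # stand-in for an in-place elemwise Op
    destroy_map = {0: [0]}      # output 0 destroys (overwrites) input 0

print getattr(AddInplace, 'destroy_map', {})     # {0: [0]} -> tracked as a destroyer
print getattr(TransposeView, 'destroy_map', {})  # {} -> not a destroyer
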
Example #43
0
    def get_lr_scalers(self):
        rval = OrderedDict()
        if self.encoder is not None:
            safe_update(rval, self.encoder.get_lr_scalers())
        return rval
Example #44
0
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors,  etc.)
    """
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been trained at
        all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is called from
        redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be compiled and
            run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self,
                    dataset,
                    mode='sequential',
                    batch_size=None,
                    num_batches=None,
                    seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method \
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is \
            'sequential' and `num_batches` is specified (batch size \
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is \
            'sequential' and `batch_size` is specified (number of \
            batches will be calculated based on full dataset size).
        seed : int, optional
            The seed to be used for random iteration modes.
        """
        # The user can omit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in " +
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)
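
        # Illustration of the list-wrapping above (hypothetical dataset object):
        #     monitor.add_dataset(train_set, mode='sequential', batch_size=100)
        # behaves like
        #     monitor.add_dataset([train_set], mode=['sequential'],
        #                         batch_size=[100], num_batches=[None],
        #                         seed=[None])
        # Stochastic iteration modes additionally require an explicit integer
        # (or list-of-int) seed instead of None or a RandomState object.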

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one data point
        to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches, self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()

            # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data, so
            # there is no need to iterate over batches to average the
            # monitored values; we only have to call the accum function once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had " + str(ne) +
                                       " examples total, but at "
                                       "runtime it gave us " + str(actual_ne) +
                                       ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequistie functions" on a batch of data. Always called
        right before computing the monitoring channels on that batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on (assuming that
        the learning code has been calling Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming that
            the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of learning.
        We regard one pass through Dataset.iterator as one epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and the
        channel definitions have changed since last we called it, or if the
        theano functions are unavailable for any other reason (first time they
        are needed after construction or deserialization, etc.)

        All channels are compiled as part of the same theano function so that
        the theano optimizations can eliminate subexpressions that are shared
        between multiple channels.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        theano_args = self._flat_data_specs[0].make_theano_batch(
            ['monitoring_%s' % s for s in self._flat_data_specs[1]])

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b,
                         data_specs=self._flat_data_specs,
                         return_tuple=True) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                          self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel needs any data, so there is no need to
                # average results; we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                if n == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                val = (channel.val * T.cast(batch_size, config.floatX) /
                       cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val
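            # In the data-dependent case, summing these batch_size /
            # num_examples weighted values over every batch leaves val_shared
            # holding the example-weighted mean of channel.val over the whole
            # monitoring dataset.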

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, i.e., they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(
                    function(theano_args,
                             givens=g,
                             updates=u,
                             mode=self.theano_function_mode,
                             name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self,
                    name,
                    ipt,
                    val,
                    dataset=None,
                    prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data. \
            (or a list/tuple containing symbolic tensors, following the \
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per new batch of \
            data drawn *from dataset* before the channel value is computed. \
            If two channels provide a prereq with exactly the same id, that \
            prereq will only be called once.
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt, )

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and dataset_source is not None
                        and (not isinstance(dataset_source, tuple)
                             or len(dataset_source) == 1)
                        and 'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2
                      and dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s" %
                                     (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt, )
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(
                    elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not one of the " +
                             "monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a mangled
        state. I've added some calls to _sanity_check to try to catch when
        that happens. Not sure what to do for a long term fix. I think it
        requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in \
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self,
              dataset,
              cost,
              batch_size,
              num_batches=None,
              extra_costs=None,
              mode='sequential',
              obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the cost's `expr` method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.  If \
            string names are used, then for every dataset, each channel \
            defined by the model or cost will be replicated with that \
            dataset's name followed by an underscore as the prefix. For \
            example, if your cost defines a channel called 'misclass', and \
            datasets is {'train' : train_dataset, 'valid' : valid_dataset} \
            you will get channels called 'train_misclass' and 'valid_misclass'.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost will
            appear as the `objective` channel. Its `get_monitoring_channels`
            method will also be used to supply other channels.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to \
            `cost.get_monitoring_channels()` (but not for the extra_costs).
        """

        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(
            space.make_theano_batch(name='monitor_%s' % source,
                                    batch_size=None)
            for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name], cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a (channel, prereqs) pair; this is not supported.
            channels[name] = (model_channels[name], nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # The channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
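
A minimal usage sketch of the Monitor API above. Every name below (model,
valid_set, X, err, train_batches) is a hypothetical stand-in, and in practice
channels are usually registered by `setup` or by the training algorithm rather
than by hand:

monitor = Monitor.get_monitor(model)        # installs a Monitor if absent
monitor.add_dataset(dataset=valid_set,      # a pylearn2 Dataset instance
                    mode='sequential',
                    batch_size=100)
# X is the symbolic batch the channel is clamped to; err is a function of X
monitor.add_channel(name='valid_err',
                    ipt=X,
                    val=err,
                    dataset=valid_set,
                    data_specs=model.get_monitoring_data_specs())

# during training, report progress and periodically evaluate all channels
for epoch in range(10):
    for X_batch in train_batches:
        monitor.report_batch(len(X_batch))
    monitor.report_epoch()
    monitor()   # appends one new value to each channel's val_record
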
Example #45
0
    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training.
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
Example #46
0
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 parent=[],
                 parent_dim=[],
                 nout=None,
                 init_W=InitCell('randn'),
                 init_b=InitCell('zeros'),
                 cons=0.,
                 name=None,
                 lr_scaler=None,
                 **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError(
                        "The number of values in parent and parent_dim "
                        "must match; otherwise the model will contain a bug.")
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.params = OrderedDict()
        self.lr_scaler = lr_scaler

    def get_params(self):
        return self.params

    def fprop(self, x=None):
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def alloc(self, x):
        self.params[x.name] = x

    def initialize(self):
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            self.alloc(self.init_W.get(W_shape, W_name))
        self.alloc(self.init_b.get(self.nout, 'b_' + self.name))

    def add_noisy_params(self, key=['W'], weight_noise=0.075):
        self.noisy_params = OrderedDict()
        for pname, pval in self.params.items():
            if pname.split('_')[0] in key:
                self.noisy_params[pname] = add_noise(
                    pval, weight_noise, self.theano_rng)

    def del_noisy_params(self):
        del self.noisy_params
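
A short usage sketch for the cell above. `FullyConnectedCell` is a hypothetical
concrete subclass (StemCell itself leaves `fprop` unimplemented); the point is
only to show how `parent`/`parent_dim` populate `self.parent` and how
`initialize` names the allocated parameters:

cell = FullyConnectedCell(parent=['x'], parent_dim=[784], nout=256, name='h1')
cell.initialize()
# cell.parent is OrderedDict([('x', 784)])
# cell.params now holds 'W_x__h1' with shape (784, 256) and 'b_h1' with
# shape (256,), created by init_W.get and init_b.get respectively
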
Example #47
0
    def __init__(self, iterable=None):
        self.data = OrderedDict()
        if iterable is not None:
            self.update(iterable)
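
The fragment above delegates storage to an internal OrderedDict and relies on
an `update` method defined elsewhere in the class. A self-contained sketch of
the same pattern (the class name is hypothetical):

from collections import OrderedDict


class OrderedStore(object):
    """Thin mapping wrapper that preserves insertion order."""

    def __init__(self, iterable=None):
        self.data = OrderedDict()
        if iterable is not None:
            self.update(iterable)

    def update(self, iterable):
        # accepts a mapping or an iterable of (key, value) pairs
        self.data.update(iterable)

    def __getitem__(self, key):
        return self.data[key]


store = OrderedStore([('W', 1), ('b', 2)])
print(list(store.data.items()))     # [('W', 1), ('b', 2)]
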
Example #48
0
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many passes (epochs) over the training dataset to perform.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by which the best validation score needs to improve to avoid stopping early.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            If provided, clip the gradients to this norm, either with a hard cutoff or by rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))
        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)
        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)

        n_model_targets = len(model_targets)
        self.unsupervised = (n_model_targets == 0)
        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip
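
        # A hypothetical instantiation, to make the argument roles concrete
        # (mnist, cross_entropy and mlp stand in for Dataset, Loss and Model
        # instances):
        #     opt = Optimizer(dataset=mnist, loss=cross_entropy, model=mlp,
        #                     epochs=200, batch_size=128, learning_rate=0.01,
        #                     grad_clip=5., hard_clip=False)
        #     opt.train()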

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates
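
        # Concrete illustration (hypothetical names): with learning_rate = 0.1
        # and lr_scalers = {W: 0.5}, one call to the compiled update maps
        #     W <- W - (0.1 * 0.5) * dL/dW
        #     b <- b -  0.1        * dL/db    (no scaler registered for b)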

    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training.
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
            ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
                ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors (if different from the train cost)
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = self.model.get_param_values(borrow=False)
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            #save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the
            learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params

    def build_computation_graph(self):
        #################
        # Build the GSN #
        #################
        log.debug("Building GSN graphs...")

        # GSN for training - with noise specified in initialization
        # if there is no hiddens_hook, build the GSN normally using the input X
        if not self.hiddens_flag:
            p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)

        # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
        # generative from the hiddens
        else:
            p_X_chain, _, = self.build_gsn(hiddens=self.hiddens,
                                           add_noise=self.add_noise,
                                           reverse=True)

        # GSN for prediction - same as above but no noise
        # deal with hiddens_hook exactly as above.
        if not self.hiddens_flag:
            p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
        else:
            p_X_chain_recon, recon_hiddens = self.build_gsn(
                hiddens=self.hiddens, add_noise=False, reverse=True)

        ####################
        # Costs and output #
        ####################
        log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
        # use the noisy ones for training cost
        costs = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain
        ]
        self.show_cost = costs[-1]  # for a monitor to show progress
        cost = numpy.sum(
            costs
        )  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

        # use the non-noisy graph for prediction
        gsn_costs_recon = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain_recon
        ]
        # another monitor, same as self.show_cost but on the non-noisy graph.
        self.monitor = gsn_costs_recon[-1]
        # this should be considered the main output of the computation, the sample after the
        # last walkback from the non-noisy graph.
        output = p_X_chain_recon[-1]
        # these should be considered the model's hidden representation - the hidden representation after
        # the last walkback from the non-noisy graph.
        hiddens = recon_hiddens

        train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
        train_mse = T.mean(train_mse)

        mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
        mse = T.mean(mse)

        monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                                ('recon_cost', self.monitor), ('mse', mse),
                                ('train_mse', train_mse)])

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.matrix("X_sampling")
        self.network_state_input = [X_sample] + [
            T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)
        ]

        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []

        # ONE update
        log.debug("Performing one walkback in network state sampling.")
        self.update_layers(self.network_state_output,
                           visible_pX_chain,
                           add_noise=True,
                           reverse=False)

        ########################################
        # Create the run and monitor functions #
        ########################################
        log.debug("Compiling functions...")
        t = time.time()

        # doesn't make sense to have this if there is a hiddens_hook
        if not self.hiddens_flag:
            # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
            # computation graph
            log.debug("f_run...")
            self.f_run = function(inputs=[self.X],
                                  outputs=output,
                                  name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X],
                                outputs=self.input_noise(self.X),
                                name='gsn_f_noise')

        # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
        # or visualization)
        log.debug("f_sample...")
        if self.layers == 1:
            self.f_sample = function(inputs=[X_sample],
                                     outputs=visible_pX_chain[-1],
                                     name='gsn_f_sample_single_layer')
        else:
            # Theano warns about unused inputs here: the odd layers are not taken
            # from the given inputs but are computed directly from the even layers,
            # so some of the network_state_input variables go unused in this graph.
            self.f_sample = function(inputs=self.network_state_input,
                                     outputs=self.network_state_output +
                                     visible_pX_chain,
                                     name='gsn_f_sample')

        log.debug("GSN compiling done. Took %s",
                  make_time_units_string(time.time() - t))

        return cost, monitors, output, hiddens
Example #50
    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.
        """
        self._dirty = False

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        updates = OrderedDict()
        givens = OrderedDict()
        # Get the appropriate kind of theano variable to represent the data the model
        # acts on
        X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo)
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                          self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            if n == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            val = channel.val * T.cast(X.shape[batch_index],
                                       config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' \
                                + key.name + ' has dtype ' + key.dtype + \
                                ' but is driven by an expression with type ' + \
                                up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might just monitor the model
                    # parameters, or some shared variable updated by the training algorithm, so we
                    # need to ignore the unused input error
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])
Example #51
    def add_noisy_params(self, key=['W'], weight_noise=0.075):
        self.noisy_params = OrderedDict()
        for name, param in self.params.items():
            if name.split('_')[0] in key:
                self.noisy_params[name] = add_noise(
                    param, weight_noise, self.theano_rng)
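The add_noise helper called above is not shown in this example. As a rough sketch only, assuming additive Gaussian weight noise drawn from a Theano RandomStreams object (the signature below is an assumption, not the library's definition), it could look like:

import theano

def add_noise(param, sigma, theano_rng):
    # additive zero-mean Gaussian noise on a (shared) parameter expression
    noise = theano_rng.normal(size=param.shape, avg=0.0, std=sigma,
                              dtype=theano.config.floatX)
    return param + noise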
Example #52
def clip_gradients(gradients, grad_clip=5., hard_clip=False):
    """
    This returns the gradient parameters clipped according to the grad_clip value given in initialization.

    As described here: http://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/

    Code mostly taken from https://github.com/kastnerkyle/minet/blob/master/minet/net.py

    Based on:

    Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training
            recurrent neural networks." arXiv preprint arXiv:1211.5063 (2012).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    grad_clip : float, optional
        How much to clip gradients (if at all).
    hard_clip : bool
        Whether to use hard clipping (keeping gradients at grad_clip level), or soft clipping (rescaling based
        on grad_clip).

    Returns
    -------
    clipgrads : dict
        A dictionary mapping from the model's parameters to their correctly clipped
        gradients. (If no self.grad_clip, this just returns the original `gradients` input parameter).
    """
    if grad_clip:
        gradients = gradients.items()
        params = [item[0] for item in gradients]
        grads = [item[1] for item in gradients]

        # Gradient clipping: overall L2 norm of all the gradients
        grad_norm = T.sqrt(sum([T.sqr(grad).sum() for grad in grads]))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = grad_clip
        scaling_den = T.maximum(grad_clip, grad_norm)

        if hard_clip:
            # do the NaN/inf trick
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad)
                     for param, grad in gradients]
            # hard clip gradients above or below grad_clip to be = grad_clip
            grads = [T.switch(T.ge(grad_norm, grad_clip),
                              T.sgn(grad) * grad_clip,
                              grad)
                     for grad in grads]
        else:
            # NaN/inf trick combined with scaling.
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad * (scaling_num / scaling_den))
                     for param, grad in gradients]

        clipgrads = OrderedDict(zip(params, grads))
        return clipgrads
    else:
        return gradients
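A minimal usage sketch for clip_gradients, assuming Theano is available; the parameter shapes, the quadratic toy cost, and the learning rate below are illustrative and not taken from the surrounding examples:

import numpy
import theano
import theano.tensor as T
from collections import OrderedDict

# two toy parameters and a quadratic cost
W = theano.shared(numpy.random.randn(10, 5).astype(theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(5, dtype=theano.config.floatX), name='b')
x = T.matrix('x')
cost = T.sqr(T.dot(x, W) + b).mean()

# gradients as the {parameter: gradient} dictionary clip_gradients expects
grads = OrderedDict(zip([W, b], T.grad(cost, [W, b])))

# soft clipping: rescale the gradients when their joint norm exceeds 5
clipped = clip_gradients(grads, grad_clip=5., hard_clip=False)

# the clipped gradients can then feed any update rule, e.g. vanilla SGD
updates = OrderedDict((p, p - 0.01 * g) for p, g in clipped.items())
f_step = theano.function([x], cost, updates=updates)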
Example #53
def init_params(params):
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams
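For context, a quick sketch of how a helper like init_params is typically used (the parameter names and shapes are made up): numpy arrays keyed by name go in, Theano shared variables come out, and the current values can be read back with get_value.

import numpy
import theano
from collections import OrderedDict

# hypothetical numpy parameters, e.g. loaded from a saved .npz checkpoint
params = OrderedDict()
params['W_emb'] = 0.01 * numpy.random.randn(1000, 128).astype(theano.config.floatX)
params['b_out'] = numpy.zeros(10, dtype=theano.config.floatX)

tparams = init_params(params)              # name -> theano.shared
print tparams['W_emb'].get_value().shape   # (1000, 128)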
Example #54
class AlexNet(Model):
    """
    This is the base model for AlexNet, Alex Krizhevsky's efficient deep convolutional net described in:
    'ImageNet Classification with Deep Convolutional Neural Networks'
    Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton
    http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf

    Most of the code here is adapted from the authors listed in the license above, from the paper:
    'Theano-based large-scale visual recognition with multiple GPUs'
    Weiguang Ding & Ruoyan Wnag, Fei Mao, Graham Taylor
    http://arxiv.org/pdf/1412.2302.pdf

    Copyright (c) 2014, Weiguang Ding, Ruoyan Wang, Fei Mao and Graham Taylor
    All rights reserved.
    Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
        1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
        2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
        3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    """
    defaults = {  # data stuff
        "use_data_layer": False,
        "rand_crop": True,
        "batch_size":
        256,  # convolutional nets are particular about the batch size
        "output_path": '/outputs/alexnet/'
    }

    def __init__(self,
                 config=None,
                 defaults=defaults,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 use_data_layer=None,
                 rand_crop=None,
                 batch_size=None):
        # init Model to combine the defaults and config dictionaries.
        super(AlexNet, self).__init__(config, defaults)
        # all configuration parameters are now in self.args

        if inputs_hook or hiddens_hook or params_hook:
            log.critical(
                "Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!"
            )
            raise NotImplementedError()

        self.flag_datalayer = use_data_layer or self.args.get('use_data_layer')
        self.batch_size = batch_size or self.args.get('batch_size')
        self.rand_crop = rand_crop or self.args.get('rand_crop')

        ####################
        # Theano variables #
        ####################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.rand = T.fvector('rand')

        ##########
        # params #
        ##########
        self.params = []

        # make the network!
        self.build_computation_graph()

    def build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(
                    self.batch_size,
                    3,
                    256,
                    256,
                ),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in c01b format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227,
                                                      227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((
            self.batch_size,
            96,
            27,
            27,
        ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13),
                         convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                               output_size=4096,
                               config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()

        # now apply dropout to the output for training
        dropout_layer6 = dropout(fc_layer6.get_outputs(), corruption_level=0.5)

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                               output_size=4096,
                               config=fc_config)
        fc_layer7_train = BasicLayer(inputs_hook=(4096, dropout_layer6),
                                     output_size=4096,
                                     params_hook=fc_layer7.get_params(),
                                     config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer7_train.get_params()

        # apply dropout again for training
        dropout_layer7 = dropout(fc_layer7_train.get_outputs(),
                                 corruption_level=0.5)

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                                   fc_layer7.get_outputs()),
                                      output_size=1000,
                                      config=softmax_config)
        softmax_layer8_train = SoftmaxLayer(
            inputs_hook=(4096, dropout_layer7),
            output_size=1000,
            params_hook=softmax_layer8.get_params(),
            config=softmax_config)
        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8_train.negative_log_likelihood(self.y)
        cost = softmax_layer8.negative_log_likelihood(self.y)
        errors = softmax_layer8.errors(self.y)
        train_errors = softmax_layer8_train.errors(self.y)

        self.monitors = OrderedDict([('cost', cost), ('errors', errors),
                                     ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_predict...")
        # use the actual argmax from the classification
        self.f_predict = function(
            inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("f_monitors")
        self.f_monitors = function(inputs=[self.x, self.y],
                                   outputs=self.monitors.values())
        log.debug("compilation took %s" %
                  make_time_units_string(time.time() - t))

    def get_inputs(self):
        """
        This should return the input(s) to the model's computation graph. This is called by the Optimizer when creating
        the theano train function on the cost expression returned by get_train_cost().

        This should normally return the same theano variable list that is used in the inputs= argument to the f_predict
        function.
        ------------------

        :return: Theano variables representing the input(s) to the training function.
        :rtype: List(theano variable)
        """
        return [self.x]

    def get_outputs(self):
        """
        This method will return the model's output variable expression from the computational graph.
        This should be what is given for the outputs= part of the 'f_predict' function from self.predict().

        This will be used for creating hooks to link models together, where these outputs can be strung as the inputs
        or hiddens to another model :)
        ------------------

        :return: theano expression of the outputs from this model's computation
        :rtype: theano tensor (expression)
        """
        return self.output

    def predict(self, input):
        """
        This method will return the model's output (run through the function), given an input. In the case that
        input_hooks or hidden_hooks are used, the function should use them appropriately and assume they are the input.

        Try to avoid re-compiling the theano function created for predict - check a hasattr(self, 'f_predict') or
        something similar first. I recommend creating your theano f_predict in a create_computation_graph method
        to be called after the class initializes.
        ------------------

        :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
        :type input: tensor

        :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
        :rtype: tensor
        """
        if not hasattr(self, 'f_predict'):
            log.error(
                "Missing self.f_predict - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        return self.f_predict(*input)

    def get_train_cost(self):
        """
        This returns the expression that represents the cost given an input, which is used for the Optimizer during
        training. The reason we can't just compile a f_train theano function is because updates need to be calculated
        for the parameters during gradient descent - and these updates are created in the Optimizer object.
        ------------------

        :return: theano expression of the model's training cost, from which parameter gradients will be computed.
        :rtype: theano tensor
        """
        return self.train_cost

    def get_monitors(self):
        """
        This returns a dictionary of (monitor_name: monitor_function) of variables (monitors) whose values we care
        about during training. For every monitor returned by this method, the function will be run on the
        train/validation/test dataset and its value will be reported.

        Again, please avoid recompiling the monitor functions every time - check your hasattr to see if they already
        exist!
        ------------------

        :return: Dictionary of String: theano_function for each monitor variable we care about in the model.
        :rtype: Dictionary
        """
        if not hasattr(self, 'f_monitors'):
            log.error(
                "Missing self.f_monitors - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        names = ', '.join(self.monitors.keys())
        return {names: self.f_monitors}

    def get_params(self):
        """
        This returns the list of theano shared variables that will be trained by the Optimizer.
        These parameters are used in the gradient.
        ------------------

        :return: flattened list of theano shared variables to be trained
        :rtype: List(shared_variables)
        """
        return self.params
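In the AlexNet example above, each fully-connected layer is built twice with shared parameters: once on the clean activations (used for prediction) and once on the dropped-out activations (used for the training cost), so dropout only affects the training path. A stripped-down sketch of that pattern in plain Theano, using a hypothetical dropout helper and sizes chosen only for illustration:

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(seed=123)

def dropout(x, corruption_level=0.5):
    # zero out units with probability corruption_level (training path only)
    mask = rng.binomial(size=x.shape, p=1. - corruption_level,
                        dtype=theano.config.floatX)
    return x * mask

x = T.matrix('x')
W = theano.shared(0.01 * numpy.random.randn(100, 50).astype(theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(50, dtype=theano.config.floatX), name='b')

clean_out = T.maximum(0., T.dot(x, W) + b)            # evaluation path, no dropout
train_out = T.maximum(0., T.dot(dropout(x), W) + b)   # training path, same W and b

Because both expressions share W and b, gradient updates computed from the training path automatically carry over to the clean prediction path.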
Example #55
    def __init__(self, nh, nc, ne, de, cs, em, init):
        """
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        em :: optional pre-trained embedding rows (entries may be None)
        init :: whether to initialize the embedding matrix from em
        """
        tmp_emb = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))
        if init:
            for row in xrange(ne + 1):
                if em[row] is not None:
                    tmp_emb[row] = em[row]

        self.emb = theano.shared(tmp_emb.astype(theano.config.floatX))

        self.Wx = theano.shared(
            0.2 *
            numpy.random.uniform(-1.0, 1.0,
                                 (de * cs, nh)).astype(theano.config.floatX))
        self.Ws = theano.shared(
            0.2 * numpy.random.uniform(-1.0, 1.0,
                                       (nc, nh)).astype(theano.config.floatX))
        self.W = theano.shared(
            0.2 * numpy.random.uniform(-1.0, 1.0,
                                       (nh, nc)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
        self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
        self.s0 = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

        # bundle
        self.params = [
            self.emb, self.Wx, self.Ws, self.W, self.bh, self.b, self.s0
        ]
        self.names = ['embeddings', 'Wx', 'Ws', 'W', 'bh', 'b', 's0']
        # one column per word in the context window, one row per word in the sentence
        idxs = T.imatrix()
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')  # label

        def recurrence(x_t, s_tm1):
            h_t = T.nnet.sigmoid(
                T.dot(x_t, self.Wx) + T.dot(s_tm1, self.Ws) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)[0]
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[None, self.s0],
                                n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, :]
        p_y_given_x_sentence = s
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.mean(T.log(p_y_given_x_lastword)[y])
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function(
            inputs=[],
            updates={
                self.emb:
                self.emb / T.sqrt(
                    (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
            })
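A toy usage sketch for the Elman-style network above, assuming its __init__ belongs to a class named ElmanRNN (the class name, the sizes, and the random word indices below are all illustrative):

import numpy

# hypothetical sizes: 100 hidden units, 5 classes, 1000-word vocabulary,
# 50-dim embeddings, context window of 3, no pre-trained embeddings
rnn = ElmanRNN(nh=100, nc=5, ne=1000, de=50, cs=3, em=None, init=False)

# idxs: one row per word in the sentence, cs columns of context-window word indices
idxs = numpy.random.randint(0, 1000, size=(7, 3)).astype('int32')
y = numpy.int32(2)                # class label, scored on the last word

for epoch in range(5):
    nll = rnn.train(idxs, y, 0.1)
    rnn.normalize()               # keep each embedding row at unit L2 norm

predictions = rnn.classify(idxs)  # per-word argmax over the nc classes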
Example #56
    def get_monitoring_channels(self, model, X, Y = None, drop_mask = None, drop_mask_Y = None, **kwargs):
        """
        .. todo::

            WRITEME
        """

        if self.supervised:
            assert Y is not None

        rval = OrderedDict()

        # TODO: shouldn't self() handle this?
        if drop_mask is not None and drop_mask.ndim < X.ndim:
            if self.mask_gen is not None:
                assert self.mask_gen.sync_channels
            if X.ndim != 4:
                raise NotImplementedError()
            drop_mask = drop_mask.dimshuffle(0,1,2,'x')

        scratch = self(model, X, Y, drop_mask = drop_mask, drop_mask_Y = drop_mask_Y,
                return_locals = True)

        history = scratch['history']
        new_history = scratch['new_history']
        new_drop_mask = scratch['new_drop_mask']
        new_drop_mask_Y = None
        drop_mask = scratch['drop_mask']
        if self.supervised:
            drop_mask_Y = scratch['drop_mask_Y']
            new_drop_mask_Y = scratch['new_drop_mask_Y']

        ii = 0
        for name in ['inpaint_cost', 'l1_act_cost', 'toronto_act_cost',
                'reweighted_act_cost']:
            var = scratch[name]
            if var is not None:
                rval['total_inpaint_cost_term_'+str(ii)+'_'+name] = var
                ii = ii + 1

        if self.monitor_each_step:
            for ii, packed in enumerate(safe_izip(history, new_history)):
                state, new_state = packed
                rval['all_inpaint_costs_after_' + str(ii)] = self.cost_from_states(state,
                        new_state,
                        model, X, Y, drop_mask, drop_mask_Y,
                        new_drop_mask, new_drop_mask_Y)

                if ii > 0:
                    prev_state = history[ii-1]
                    V_hat = state['V_hat']
                    prev_V_hat = prev_state['V_hat']
                    rval['max_pixel_diff[%d]'%ii] = abs(V_hat-prev_V_hat).max()

        final_state = history[-1]

        #empirical beta code--should be moved to gaussian visible layer, should support topo data
        #V_hat = final_state['V_hat']
        #err = X - V_hat
        #masked_err = err * drop_mask
        #sum_sqr_err = T.sqr(masked_err).sum(axis=0)
        #recons_count = T.cast(drop_mask.sum(axis=0), 'float32')

        # empirical_beta = recons_count / sum_sqr_err
        # assert empirical_beta.ndim == 1


        #rval['empirical_beta_min'] = empirical_beta.min()
        #rval['empirical_beta_mean'] = empirical_beta.mean()
        #rval['empirical_beta_max'] = empirical_beta.max()

        layers = model.get_all_layers()
        states = [ final_state['V_hat'] ] + final_state['H_hat']

        for layer, state in safe_izip(layers, states):
            d = layer.get_monitoring_channels_from_state(state)
            for key in d:
                mod_key = 'final_inpaint_' + layer.layer_name + '_' + key
                assert mod_key not in rval
                rval[mod_key] = d[key]

        if self.supervised:
            inpaint_Y_hat = history[-1]['H_hat'][-1]
            err = T.neq(T.argmax(inpaint_Y_hat, axis=1), T.argmax(Y, axis=1))
            assert err.ndim == 1
            assert drop_mask_Y.ndim == 1
            err =  T.dot(err, drop_mask_Y) / drop_mask_Y.sum()
            if err.dtype != inpaint_Y_hat.dtype:
                err = T.cast(err, inpaint_Y_hat.dtype)

            rval['inpaint_err'] = err

            Y_hat = model.mf(X)[-1]

            Y = T.argmax(Y, axis=1)
            Y = T.cast(Y, Y_hat.dtype)

            argmax = T.argmax(Y_hat,axis=1)
            if argmax.dtype != Y_hat.dtype:
                argmax = T.cast(argmax, Y_hat.dtype)
            err = T.neq(Y , argmax).mean()
            if err.dtype != Y_hat.dtype:
                err = T.cast(err, Y_hat.dtype)

            rval['err'] = err

            if self.monitor_multi_inference:
                Y_hat = model.inference_procedure.multi_infer(X)

                argmax = T.argmax(Y_hat,axis=1)
                if argmax.dtype != Y_hat.dtype:
                    argmax = T.cast(argmax, Y_hat.dtype)
                err = T.neq(Y , argmax).mean()
                if err.dtype != Y_hat.dtype:
                    err = T.cast(err, Y_hat.dtype)

                rval['multi_err'] = err

        return rval
Example #57
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors, etc.)
    """
    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(
            name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self,
                    dataset,
                    mode='sequential',
                    batch_size=None,
                    num_batches=None,
                    seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        """
        # The user can omit the lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, num_batches, seed]]):
            raise ValueError("make sure each dataset has its iteration " + \
                        "batch size, number of batches, and seed.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                topo=self.topo,
                                targets=self.require_label,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators using
                # it would update its state, so we would not get the same iterator
                # each time
                # Also, must not be None, because this makes the iterator pick
                # a seed based on the clock
                if sd is None:
                    raise TypeError(
                        "Monitor requires a seed when using stochastic iteration modes."
                    )
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError(
                        "Monitor requires a seed (not a random number generator) when using stochastic iteration modes."
                    )
            else:
                assert sd is None  # the iterator should catch this, but let's double-check

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        model = self.model
        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()

        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches, self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X, y, d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError(
                    "At compile time, your iterator said it had " + str(ne) +
                    " examples total, but at runtime it gave us " +
                    str(actual_ne) + ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X, y)

    def get_batches_seen(self):
        """ Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """ Returns the number of epochs the model has trained for (assuming
        that the learning code has been calling Monitor.report_epoch correctly)
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """ Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """ Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on. """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """ Call this whenever the model has completed another epoch of
        learning. """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.
        """
        self._dirty = False

        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

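        # Compile begin_record_entry, a function that resets every channel's
        # shared accumulator to zero at the start of a monitoring pass.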
        updates = OrderedDict()
        for channel in self.channels.values():
            updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')
        # Get the appropriate kind of theano variable to represent the data the model
        # acts on
        X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)
        it = [d.iterator(mode=i, num_batches=n, batch_size=b, topo=self.topo) \
              for d, i, n, b in safe_izip(self._datasets, self._iteration_mode,
                                    self._num_batches, self._batch_size)]
        self.num_examples = [
            np.cast[config.floatX](float(i.num_examples)) for i in it
        ]
        givens = [OrderedDict() for d in self._datasets]
        updates = [OrderedDict() for d in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            if cur_num_examples == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
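            # Accumulate a running mean: each call to the accum function adds
            # this channel's value weighted by (batch size / total number of
            # examples), so after a full pass over the dataset val_shared
            # holds the dataset-wide mean.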
            val = channel.val * T.cast(X.shape[batch_index],
                                       config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' \
                                + key.name + ' has dtype ' + key.dtype + \
                                ' but is driven by an expression with type ' + \
                                up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, e.g. they might
                    # just monitor the model parameters or some shared variable
                    # updated by the training algorithm, so we need to ignore
                    # the unused-input error when compiling.
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name,
                                 on_unused_input='ignore'))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name,
                                 on_unused_input='ignore'))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn(
                            'Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):

        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name: str
            The display name in the monitor.
        ipt: tensor_like
            The symbolic tensor which should be clamped to the data, or a
            (features, targets) list/tuple containing two symbolic tensors.
        val: tensor_like
            The value (function of `ipt`) to be tracked.
        dataset: Dataset
            The Dataset instance on which to compute this channel.
        prereqs: list of callables
            Each callable takes two numpy tensors (X and y, where y will be
            None if no labels are used). Each prereq is called exactly once
            per batch of data drawn from `dataset` before the channel value
            is computed. If two channels provide a prereq with exactly the
            same id, that prereq is called only once.
        """

        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ipt]
        else:
            tmp = ipt
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(
                    elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: " + str(elem))

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(ipt) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " + \
                "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel ("+name \
                            +") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True
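
    # Usage sketch (illustrative, with hypothetical `model.encode` and
    # `train_set` placeholders): track the mean of a model output on one of
    # the monitor's datasets.
    #
    #     X = model.get_input_space().make_theano_batch(name='X')
    #     monitor = Monitor.get_monitor(model)
    #     monitor.add_channel(name='mean_code',
    #                         ipt=X,
    #                         val=model.encode(X).mean(),
    #                         dataset=train_set)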

    def _sanity_check(self):
        """
            Sometimes we serialize models and then load them somewhere else
            but still try to use their Monitor, and the Monitor is in a mangled
            state. I've added some calls to _sanity_check to try to catch when
            that happens. Not sure what to do for a long term fix. I think it
            requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval
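
    # Typical use (sketch): obtain or lazily install a model's monitor, then
    # register datasets and channels through setup(). `train_set` and
    # `mse_cost` are illustrative placeholders.
    #
    #     monitor = Monitor.get_monitor(model)
    #     monitor.setup(dataset={'train': train_set},
    #                   cost=mse_cost,
    #                   batch_size=100)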

    # TODO: find out whether the properties below are used anywhere; remove
    # them if not.
    @property
    def batch_size(self):
        return self._batch_size

    @property
    def num_batches(self):
        return self._num_batches

    def setup(self,
              dataset,
              cost,
              batch_size,
              num_batches=None,
              extra_costs=None,
              mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the cost's __call__ method.

        dataset: a Dataset or dictionary mapping string names to Datasets
                    If string names are used, then for every dataset,
                    each channel defined by the model or cost will be
                    replicated with that dataset's name followed by an
                    underscore as the prefix.
                    For example, if your cost defines a channel called
                    'misclass', and datasets is {'train' : train_dataset,
                    'valid' : valid_dataset} you will get channels called
                    'train_misclass' and 'valid_misclass'.

        cost: a Cost

        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            costs = extra_costs
        assert '' not in costs
        costs[''] = cost

        supervised = any(cost.supervised for cost in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(
                X.dtype)

        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(batch_size).astype(
                    Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            raw_channels = cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix + name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)

        if is_stochastic(mode):
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # The channel name 'objective' must not vary, since callbacks that
            # respond to the values in the monitor use this name to find it.
            for cost_name in costs:
                cost = costs[cost_name]
                cost_value = cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name,
                                     ipt=ipt,
                                     val=cost_value,
                                     dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=custom_channels[key],
                                 dataset=cur_dataset)
Example #58
0
    def get_gradients(self, model, data, **kwargs):
        """
        .. todo::

            WRITEME
        """
        self.get_data_specs(model)[0].validate(data)
        obj, scratch = self.base_cost.expr(model, data, return_locals=True,
                                           **kwargs)
        if self.supervised:
            assert isinstance(data, (list, tuple))
            assert len(data) == 2
            (X, Y) = data
        else:
            X = data

        H_hat = scratch['H_hat']
        terms = scratch['terms']
        hidden_layers = scratch['hidden_layers']

        grads = OrderedDict()

        assert len(H_hat) == len(terms)
        assert len(terms) == len(hidden_layers)
        num_layers = len(hidden_layers)
        for i in xrange(num_layers):
            state = H_hat[i]
            layer = model.hidden_layers[i]
            term = terms[i]

            if term == 0.:
                continue
            else:
                print 'term is ',term

            if i == 0:
                state_below = X
                layer_below = model.visible_layer
            else:
                layer_below = model.hidden_layers[i-1]
                state_below = H_hat[i-1]
            state_below = layer_below.upward_state(state_below)

            components = flatten(state)

            real_grads = T.grad(term, components)

            fake_state = layer.linear_feed_forward_approximation(state_below)

            fake_components = flatten(fake_state)
            real_grads = OrderedDict(safe_zip(fake_components, real_grads))

            params = list(layer.get_params())
            fake_grads = pylearn2.utils.grad(
                cost=None,
                consider_constant=flatten(state_below),
                wrt=params,
                known_grads=real_grads
            )

            for param, grad in safe_zip(params, fake_grads):
                if param in grads:
                    grads[param] = grads[param] + grad
                else:
                    grads[param] = grad

        return grads, OrderedDict()
Example #59
0
    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        if self.encoder is not None:
            rval = self.encoder.get_layer_monitoring_channels(state_below=data)
        return rval
Example #60
0
class Cost(object):
    """
    Represents a cost that can be called either as a supervised cost or an
    unsupervised cost.
    """

    # If True, the data argument to expr and get_gradients must be a
    # (X, Y) pair, and Y cannot be None.
    supervised = False

    def expr(self, model, data, **kwargs):
        """
        Returns a theano expression for the cost function.

        Parameters
        ----------
        model: a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments. Not used by the base class.

        Returns a symbolic expression for a cost function applied to the
        minibatch of data.
        Optionally, may return None. This represents that the cost function
        is intractable but may be optimized via the get_gradients method.

        """
        raise NotImplementedError(
            str(type(self)) + " does not implement "
            "expr.")

    def get_gradients(self, model, data, **kwargs):
        """
        Provides the gradients of the cost function with respect to the model
        parameters. These are not necessarily those obtained by
        theano.tensor.grad--you may wish to use approximate or even
        intentionally incorrect gradients in some cases.

        Parameters
        ----------
        model : a pylearn2 Model instance
        data : a batch in cost.get_data_specs() form
        kwargs : dict
            Optional extra arguments, not used by the base class.

        Returns
        -------
        gradients: OrderedDict
            a dictionary mapping from the model's parameters
            to their gradients
            The default implementation is to compute the gradients
            using T.grad applied to the value returned by expr.
            However, subclasses may return other values for the gradient.
            For example, an intractable cost may return a sampling-based
            approximation to its gradient.
        updates: OrderedDict
            a dictionary mapping shared variables to updates that must
            be applied to them each time these gradients are computed.
            This is to facilitate computation of sampling-based approximate
            gradients.
            The parameters should never appear in the updates dictionary.
            This would imply that computing their gradient changes
            their value, thus making the gradient value outdated.
        """

        try:
            cost = self.expr(model=model, data=data, **kwargs)
        except TypeError, e:
            # If anybody knows how to add type(self) to the exception message
            # but still preserve the stack trace, please do so
            # The current code does neither
            e.message += " while calling " + str(type(self)) + ".expr"
            logger.error(type(self))
            logger.error(e.message)
            raise e

        if cost is None:
            raise NotImplementedError(
                str(type(self)) + " represents an intractable cost and "
                "does not provide a gradient "
                "approximation scheme.")

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        return gradients, updates
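
# A minimal illustrative subclass (hypothetical, not part of the library): it
# delegates expr() to a wrapped cost but rescales the resulting gradients,
# returning the (gradients, updates) pair in the format documented above.
# `base_cost` is any Cost instance; the empty updates dictionary from the
# wrapped cost signals that no shared variables need to change when these
# gradients are computed.
class ScaledGradientCost(Cost):
    def __init__(self, base_cost, scale=0.5):
        self.base_cost = base_cost
        self.supervised = base_cost.supervised
        self.scale = scale

    def expr(self, model, data, **kwargs):
        # Same objective value as the wrapped cost.
        return self.base_cost.expr(model, data, **kwargs)

    def get_gradients(self, model, data, **kwargs):
        # Reuse the wrapped cost's gradients, scaling each one.
        gradients, updates = self.base_cost.get_gradients(model, data, **kwargs)
        gradients = OrderedDict((param, self.scale * grad)
                                for param, grad in gradients.iteritems())
        return gradients, updates

# e.g. grads, updates = ScaledGradientCost(some_cost).get_gradients(model, data)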