def get_lr_scalers(self):
        """
        Collect the learning-rate scalers reported by the layers.

        The layers in ``self.layers[:-1]`` are visited first, followed by
        the items obtained by iterating over ``self.layers[-1]``.

        Returns
        -------
        OrderedDict
            Mapping from parameter to its float learning-rate scaler.
        """
        merged = OrderedDict()

        known_params = self.get_params()

        def absorb(layer_seq):
            # Fold the scalers of every layer in `layer_seq` into `merged`.
            for member in layer_seq:
                scalers = member.get_lr_scalers()

                assert isinstance(scalers, OrderedDict)
                # A parameter may be scaled by a single layer only
                assert not any([name in merged for name in scalers])
                # Only genuine model parameters may be scaled
                assert all([name in known_params for name in scalers])

                merged.update(scalers)

        absorb(self.layers[:-1])
        # NOTE(review): the last entry of self.layers is itself iterated,
        # so it is presumably a collection of layers - confirm.
        absorb(self.layers[-1])

        assert all([isinstance(scale, float) for scale in merged.values()])

        return merged
示例#2
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from a Gibbs chain that starts at a shared variable.

        A zero-filled shared variable of shape ``(self.chain_num,
        model.n_vis)`` seeds ``self.k`` scan steps of ``model.gibbs_vhv``.
        The last visible sample is registered as the update of that shared
        variable.

        Parameters
        ----------
        model : object
            Must provide `n_vis`, `gibbs_vhv`, `free_energy`, `get_params`.
        data : tensor-like
            Batch whose mean free energy enters the cost.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates returned by scan, including the chain update.
        """
        initial_state = numpy.zeros(shape=(self.chain_num, model.n_vis),
                                    dtype=theano.config.floatX)
        chain_start = theano.shared(initial_state, name='chain_start',
                                    borrow=True)

        # Only the last scan output (the one fed back through
        # outputs_info) is needed afterwards.
        (_, _, _, _, _, vis_samples), scan_updates = theano.scan(
            fn=model.gibbs_vhv,
            sequences=None,
            outputs_info=[None, None, None, None, None, chain_start],
            non_sequences=None,
            n_steps=self.k)

        chain_end = vis_samples[-1]
        scan_updates[chain_start] = chain_end

        data_term = model.free_energy(data).mean()
        chain_term = model.free_energy(chain_end).mean()
        cost = data_term - chain_term

        params = list(model.get_params())

        # The data and the chain end are treated as constants when
        # differentiating.
        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[data, chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        updates.update(scan_updates)

        return gradients, updates
示例#3
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from a ``self.k``-step Gibbs chain seeded with `data`.

        Parameters
        ----------
        model : object
            Must provide `gibbs_vhv`, `free_energy` and `get_params`.
        data : tensor-like
            Batch used both as the chain start and as the positive sample.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates returned by scan.
        """
        # Only the last scan output (fed back through outputs_info) is used.
        (_, _, _, _, _, vis_samples), scan_updates = theano.scan(
            fn=model.gibbs_vhv,
            sequences=None,
            outputs_info=[None, None, None, None, None, data],
            non_sequences=None,
            n_steps=self.k)
        chain_end = vis_samples[-1]

        data_term = model.free_energy(data).mean()
        chain_term = model.free_energy(chain_end).mean()
        cost = data_term - chain_term

        params = list(model.get_params())

        # Both ends of the chain are held constant when differentiating.
        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[data, chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        updates.update(scan_updates)

        return gradients, updates
示例#4
0
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return monitoring channels for this layer.

        The result holds min/mean/max of the kernel norms of the 4-D weight
        tensor, followed by the channels of the parent class and the
        channels reported by ``self.nonlin``.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        W, = self.transformer.get_params()

        assert W.ndim == 4

        # One L2 norm per slice along the first axis of W
        kernel_norms = T.sqrt(T.sqr(W).sum(axis=(1, 2, 3)))

        channels = OrderedDict()
        channels['kernel_norms_min'] = kernel_norms.min()
        channels['kernel_norms_mean'] = kernel_norms.mean()
        channels['kernel_norms_max'] = kernel_norms.max()

        parent_channels = super(
            CudNNElemwise, self).get_layer_monitoring_channels(
                state_below, state, targets)
        channels.update(parent_channels)

        nonlin_channels = self.nonlin.get_monitoring_channels_from_state(
            state, targets, cost_fn=self.cost)
        channels.update(nonlin_channels)

        return channels
示例#5
0
    def get_monitoring_channels(self, model, data, **kwargs):
        """
        Gather the monitoring channels of every cost in the sum.

        Besides the channels each cost reports, the value of each cost
        expression is exposed as ``term_<index>`` (suffixed with
        ``_<name>`` when the expression carries a name).

        Returns
        -------
        OrderedDict
            Channel name -> value.

        Raises
        ------
        TypeError
            Re-raised, after printing a diagnostic, when collecting a
            cost's channels fails with a TypeError.
        """
        self.get_data_specs(model)[0].validate(data)
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        collected = OrderedDict()
        for index, term in enumerate(self.costs):
            term_data = nested_data[index]
            try:
                collected.update(
                    term.get_monitoring_channels(model, term_data, **kwargs))
            except TypeError:
                print (
                    "SumOfCosts.get_monitoring_channels encountered "
                    "TypeError while calling " + str(type(term)) + ".get_monitoring_channels"
                )
                raise

            term_value = term.expr(model, term_data, **kwargs)
            if term_value is None:
                continue

            if hasattr(term_value, "name") and term_value.name is not None:
                suffix = "_" + term_value.name
            else:
                suffix = ""
            collected["term_" + str(index) + suffix] = term_value

        return collected
示例#6
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from a Gibbs chain that starts at a shared variable.

        A zero-filled shared variable of shape ``(self.chain_num,
        model.n_vis)`` is pushed through ``self.k`` calls of
        ``model.gibbs_vhv``; the resulting sample is registered as the
        update of that shared variable.

        Parameters
        ----------
        model : object
            Must provide `n_vis`, `gibbs_vhv`, `free_energy`, `get_params`.
        data : tensor-like
            Batch whose mean free energy enters the cost.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            Contains the update of the chain's shared variable.
        """
        # The buffer is created with floatX so that its type matches the
        # chain sample assigned to it in `updates`; numpy.zeros alone would
        # always produce float64.
        chain_start = theano.shared(
            numpy.zeros(shape=(self.chain_num, model.n_vis),
                        dtype=theano.config.floatX),
            name=None, borrow=True)
        v_samples = chain_start

        for _ in xrange(self.k):
            # The last element returned by gibbs_vhv is fed into the next step
            v_samples = model.gibbs_vhv(v_samples)[-1]
        chain_end = v_samples

        chain_updates = {}
        chain_updates[chain_start] = chain_end

        pos_v = data

        cost = -(- model.free_energy(pos_v).mean() + model.free_energy(chain_end).mean())

        params = list(model.get_params())

        # The chain end is held constant when differentiating.
        grads = T.grad(cost, params, disconnected_inputs = 'ignore', consider_constant=[chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()

        updates.update(chain_updates)

        return gradients, updates
示例#7
0
    def get_monitoring_channels(self, model, data, ** kwargs):
        """
        Gather the monitoring channels of every cost in the sum.

        Besides the channels each cost reports, the value of each cost
        expression is exposed as ``term_<index>`` (suffixed with
        ``_<name>`` when the expression carries a name).

        Returns
        -------
        OrderedDict
            Channel name -> value.

        Raises
        ------
        TypeError
            Re-raised, after logging an error, when collecting a cost's
            channels fails with a TypeError.
        """
        self.get_data_specs(model)[0].validate(data)
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)

        collected = OrderedDict()
        for index, term in enumerate(self.costs):
            term_data = nested_data[index]
            try:
                collected.update(
                    term.get_monitoring_channels(model, term_data, **kwargs))
            except TypeError:
                message = ('SumOfCosts.get_monitoring_channels encountered '
                           'TypeError while calling {0}'
                           '.get_monitoring_channels')
                logger.error(message.format(type(term)))
                raise

            term_value = term.expr(model, term_data, ** kwargs)
            if term_value is None:
                continue

            if hasattr(term_value, 'name') and term_value.name is not None:
                suffix = '_' + term_value.name
            else:
                suffix = ''
            collected['term_' + str(index) + suffix] = term_value

        return collected
    def get_lr_scalers(self):
        """
        Collect the learning-rate scalers reported by the layers.

        The layers in ``self.layers[:-1]`` are visited first, followed by
        the items obtained by iterating over ``self.layers[-1]``.

        Returns
        -------
        OrderedDict
            Mapping from parameter to its float learning-rate scaler.
        """
        merged = OrderedDict()

        known_params = self.get_params()

        def absorb(layer_seq):
            # Fold the scalers of every layer in `layer_seq` into `merged`.
            for member in layer_seq:
                scalers = member.get_lr_scalers()

                assert isinstance(scalers, OrderedDict)
                # A parameter may be scaled by a single layer only
                assert not any([name in merged for name in scalers])
                # Only genuine model parameters may be scaled
                assert all([name in known_params for name in scalers])

                merged.update(scalers)

        absorb(self.layers[:-1])
        # NOTE(review): the last entry of self.layers is itself iterated,
        # so it is presumably a collection of layers - confirm.
        absorb(self.layers[-1])

        assert all([isinstance(scale, float) for scale in merged.values()])

        return merged
示例#9
0
    def get_monitoring_channels(self, model, X, Y=None, **kwargs):
        """
        Gather the monitoring channels of every cost in the sum.

        Besides the channels each cost reports, the value of each cost
        is exposed as ``term_<index>`` (suffixed with ``_<name>`` when the
        value carries a name). Unsupervised costs are evaluated with
        ``Y=None``.

        Returns
        -------
        OrderedDict
            Channel name -> value.

        Raises
        ------
        ValueError
            If `Y` is None while ``self.supervised`` is true.
        TypeError
            Re-raised, after printing a diagnostic, when collecting a
            cost's channels fails with a TypeError.
        """
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " +
                             "costs in the sum are supervised costs")

        rval = OrderedDict()

        for i, cost in enumerate(self.costs):
            try:
                rval.update(cost.get_monitoring_channels(
                    model, X, Y, **kwargs))
            except TypeError:
                # Single parenthesized argument: prints the same text under
                # Python 2 and is valid syntax under Python 3.
                print('SumOfCosts.get_monitoring_channels encountered TypeError while calling '
                      + str(type(cost)) + '.get_monitoring_channels')
                raise

            Y_to_pass = Y if cost.supervised else None

            value = cost(model, X, Y_to_pass, **kwargs)
            if value is not None:
                name = ''
                if hasattr(value, 'name') and value.name is not None:
                    name = '_' + value.name
                rval['term_' + str(i) + name] = value

        return rval
示例#10
0
文件: cost.py 项目: alito/pylearn2
    def get_monitoring_channels(self, model, X, Y=None, ** kwargs):
        """
        Gather the monitoring channels of every cost in the sum.

        Besides the channels each cost reports, the value of each cost
        is exposed as ``term_<index>`` (suffixed with ``_<name>`` when the
        value carries a name). Unsupervised costs are evaluated with
        ``Y=None``.

        Returns
        -------
        OrderedDict
            Channel name -> value.

        Raises
        ------
        ValueError
            If `Y` is None while ``self.supervised`` is true.
        TypeError
            Re-raised, after printing a diagnostic, when collecting a
            cost's channels fails with a TypeError.
        """
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " +
                             "costs in the sum are supervised costs")

        rval = OrderedDict()

        for i, cost in enumerate(self.costs):
            try:
                rval.update(cost.get_monitoring_channels(model, X, Y, **kwargs))
            except TypeError:
                # Single parenthesized argument: prints the same text under
                # Python 2 and is valid syntax under Python 3.
                print('SumOfCosts.get_monitoring_channels encountered TypeError while calling '
                      + str(type(cost)) + '.get_monitoring_channels')
                raise

            Y_to_pass = Y if cost.supervised else None

            value = cost(model, X, Y_to_pass, ** kwargs)
            if value is not None:
                name = ''
                if hasattr(value, 'name') and value.name is not None:
                    name = '_' + value.name
                rval['term_' + str(i) + name] = value

        return rval
示例#11
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Combine the gradients and updates of every cost in the sum.

        Each cost receives its own portion of `data` (obtained through the
        composite mapping). Its gradients are weighted by the matching
        entry of ``self.coeffs`` and accumulated per parameter.

        Returns
        -------
        grads : OrderedDict
            Parameter -> weighted sum of the individual gradients.
        updates : OrderedDict
            Union of the updates returned by the individual costs.

        Raises
        ------
        ValueError
            If a cost returns a gradient for something that is not a model
            parameter.
        """
        composite_specs, mapping = self.get_composite_specs_and_mapping(model)
        nested_data = mapping.nest(data)
        indiv_results = [cost.get_gradients(model, cost_data, ** kwargs)
                         for cost, cost_data
                         in safe_zip(self.costs, nested_data)]

        grads = OrderedDict()
        updates = OrderedDict()
        params = model.get_params()

        for coeff, (g, u) in zip(self.coeffs, indiv_results):
            # Validate every key before accumulating anything
            for param in g:
                if param not in params:
                    raise ValueError("A shared variable (" +
                                     str(param) +
                                     ") that is not a parameter appeared "
                                     "in a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            # Updates of different costs must not collide, nor touch params
            assert not any([state in updates for state in u])
            assert not any([state in params for state in u])
            updates.update(u)

        return grads, updates
示例#12
0
文件: hinge.py 项目: baucheng/facedet
    def get_monitoring_channels_from_state(self, state, target=None):
        """
        Deprecated monitoring hook; emits a warning on every call.

        Returns min/mean/max of the row and column norms of ``self.W`` and
        of the per-row maximum of `state`. When `target` is given, the
        misclassification rate and ``self.cost`` (stored as ``'nll'``) are
        added.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        warnings.warn("Layer.get_monitoring_channels_from_state is "
                      "deprecated. Use get_layer_monitoring_channels "
                      "instead. Layer.get_monitoring_channels_from_state "
                      "will be removed on or after september 24th 2014",
                      stacklevel=2)

        # Channels that only depend on the weights
        weights = self.W
        assert weights.ndim == 2
        squared = T.sqr(weights)
        row_norms = T.sqrt(squared.sum(axis=1))
        col_norms = T.sqrt(squared.sum(axis=0))

        channels = OrderedDict()
        channels['row_norms_min'] = row_norms.min()
        channels['row_norms_mean'] = row_norms.mean()
        channels['row_norms_max'] = row_norms.max()
        channels['col_norms_min'] = col_norms.min()
        channels['col_norms_mean'] = col_norms.mean()
        channels['col_norms_max'] = col_norms.max()

        # Channels that depend on the state
        row_max = state.max(axis=1)
        channels['mean_max_class'] = row_max.mean()
        channels['max_max_class'] = row_max.max()
        channels['min_max_class'] = row_max.min()

        if target is None:
            return channels

        predicted = T.argmax(state, axis=1)
        expected = T.argmax(target, axis=1)
        error_rate = T.neq(expected, predicted).mean()
        channels['misclass'] = T.cast(error_rate, config.floatX)
        channels['nll'] = self.cost(Y_hat=state, Y=target)

        return channels
示例#13
0
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return monitoring channels for this layer.

        The result holds min/mean/max of the kernel norms of the 4-D weight
        tensor, followed by the channels of the parent class and the
        channels reported by ``self.nonlin``.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        W, = self.transformer.get_params()

        assert W.ndim == 4

        # One L2 norm per slice along the first axis of W
        kernel_norms = T.sqrt(T.sqr(W).sum(axis=(1, 2, 3)))

        channels = OrderedDict()
        channels['kernel_norms_min'] = kernel_norms.min()
        channels['kernel_norms_mean'] = kernel_norms.mean()
        channels['kernel_norms_max'] = kernel_norms.max()

        # NOTE(review): this goes through the parent's
        # get_monitoring_channels_from_state, not its
        # get_layer_monitoring_channels, so `state_below` is not forwarded
        # - confirm this is intended.
        parent_channels = super(
            CorrMMElemwise, self).get_monitoring_channels_from_state(
                state, targets)
        channels.update(parent_channels)

        nonlin_channels = self.nonlin.get_monitoring_channels_from_state(
            state, targets, cost_fn=self.cost)
        channels.update(nonlin_channels)

        return channels
示例#14
0
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return monitoring channels for the class/cluster weights and state.

        Without an affine transform (``self.no_affine``) no channels are
        produced. Otherwise the result holds min/mean/max of the row and
        column norms of ``self.W_cluster`` and ``self.W_class``. When a
        state can be obtained, statistics of the per-row maximum of the
        class probabilities are added, and with `targets` also ``'nll'``,
        ``'perplexity'`` and ``'entropy'``.

        Parameters
        ----------
        state_below : tensor-like, optional
            Input of the layer; used to compute `state` when it is missing.
        state : tuple, optional
            Pair ``(probclass, probcluster)``.
        targets : tensor-like, optional
            Targets. The default is None: the previous default was the
            ``NotImplementedError`` class, which made the
            ``targets is not None`` test always succeed and handed the
            exception class to ``fprop`` and ``cost``.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
            ('class_row_norms_min', row_norms_class.min()),
            ('class_row_norms_mean', row_norms_class.mean()),
            ('class_row_norms_max', row_norms_class.max()),
            ('class_col_norms_min', col_norms_class.min()),
            ('class_col_norms_mean', col_norms_class.mean()),
            ('class_col_norms_max', col_norms_class.max()),
        ])

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below, targets)
            probclass, probcluster = state
            mx = probclass.max(axis=1)
            rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                     ('max_max_class', mx.max()),
                                     ('min_max_class', mx.min())]))
            if targets is not None:
                rval['nll'] = self.cost(Y=targets,
                                        Y_hat=(probclass, probcluster))
                rval['perplexity'] = 10 ** (rval['nll'] / np.log(10).astype('float32'))
                rval['entropy'] = rval['nll'] / np.log(2).astype('float32')
        return rval
示例#15
0
    def get_gradients(self, model, data, **kwargs):
        """
        Differentiate ``self._cost`` with respect to the model parameters.

        The sampler's particles are held constant when differentiating.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates reported by ``self.sampler.updates()``.
        """
        objective = self._cost(model, data, **kwargs)
        wrt = list(model.get_params())

        derivatives = T.grad(objective, wrt,
                             disconnected_inputs="ignore",
                             consider_constant=[self.sampler.particles])
        gradients = OrderedDict(izip(wrt, derivatives))

        updates = OrderedDict()
        updates.update(self.sampler.updates())

        return gradients, updates
示例#16
0
文件: mlp.py 项目: ecastrow/nice
    def get_layer_monitoring_channels(self,
                                      state_below=None,
                                      state=None,
                                      targets=None):
        """
        Return statistics of ``S = exp(-self.D)``.

        The channels are the standard deviation and mean of ``S`` and the
        number of entries of ``S`` lying more than one, respectively two,
        standard deviations above the mean. None of the arguments is used.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        S = T.exp(-self.D)

        channels = OrderedDict()
        channels['S_stddev'] = S.std()
        channels['S_mean'] = S.mean()
        # Multiplying by 1.0 turns the counts into floating point values
        channels['S_over_1_stdev'] = 1.0 * (S > (S.mean() + S.std())).sum()
        channels['S_over_2_stdev'] = 1.0 * (S > (S.mean() + 2 * S.std())).sum()

        return channels
示例#17
0
    def get_gradients(self, model, data, **kwargs):
        """
        Differentiate ``self._cost`` with respect to the model parameters.

        The sampler's particles are held constant when differentiating.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates reported by ``self.sampler.updates()``.
        """
        objective = self._cost(model, data, **kwargs)
        wrt = list(model.get_params())

        derivatives = T.grad(objective, wrt,
                             disconnected_inputs='ignore',
                             consider_constant=[self.sampler.particles])
        gradients = OrderedDict(izip(wrt, derivatives))

        updates = OrderedDict()
        updates.update(self.sampler.updates())

        return gradients, updates
示例#18
0
    def get_layer_monitoring_channels(self,
                                      state_below=None,
                                      state=None,
                                      targets=None):
        """
        Return monitoring channels derived from the layer state.

        When neither `state` nor `state_below` is given the result is
        empty. Otherwise it holds statistics of the per-row maximum of the
        state and, with `targets`, the misclassification rate and
        ``self.cost`` (stored as ``'nll'``).

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        # Weight-norm channels (row/column norms of W) are not reported by
        # this layer; only state-dependent channels are produced.
        channels = OrderedDict()

        if state_below is None and state is None:
            return channels

        if state is None:
            state = self.fprop(state_below)

        row_max = state.max(axis=1)
        channels['mean_max_class'] = row_max.mean()
        channels['max_max_class'] = row_max.max()
        channels['min_max_class'] = row_max.min()

        if targets is not None:
            predicted = self.target_convert(T.argmax(state, axis=1))
            # Assume target is in [0,1] as binary one-hot
            expected = self.target_convert(T.argmax(targets, axis=1))
            error_rate = T.neq(expected, predicted).mean()
            channels['misclass'] = T.cast(error_rate, config.floatX)
            channels['nll'] = self.cost(Y_hat=state, Y=targets)

        return channels
示例#19
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Combine the gradients of the two costs returned by the model.

        ``model.cost_from_X(data)`` yields two costs (named cost_cd and
        cost_ci here). Their gradients are weighted by ``self.coeff_cd``
        and ``self.coeff_ci`` and summed per parameter. When
        ``self.zero_ci_grad_for_cd`` is set, ``M`` and ``m`` of the last
        layer are held constant while differentiating the first cost.

        Returns
        -------
        grads : OrderedDict
            Parameter -> weighted sum of both gradients.
        updates : OrderedDict
            Always empty here; neither cost contributes updates.

        Raises
        ------
        ValueError
            If a gradient key is not a model parameter.
        """
        cost_cd, cost_ci = model.cost_from_X(data)
        # Fetched once and reused for the membership checks below
        params_dict = model.get_params()
        params = list(params_dict)

        zero_grads = []
        if self.zero_ci_grad_for_cd:
            #how to get this in less explicit way, i.e. using only dict?
            # Parenthesized so the line is valid under Python 2 and 3
            print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
            assert model.layers[-1].M in params_dict
            assert model.layers[-1].m in params_dict
            zero_grads = [model.layers[-1].M, model.layers[-1].m]

        grads_cd = T.grad(cost_cd, params, disconnected_inputs = 'ignore', consider_constant=zero_grads)
        grads_ci = T.grad(cost_ci, params, disconnected_inputs = 'ignore')

        indiv_results = [
            (OrderedDict(izip(params, grads_cd)), OrderedDict()),
            (OrderedDict(izip(params, grads_ci)), OrderedDict()),
        ]

        grads = OrderedDict()
        updates = OrderedDict()

        for coeff, (g, u) in zip([self.coeff_cd, self.coeff_ci], indiv_results):
            # Validate every key before accumulating anything
            for param in g:
                if param not in params_dict:
                    raise ValueError("A shared variable ("+str(param)+") that is not a parameter appeared in a cost gradient dictionary.")
            for param in g:
                assert param.ndim == g[param].ndim
                v = coeff * g[param]
                if param not in grads:
                    grads[param] = v
                else:
                    grads[param] = grads[param] + v
                assert grads[param].ndim == param.ndim
            assert not any([state in updates for state in u])
            assert not any([state in params_dict for state in u])
            updates.update(u)

        return grads, updates
示例#20
0
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return monitoring channels derived from the layer state.

        When neither `state` nor `state_below` is given the result is
        empty. Otherwise it holds statistics of the per-row maximum of the
        state and, with `targets`, the misclassification rate and
        ``self.cost`` (stored as ``'nll'``).

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        # Weight-norm channels (row/column norms of W) are not reported by
        # this layer; only state-dependent channels are produced.
        channels = OrderedDict()

        if state_below is None and state is None:
            return channels

        if state is None:
            state = self.fprop(state_below)

        row_max = state.max(axis=1)
        channels['mean_max_class'] = row_max.mean()
        channels['max_max_class'] = row_max.max()
        channels['min_max_class'] = row_max.min()

        if targets is not None:
            predicted = self.target_convert(T.argmax(state, axis=1))
            # Assume target is in [0,1] as binary one-hot
            expected = self.target_convert(T.argmax(targets, axis=1))
            error_rate = T.neq(expected, predicted).mean()
            channels['misclass'] = T.cast(error_rate, config.floatX)
            channels['nll'] = self.cost(Y_hat=state, Y=targets)

        return channels
示例#21
0
文件: mlp.py 项目: rahul003/pylearn2
    def get_layer_monitoring_channels(self, state_below=None,
                                    state=None, targets=None):
        """
        Return monitoring channels for the weights and the layer state.

        Unless ``self.no_affine`` is set, the result holds min/mean/max of
        the row and column norms of ``self.W``. When a state can be
        obtained, statistics of its per-row maximum are added, and with
        `targets` also ``'misclass'``, ``'nll'`` and ``'perplexity'``.

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        rval = OrderedDict()

        # channels that does not require state information
        # The weight channels are skipped when no_affine is set, so that
        # self.W is not read in that case. Previously an empty dict was
        # assigned and then immediately overwritten by the code below.
        if not self.no_affine:
            W = self.W

            assert W.ndim == 2

            sq_W = T.sqr(W)

            row_norms = T.sqrt(sq_W.sum(axis=1))
            col_norms = T.sqrt(sq_W.sum(axis=0))

            rval.update(OrderedDict([('row_norms_min',  row_norms.min()),
                                     ('row_norms_mean', row_norms.mean()),
                                     ('row_norms_max',  row_norms.max()),
                                     ('col_norms_min',  col_norms.min()),
                                     ('col_norms_mean', col_norms.mean()),
                                     ('col_norms_max',  col_norms.max()), ]))

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                     ('max_max_class', mx.max()),
                                     ('min_max_class', mx.min())]))

            if targets is not None:
                y_hat = T.argmax(state, axis=1)
                y = T.argmax(targets, axis=1)
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)
                rval['perplexity'] = 2 ** (rval['nll'] / T.log(2))

        return rval
示例#22
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from a ``self.k``-step Gibbs chain seeded with `data`.

        Parameters
        ----------
        model : object
            Must provide `gibbs_vhv`, `free_energy` and `get_params`.
        data : tensor-like
            Batch used both as the chain start and as the positive sample.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates returned by scan.
        """
        # Only the last scan output (fed back through outputs_info) is used.
        (_, _, _, v_sample), scan_updates = theano.scan(
            fn=model.gibbs_vhv,
            sequences=None,
            outputs_info=[None, None, None, data],
            non_sequences=None,
            n_steps=self.k)
        chain_end = v_sample[-1]

        data_term = model.free_energy(data).mean()
        chain_term = model.free_energy(chain_end).mean()
        cost = data_term - chain_term

        params = list(model.get_params())

        # Both ends of the chain are held constant when differentiating.
        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[data, chain_end])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        updates.update(scan_updates)

        return gradients, updates
示例#23
0
    def get_gradients(self, model, X, Y=None, **kwargs):
        """
        Combine the gradients and updates of every cost in the sum.

        Supervised costs receive `Y`, all others receive None. Each cost's
        gradients are weighted by the matching entry of ``self.coeffs`` and
        accumulated per parameter.

        Returns
        -------
        grads : OrderedDict
            Parameter -> weighted sum of the individual gradients.
        updates : OrderedDict
            Union of the updates returned by the individual costs.

        Raises
        ------
        ValueError
            If `Y` is None while ``self.supervised`` is true, or if a cost
            returns a gradient for something that is not a model parameter.
        """
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " +
                             "costs in the sum are supervised costs")

        per_cost = []
        for term in self.costs:
            term_targets = Y if term.supervised else None
            per_cost.append(
                term.get_gradients(model, X, term_targets, **kwargs))

        total_grads = OrderedDict()
        total_updates = OrderedDict()

        model_params = model.get_params()

        for weight, (term_grads, term_updates) in zip(self.coeffs, per_cost):
            # Reject foreign keys before accumulating anything
            for key in term_grads:
                if key not in model_params:
                    raise ValueError(
                        "A shared variable (" + str(key) +
                        ") that is not a parameter appeared in a cost gradient dictionary."
                    )
            for key in term_grads:
                assert key.ndim == term_grads[key].ndim
                scaled = weight * term_grads[key]
                if key in total_grads:
                    total_grads[key] = total_grads[key] + scaled
                else:
                    total_grads[key] = scaled
                assert total_grads[key].ndim == key.ndim
            # Updates must neither collide with each other nor hit params
            assert not any([st in total_updates for st in term_updates])
            assert not any([st in model_params for st in term_updates])
            total_updates.update(term_updates)

        return total_grads, total_updates
示例#24
0
文件: cost.py 项目: alito/pylearn2
    def get_gradients(self, model, X, Y=None, ** kwargs):
        """
        Combine the gradients and updates of every cost in the sum.

        Supervised costs receive `Y`, all others receive None. Each cost's
        gradients are weighted by the matching entry of ``self.coeffs`` and
        accumulated per parameter.

        Returns
        -------
        grads : OrderedDict
            Parameter -> weighted sum of the individual gradients.
        updates : OrderedDict
            Union of the updates returned by the individual costs.

        Raises
        ------
        ValueError
            If `Y` is None while ``self.supervised`` is true, or if a cost
            returns a gradient for something that is not a model parameter.
        """
        if Y is None and self.supervised:
            raise ValueError("no targets provided while some of the " +
                             "costs in the sum are supervised costs")

        per_cost = []
        for term in self.costs:
            term_targets = Y if term.supervised else None
            per_cost.append(
                term.get_gradients(model, X, term_targets, ** kwargs))

        total_grads = OrderedDict()
        total_updates = OrderedDict()

        model_params = model.get_params()

        for weight, (term_grads, term_updates) in zip(self.coeffs, per_cost):
            # Reject foreign keys before accumulating anything
            for key in term_grads:
                if key not in model_params:
                    raise ValueError("A shared variable ("+str(key)+") that is not a parameter appeared in a cost gradient dictionary.")
            for key in term_grads:
                assert key.ndim == term_grads[key].ndim
                scaled = weight * term_grads[key]
                if key in total_grads:
                    total_grads[key] = total_grads[key] + scaled
                else:
                    total_grads[key] = scaled
                assert total_grads[key].ndim == key.ndim
            # Updates must neither collide with each other nor hit params
            assert not any([st in total_updates for st in term_updates])
            assert not any([st in model_params for st in term_updates])
            total_updates.update(term_updates)

        return total_grads, total_updates
示例#25
0
文件: crbm.py 项目: zanghu/MyDNNCodes
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from the energies at both ends of a Gibbs chain.

        ``self.k`` scan steps of ``model.gibbs_vhv`` are run starting from
        `data`. The cost is the mean energy of the data paired with the
        first step's hidden mean, minus the mean energy of the final
        visible sample paired with the first element returned by
        ``model.sample_hp_given_v`` for it.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates returned by scan.
        """
        # Only the first and the last scan outputs are used afterwards.
        (h_mean, _, _, _, _, vis_samples), scan_updates = theano.scan(
            fn=model.gibbs_vhv,
            sequences=None,
            outputs_info=[None, None, None, None, None, data],
            non_sequences=None,
            n_steps=self.k)

        start_h = h_mean[0]
        end_v = vis_samples[-1]
        end_h = model.sample_hp_given_v(v=end_v, sample=False)[0]

        start_term = model.energy(data, start_h).mean()
        end_term = model.energy(end_v, end_h).mean()
        cost = start_term - end_term

        params = list(model.get_params())

        # All four chain quantities are held constant when differentiating.
        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[data, start_h, end_v, end_h])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        updates.update(scan_updates)

        return gradients, updates
示例#26
0
    def get_gradients(self, model, data, ** kwargs):
        """
        Build gradients from the energies at both ends of a Gibbs chain.

        The CD algorithm approximates the gradient rather than
        differentiating directly, which is why get_gradients() is
        overridden here.

        Returns
        -------
        gradients : OrderedDict
            Maps each model parameter to its gradient expression.
        updates : OrderedDict
            The updates returned by scan.
        """
        # Only the first and the last scan outputs are used afterwards.
        (h_mean, _, _, v_sample), scan_updates = theano.scan(
            fn=model.gibbs_vhv,
            sequences=None,
            outputs_info=[None, None, None, data],
            non_sequences=None,
            n_steps=self.k)

        start_h = h_mean[0]
        end_v = v_sample[-1]
        end_h = model.propup(end_v)

        start_term = model.energy(data, start_h).mean()
        end_term = model.energy(end_v, end_h).mean()
        cost = start_term - end_term

        params = list(model.get_params())

        # All four chain quantities are held constant when differentiating.
        grads = T.grad(cost, params, disconnected_inputs='ignore',
                       consider_constant=[data, start_h, end_v, end_h])

        gradients = OrderedDict(izip(params, grads))

        updates = OrderedDict()
        updates.update(scan_updates)

        return gradients, updates
示例#27
0
文件: hinge.py 项目: baucheng/facedet
    def get_layer_monitoring_channels(self,
                                      state_below=None,
                                      state=None,
                                      targets=None):
        """
        Return monitoring channels for the weights and the layer state.

        The result always holds min/mean/max of the row and column norms
        of ``self.W``. When a state can be obtained, statistics of its
        per-row maximum are added, and with `targets` also the
        misclassification rate and ``self.cost`` (stored as ``'nll'``).

        Returns
        -------
        OrderedDict
            Channel name -> symbolic expression.
        """
        # Channels that only depend on the weights
        weights = self.W
        assert weights.ndim == 2
        squared = T.sqr(weights)
        row_norms = T.sqrt(squared.sum(axis=1))
        col_norms = T.sqrt(squared.sum(axis=0))

        channels = OrderedDict()
        channels['row_norms_min'] = row_norms.min()
        channels['row_norms_mean'] = row_norms.mean()
        channels['row_norms_max'] = row_norms.max()
        channels['col_norms_min'] = col_norms.min()
        channels['col_norms_mean'] = col_norms.mean()
        channels['col_norms_max'] = col_norms.max()

        if state_below is None and state is None:
            return channels

        if state is None:
            state = self.fprop(state_below)
        row_max = state.max(axis=1)
        channels['mean_max_class'] = row_max.mean()
        channels['max_max_class'] = row_max.max()
        channels['min_max_class'] = row_max.min()

        if targets is not None:
            predicted = T.argmax(state, axis=1)
            expected = T.argmax(targets, axis=1)
            error_rate = T.neq(expected, predicted).mean()
            channels['misclass'] = T.cast(error_rate, config.floatX)
            channels['nll'] = self.cost(Y_hat=state, Y=targets)
        return channels
示例#28
0
    def _get_givens_subset(self, subset, batch_slice):
        """
        This translates a batch slice of start and end indices into the actual data from the given subset.

        Parameters
        ----------
        subset : int
            The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST attributes.
        batch_slice : symbolic slice
            The symbolic slice to grab from the data.

        Returns
        -------
        OrderedDict or None
            The givens to provide to a function where it sets the input variable to the actual batch representation
            of data from the dataset: (input_variable: data[batch]). None (after logging a warning) when the
            dataset has no data for `subset`.

        Raises
        ------
        AssertionError
            If the model declares targets but the subset has no labels.
        """
        # translate the data_idx into the givens for the model
        # first get the lists of input variables the model requires - inputs and targets
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        # query the dataset once and reuse the result for both the check and the unpacking
        subset_contents = self.dataset.getSubset(subset)
        if subset_contents[0] is None:
            log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset))
            return None

        # grab the data and labels
        data, labels = subset_contents
        # create the givens for the input function as pairs of (input_variable: sliced_data)
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        # include labels as well if they are required by the model
        if model_targets is not None and len(model_targets) > 0:
            if labels is None:
                log.error("No labels in the dataset!")
                # call form of raise: valid under both Python 2 and Python 3
                raise AssertionError("No labels in the dataset!")
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))

        return givens
示例#29
0
class SGD_Optimizer():
    """
    Minibatch stochastic gradient descent (optionally with momentum) for a Theano graph.

    Constructing an instance immediately compiles two Theano functions:
    ``calc_cost`` (evaluates ``costs`` from ``inputs``, no updates) and ``f``
    (evaluates ``costs`` and applies the parameter updates; takes the inputs
    followed by the learning rate and, when momentum is enabled, the momentum rate).
    """
    def __init__(self,params,inputs,costs,updates_old=None,consider_constant=[],momentum=True):
        """
        params: parameters of the model (shared variables; ``get_value()`` is called on each)
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the objective.
        updates_old: OrderedDict from previous graphs that need to be accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to the grad method. Typically RBM.
                           (The list default is shared between calls but is never mutated here.)
        momentum: if True, one zero-initialised velocity variable is created per parameter.
        """
        # validate before len() so a non-list gets the explanatory message
        assert (isinstance(costs,list)), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        if self.momentum:
            self.params_mom = []
            for param in self.params:
                # unnamed shared variables have name None, which cannot be concatenated
                base_name = param.name if param.name is not None else 'anon_param'
                param_init = theano.shared(value=numpy.zeros(param.get_value().shape,dtype=theano.config.floatX),name=base_name+'_mom')
                self.params_mom.append(param_init)
        self.costs = costs
        self.num_costs = len(costs)
        self.updates_old = updates_old
        self.consider_constant = consider_constant
        self.build_train_fn()

    def build_train_fn(self,):
        """
        Build the symbolic update rules and compile ``self.calc_cost`` and ``self.f``.

        Without momentum each parameter p becomes ``p - lr * grad``.
        With momentum the velocity v becomes ``mom * v - lr * grad`` and p becomes ``p + v_new``.
        """
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        self.gparams = T.grad(self.costs[0],self.params,consider_constant=self.consider_constant)
        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano*j) for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param,param_mom,gparam in zip(self.params,self.params_mom,self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs,self.costs)
        if self.updates_old:
            # Work on a copy so the caller's dict (e.g. one owned by the model) is not modified.
            self.updates_old = copy.copy(self.updates_old)
        else:
            self.updates_old = OrderedDict()
        self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)

    def train(self,train_set,valid_set=None,learning_rate=0.1,num_epochs=500,save=False,output_folder=None,lr_update=None,mom_rate=0.9):
        """
        Run up to ``num_epochs`` passes over ``train_set``; Ctrl-C stops training cleanly.

        train_set / valid_set: objects whose ``iterate(True)`` yields one list of input
                               values per minibatch (the list is extended with the
                               learning rate and momentum rate before calling ``self.f``).
        learning_rate: initial learning rate.
        save: if True, parameters are pickled whenever the tracked cost improves.
        output_folder: directory for the pickle (current directory when falsy).
        lr_update: if truthy, ``update_lr(epoch, begin_anneal=1)`` is called after each epoch.
        mom_rate: momentum coefficient fed to the compiled function when momentum is enabled.

        Without a validation set the absolute mean training objective (cost 0) is
        tracked in ``self.best_cost``; otherwise ``perform_validation`` does the tracking.
        """
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        try:
            for u in xrange(num_epochs):
                cost = []
                for batch in self.train_set.iterate(True):
                    inputs = batch + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    cost.append(self.f(*inputs))
                mean_costs = numpy.mean(cost,axis=0)
                print('  Epoch %i   ' % (u+1))
                print('***Train Results***')
                for k in xrange(self.num_costs):
                    print("Cost %i: %f" % (k, mean_costs[k]))

                if not valid_set:
                    # Track the objective only: comparing the whole cost vector
                    # against a scalar is ambiguous when there are several costs.
                    this_cost = numpy.absolute(mean_costs[0])
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()

                if lr_update:
                    self.update_lr(u+1,begin_anneal=1)

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self,):
        """
        Evaluate all costs on ``self.valid_set`` and keep the best parameters.

        The tracked metric is the absolute mean of cost index 1, so at least two
        costs are required; smaller is treated as better.
        NOTE(review): the original comment calls this metric "accuracy" while the
        comparison minimises it - presumably it is an error rate; confirm.
        """
        cost = []
        for batch in self.valid_set.iterate(True):
            cost.append(self.calc_cost(*batch))
        mean_costs = numpy.mean(cost,axis=0)
        print('***Validation Results***')
        for k in xrange(self.num_costs):
            print("Cost %i: %f" % (k, mean_costs[k]))

        this_cost = numpy.absolute(numpy.mean(cost, axis=0))[1] #Using accuracy as metric
        if this_cost < self.best_cost:
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()

    def save_model(self,):
        """
        Pickle copies of the current parameter values to ``best_params.pickle``,
        inside ``self.output_folder`` when set (created if missing), otherwise in
        the current working directory.
        """
        best_params = [param.get_value().copy() for param in self.params]
        if not self.output_folder:
            save_path = 'best_params.pickle'
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder,'best_params.pickle')
        # binary mode and a context manager: the handle is always closed and the
        # pickle bytes are written untouched on every platform
        with open(save_path,'wb') as handle:
            cPickle.dump(best_params,handle)


    def update_lr(self,count,update_type='annealed',begin_anneal=500.,min_lr=0.01,decay_factor=1.2):
        """
        Recompute ``self.lr`` from ``self.init_lr``.

        'annealed':    lr = init_lr * min(1, begin_anneal / count)
        'exponential': lr = max(min_lr, init_lr / decay_factor ** count)
        Any other ``update_type`` leaves ``self.lr`` unchanged.
        """
        if update_type=='annealed':
            scale_factor = float(begin_anneal)/count
            self.lr = self.init_lr*min(1.,scale_factor)
        if update_type=='exponential':
            new_lr = float(self.init_lr)/(decay_factor**count)
            if new_lr < min_lr:
                self.lr = min_lr
            else:
                self.lr = new_lr
    def setup_training(self):
        """
        Build and compile every Theano function used during training.

        Compiled attributes, in compilation order: ``fprop_func``,
        ``grad_update_func``, ``training``, ``rmsprop_update``,
        ``momentum_update``, ``dimshuf_func``, ``grads_func`` and
        ``cost_function``.
        """
        default_cost = self.cnn.get_default_cost()
        specs = default_cost.get_data_specs(self.cnn)
        spec_mapping = DataSpecsMapping(specs)
        spaces = spec_mapping.flatten(specs[0], return_tuple=True)
        sources = spec_mapping.flatten(specs[1], return_tuple=True)

        # one symbolic float32 batch per (space, source) pair of the data specs
        symbolic_batches = []
        for space, source in safe_zip(spaces, sources):
            batch_name = '%s[%s]' % (self.__class__.__name__, source)
            batch = space.make_theano_batch(name=batch_name,
                                            batch_size=self.mini_batch_size)
            symbolic_batches.append(batch.astype("float32"))
        theano_args = tuple(symbolic_batches)

        # forward pass on the first symbolic argument, compiled for faster fprop
        prediction = self.cnn.fprop(theano_args[0])
        self.fprop_func = theano.function([theano_args[0]], prediction)

        # symbolic cost from the second symbolic argument and the prediction
        cost_expr = self.cnn.cost(theano_args[1], prediction)

        params = list(self.cnn.get_params())
        grads = T.grad(cost_expr, params, disconnected_inputs='ignore')

        # per-parameter lookup tables
        grad_of = OrderedDict(izip(params, grads))
        rms_of = OrderedDict(izip(params, self.rms_vals))
        momentum_of = OrderedDict(izip(params, self.momentum_vals))
        stored_grad_of = OrderedDict(izip(params, self.grad_vals))

        # Store the symbolic gradients into self.grad_vals so they are computed
        # once and then reused by the momentum, rmsprop and parameter updates.
        symbolic_grads = [grad_of[p] for p in params]
        store_grads = OrderedDict(dict(safe_zip(self.grad_vals, symbolic_grads)))
        self.grad_update_func = theano.function(theano_args,
                                                updates=store_grads,
                                                on_unused_input='ignore')

        # parameter step: rmsprop-scaled gradient step plus the momentum term
        stepped_params = []
        for p in params:
            stepped_params.append(
                p - self.learning_rate *
                (stored_grad_of[p] / T.sqrt(rms_of[p] + 1e-8)) +
                (self.momentum_step_size * momentum_of[p]))
        param_updates = OrderedDict(dict(safe_zip(params, stepped_params)))

        # rmsprop accumulator: 0.9 * previous value + 0.1 * squared gradient
        rms_targets = []
        for p in params:
            rms_targets.append((rms_of[p] * .9) +
                               (T.sqr(stored_grad_of[p]) * .1))
        rmsprop_updates = OrderedDict(dict(safe_zip(self.rms_vals, rms_targets)))

        self.training = theano.function([], updates=param_updates,
                                        on_unused_input='ignore')
        self.rmsprop_update = theano.function([], updates=rmsprop_updates,
                                              on_unused_input='ignore')

        # momentum accumulator: the same step that is applied to the parameters
        momentum_targets = []
        for p in params:
            momentum_targets.append(
                -self.learning_rate *
                (stored_grad_of[p] / T.sqrt(rms_of[p] + 1e-8)) +
                (self.momentum_step_size * momentum_of[p]))
        momentum_updates = OrderedDict(
            dict(safe_zip(self.momentum_vals, momentum_targets)))
        self.momentum_update = theano.function([], updates=momentum_updates,
                                               on_unused_input='ignore')

        # moves axis 0 of a 4-tensor to the end (used for the c01b format)
        four_d = T.tensor4()
        self.dimshuf_func = theano.function([four_d],
                                            four_d.dimshuffle(1, 2, 3, 0))

        # functions to get grads and costs for debugging
        self.grads_func = theano.function(theano_args, grads)
        self.cost_function = theano.function(theano_args, cost_expr)
示例#31
0
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):
        """
        Compile the Theano functions needed for batch gradient descent on
        `objective` with respect to `params`.

        Parameters
        ----------
        objective : tensor_like
            A theano expression to be minimized should be a function of \
            params and, if provided, inputs
        params : list
            A list of theano shared variables. These are the optimization \
            variables
        inputs : list, optional
            A list of theano variables to serve as inputs to the graph.
        param_constrainers : list
            A list of callables to be called on all updates dictionaries to \
            be applied to params. This is how you implement constrained \
            optimization.
        max_iter : int
            Stored on the instance; not used by the constructor itself.
        lr_scalers : dict, optional
            Maps a param to a multiplier applied to the step size alpha \
            for that param.
        verbose : int
            If true, progress messages are printed while compiling.
        tol : float, optional
            Stored as self.tol. If None, 1e-6 is used when objective.dtype \
            is "float32" and 3e-7 otherwise.
        init_alpha : sequence of float, optional
            Stored (as a tuple of floats) in self.init_alpha. If None, \
            defaults to (.001, .005, .01, .05, .1) when line_search_mode is \
            None and to (.5, 1.) when it is 'exhaustive'.
        min_init_alpha : float
            Stored on the instance; not used by the constructor itself.
        reset_alpha : bool
            If True, reverts to using init_alpha after each call. If False, \
            the final set of alphas is used at the start of the next call to \
            minimize.
        conjugate : bool
            If True, tries to pick conjugate gradient directions. For the \
            directions to be truly conjugate, you must use line_search_mode = \
            'exhaustive' and the objective function must be quadratic. \
            Using line_search_mode = 'exhaustive' on a non-quadratic \
            objective function implements nonlinear conjugate gradient descent.
        reset_conjugate : bool
            Has no effect unless conjugate == True. If reset_conjugate == \
            True, reverts to direction of steepest descent for the first \
            step in each call to minimize. Otherwise, tries to make the new \
            search direction conjugate to the last one (even though the \
            objective function might be totally different on each call to \
            minimize)
        gradients : dict, optional
            If None, compute the gradients of obj using T.grad otherwise, a \
            dictionary mapping from params to expressions for their gradients \
            (this allows you to use approximate gradients computed with \
            something other than T.grad). Params missing from the \
            dictionary still get their gradient computed automatically.
        gradient_updates : dict
            A dictionary of shared variable updates to run each time the \
            gradient is computed
        line_search_mode : str, optional
            Must be None or 'exhaustive' (asserted).
        accumulate : bool
            If True, the gradient and objective functions are built with \
            Accumulator instead of a plain compiled theano function.
        theano_function_mode : optional
            Passed as `mode` to every theano function compiled here.

        Notes
        -----
        Calling the ``minimize'' method with values for ``inputs'' will
        update ``params'' to minimize ``objective''.
        """

        # Store every constructor argument as an attribute of the same name.
        # The defaults filled in below for `inputs` and `param_constrainers`
        # only rebind the local names, so self.inputs and
        # self.param_constrainers keep the values that were passed in.
        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha  = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        # For each param: pick its symbolic gradient and allocate a shared
        # variable of the same shape that _compute_grad writes the gradient to.
        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ',t2-t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        # _cache_values snapshots every param into a cache variable;
        # _goto_alpha then sets each param to cache - alpha * stored gradient
        # (alpha is multiplied by lr_scalers[param] when one is given).
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # each constrainer is called on goto_updates before it is compiled
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        # _normalize_grad divides every stored gradient by the joint L2 norm
        # of all stored gradients and returns that norm.
        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # one "old gradient" shared variable per stored gradient
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            # stores g * norm into each old-gradient variable; `norm` is the
            # symbolic norm expression built above, passed as the input
            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
                for g in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            # the 1e-7 in the denominator guards against division by zero
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            # NOTE(review): `grad` is the function called earlier in this
            # method to build gradients, not a gradient variable, so this
            # assertion looks vacuous - confirm the intent.
            assert grad not in grad_to_old_grad

            # new direction for each stored gradient: g + beta * old_g
            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]

            # when the mode exposes a `record` attribute, log a description of
            # every update variable and expression to it
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        # default tolerance depends on the dtype of the objective
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
示例#32
0
    def train(self, continue_training=False):
        """
        Compile the training and monitor functions for the model, then train.

        One training function is compiled per cost returned by
        ``self.model.get_train_cost()`` (several costs allow layer-wise
        pretraining). Each function is then run epoch by epoch through
        ``self._perform_one_epoch`` until that method returns a true value or
        a KeyboardInterrupt arrives. Afterwards the best parameters (when
        ``self.best_params`` was set) are restored and the model is saved.

        VALID and TEST subsets are optional: when the dataset returns no data
        for one of them, no givens are built and no monitor function is
        compiled for that subset.

        :param continue_training: if False, the learning rate decay (when present)
            and the model's decay parameters are reset before each training function runs.
        :type continue_training: bool
        :raises AssertionError: if the model declares targets but an available subset has no labels.
        """
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes  = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self.get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self.get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self.get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # translate the data_idx into the givens for the model
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        def build_givens(subset):
            # Map the model's input (and target) variables to the sliced subset data.
            # Returns None when the dataset has no data for this subset, instead of
            # failing on None[batch_slice].
            data, labels = self.dataset.getSubset(subset)
            if data is None:
                log.debug("Dataset has no data for subset %s; not building givens for it.", str(subset))
                return None
            givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
            if model_targets is not None and len(model_targets) > 0:
                if labels is None:
                    log.error("No labels in the dataset!")
                    raise AssertionError("No labels in the dataset!")
                givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
            return givens

        train_givens = build_givens(TRAIN)
        valid_givens = build_givens(VALID)
        test_givens = build_givens(TEST)

        # Now time to create the training cost functions for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        self.train_functions = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable.
            # Merge into a fresh dict: if get_updates() hands back the model's own
            # dictionary, updating it in place would leak this cost's gradient
            # updates into later training functions and into the monitor functions.
            model_updates = self.model.get_updates()
            if model_updates:
                train_updates = OrderedDict(model_updates)
                train_updates.update(gradient_updates)
            else:
                train_updates = gradient_updates

            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_costs),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=[data_idx, data_end_idx],
                               updates=train_updates,
                               outputs=train_cost,
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            self.train_functions.append(f_learn)

        # grab the expression(s) to use to monitor different model values during training
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        self.monitors = OrderedDict(self.model.get_monitors())
        self.monitor_names = self.monitors.keys()

        def compile_monitor(givens, name):
            # all monitor functions share inputs/outputs and differ only in their data
            return function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=givens,
                name=name
            )

        if len(self.monitors) > 0:
            self.train_monitor_function = compile_monitor(train_givens, "train_monitor_function")
            # only compile monitors for subsets that actually have data
            if valid_givens is not None:
                self.valid_monitor_function = compile_monitor(valid_givens, "valid_monitor_function")
            if test_givens is not None:
                self.test_monitor_function = compile_monitor(test_givens, "test_monitor_function")
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        self.noise_switches = raise_to_list(self.model.get_noise_switch())

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was compiled above
        start_time = time.time()
        for func_i, train_function in enumerate(self.train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.hasSubset(VALID):
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.hasSubset(TEST):
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset the learning rate
                if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                    self.learning_rate_decay.reset()
                # reset the other model decaying functions
                for decay_param in self.model.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):
        """
        objective: a theano expression to be minimized
                       should be a function of params and,
                       if provided, inputs
            params: A list of theano shared variables.
                    These are the optimization variables
            inputs: (Optional) A list of theano variables
                    to serve as inputs to the graph.
            param_constrainers: (Optional) A list of callables
                    to be called on all updates dictionaries to
                    be applied to params. This is how you implement
                    constrained optimization.
            reset_alpha: If True, reverts to using init_alpha after
                        each call. If False, the final set of alphas
                        is used at the start of the next call to minimize.
            conjugate: If True, tries to pick conjugate gradient directions.
                       For the directions to be truly conjugate, you must use
                       line_search_mode = 'exhaustive' and the objective function
                       must be quadratic.
                       Using line_search_mode = 'exhaustive' on a non-quadratic objective
                       function implements nonlinear conjugate gradient descent.
            reset_conjugate:
                    has no effect unless conjugate == True
                    if reset_conjugate == True,
                        reverts to direction of steepest descent for the first
                        step in each call to minimize.
                    otherwise, tries to make the new search direction
                    conjugate to the last one (even though the objective function
                    might be totally different on each call to minimize)
            gradients: if None, compute the gradients of obj using T.grad
                    otherwise, a dictionary mapping from params to expressions
                    for their gradients (this allows you to use approximate
                    gradients computed with something other than T.grad)
            gradient_updates: a dictionary of shared variable updates to run
                each time the gradient is computed

            Calling the ``minimize'' method with values for
            for ``inputs'' will update ``params'' to minimize
            ``objective''.
        """

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ', t2 - t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
            1. - self.new_weight) * self.ave_grad_size

        self._normalize_grad = function(
            [],
            norm,
            updates=normalize_grad_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                                 'old_' + elem.name)

            self._store_old_grad = function(
                [norm],
                updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                     for g in grad_to_old_grad]),
                mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)
            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                      for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function(
                [],
                updates=make_conjugate_updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')

        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
示例#34
0
class Recurrent(Layer):
    """
    A recurrent neural network layer (hyperbolic tangent activation by
    default), passing on all hidden states or a selection of them to the
    next layer.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights in
        the uniform interval (-irange, irange). The hidden-to-hidden
        matrix weights are sampled in the same manner, unless the argument
        svd is set to True (see below).
    indices : list of integers, optional
        If specified this layer will return only the given hidden
        states instead of a SequenceSpace.
        Note: For now only [-1] is supported; any other value is rejected
        by `set_input_space`.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    svd : bool, optional
        If True, the hidden-to-hidden transition matrix is initialized
        with the orthogonal factor U of the singular value decomposition
        U*s*V of a (dim, dim) matrix of standard-normal samples; irange is
        then not used for that matrix. Defaults to True.
    nonlinearity : theano function, optional
        Defaults to tensor.tanh, the non-linearity to be applied to the
        hidden state after each update
    """
    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., svd=True, nonlinearity=tensor.tanh):
        self.rnn_friendly = True
        # Updates returned by scan in fprop; merged into the training
        # updates by _modify_updates.
        self._scan_updates = OrderedDict()
        # Store every constructor argument as an attribute of the same name
        # (locals() also contains `self`, which is removed right after).
        self.__dict__.update(locals())
        del self.self
        super(Recurrent, self).__init__()

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if (not isinstance(space, SequenceSpace) or
                not isinstance(space.space, VectorSpace)):
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) as input but received  %s instead"
                             % (space))
        self.input_space = space

        if self.indices is not None:
            # Selecting several hidden states is not implemented yet; the
            # only accepted selection is the last state.
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
            assert self.indices == [-1], "Only indices = [-1] works now"
            self.output_space = VectorSpace(dim=self.dim)
        else:
            self.output_space = SequenceSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")

        # U is the hidden-to-hidden transition matrix.
        # NOTE: the uniform draw is performed even when svd is True (and then
        # discarded) so that the random stream, and therefore the values drawn
        # for W below, stay the same for a given seed.
        U = rng.uniform(-self.irange, self.irange, (self.dim, self.dim))
        if self.svd:
            U = self.mlp.rng.randn(self.dim, self.dim)
            # Keep only the orthogonal left factor of the decomposition.
            U, _, _ = np.linalg.svd(U, full_matrices=True, compute_uv=True)

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange,
                        (self.input_space.dim, self.dim))

        # Parameter order (W, U, b) is relied upon by fprop and by the
        # monitoring channels.
        self._params = [sharedX(W, name=(self.layer_name + '_W')),
                        sharedX(U, name=(self.layer_name + '_U')),
                        sharedX(np.zeros(self.dim) + self.init_bias,
                                name=self.layer_name + '_b')]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        W, U, b = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict([('W_row_norms_min',  row_norms.min()),
                            ('W_row_norms_mean', row_norms.mean()),
                            ('W_row_norms_max',  row_norms.max()),
                            ('W_col_norms_min',  col_norms.min()),
                            ('W_col_norms_mean', col_norms.mean()),
                            ('W_col_norms_max',  col_norms.max()),
                            ('U_row_norms_min', u_row_norms.min()),
                            ('U_row_norms_mean', u_row_norms.mean()),
                            ('U_row_norms_max', u_row_norms.max()),
                            ('U_col_norms_min', u_col_norms.min()),
                            ('U_col_norms_mean', u_col_norms.mean()),
                            ('U_col_norms_max', u_col_norms.max())])

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)
            # A sequence state is a (data, mask) pair; only the data is
            # monitored.  When `indices` is set, fprop returns a bare
            # tensor, which must not be unpacked.  `state_below` is not
            # needed past this point, so it is left alone (it may be None
            # when the caller supplied `state` directly).
            if isinstance(state, (tuple, list)):
                state, _ = state

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            rval['range_x_max_u'] = rg.max()
            rval['range_x_mean_u'] = rg.mean()
            rval['range_x_min_u'] = rg.min()

            rval['max_x_max_u'] = mx.max()
            rval['max_x_mean_u'] = mx.mean()
            rval['max_x_min_u'] = mx.min()

            rval['mean_x_max_u'] = mean.max()
            rval['mean_x_mean_u'] = mean.mean()
            rval['mean_x_min_u'] = mean.min()

            rval['min_x_max_u'] = mn.max()
            rval['min_x_mean_u'] = mn.mean()
            rval['min_x_min_u'] = mn.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # When random variables are used in the scan function the updates
        # dictionary returned by scan might not be empty, and needs to be
        # added to the updates dictionary before compiling the training
        # function
        if any(key in updates for key in self._scan_updates):
            # Don't think this is possible, but let's check anyway
            raise ValueError("A single shared variable is being updated by "
                             "multiple scan functions")
        updates.update(self._scan_updates)

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        # The input is a (data, mask) pair.
        state_below, mask = state_below

        # z0 is the initial hidden state which is (batch size, output dim)
        z0 = tensor.alloc(np.cast[config.floatX](0), state_below.shape[1],
                          self.dim)
        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            z0 = tensor.unbroadcast(z0, 1)

        # Later we will add a noise function
        W, U, b = self._params

        # It is faster to do the input-to-hidden matrix multiplications
        # outside of scan
        state_below = tensor.dot(state_below, W) + b

        def fprop_step(state_below, mask, state_before, U):
            z = self.nonlinearity(state_below +
                                  tensor.dot(state_before, U))

            # Only update the state for non-masked data, otherwise
            # just carry on the previous state until the end
            z = mask[:, None] * z + (1 - mask[:, None]) * state_before
            return z

        z, updates = scan(fn=fprop_step, sequences=[state_below, mask],
                          outputs_info=[z0], non_sequences=[U])
        # Remember scan's updates so _modify_updates can hand them to the
        # training function.
        self._scan_updates.update(updates)

        if self.indices is not None:
            if len(self.indices) > 1:
                return [z[i] for i in self.indices]
            else:
                return z[self.indices[0]]
        else:
            return (z, mask)
示例#35
0
文件: SGD.py 项目: sidsig/RNADE
class SGD_Optimiser:
    """
    Minibatch stochastic gradient descent trainer for a Theano graph.

    The constructor compiles two functions: `calc_cost`, which evaluates
    the costs without touching the parameters, and `f`, which evaluates
    the costs and applies one SGD step (optionally with momentum).
    `train` then runs the epoch loop with optional validation-based early
    stopping, learning-rate scheduling and saving of the best parameters.
    """
    def __init__(self,params,inputs,costs,updates_old=None,consider_constant=[],momentum=False,patience=20,custom_grads=False,custom_grad_dict=None):
        """
        params: list containing the parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the objective.
        updates_old: OrderedDict from previous graphs that need to be accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to the grad method. Typically RBM.
        momentum: if True, keep one momentum buffer per parameter and expose a momentum-rate input.
        patience: number of consecutive non-improving validation epochs tolerated before training stops.
        custom_grads: if True, take gradients from custom_grad_dict instead of calling T.grad.
        custom_grad_dict: mapping from parameter name to its gradient expression (used when custom_grads is True).
        """
        # Validate first so that a wrong type is reported by this message
        # rather than by whatever len() happens to do with it.
        assert (isinstance(costs,list)), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        self.max_patience = patience
        self.patience = 0
        if self.momentum:
            # One zero-initialised momentum buffer per parameter.
            self.params_mom = []
            for param in self.params:
                param_init = theano.shared(value=numpy.zeros(param.get_value().shape,dtype=theano.config.floatX),)
                self.params_mom.append(param_init)
        self.costs = costs
        self.custom_grads = custom_grads
        self.custom_grad_dict = custom_grad_dict
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # NOTE: the default list is shared between calls; it is only read here.
        self.consider_constant = consider_constant
        self.build_train_fn()

    def build_train_fn(self,):
        """Build the update rules and compile `calc_cost` and `f`."""
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]
        if self.custom_grads:
            # Custom gradients are looked up by parameter name.
            self.gparams = [self.custom_grad_dict[param.name]
                            for param in self.params]
        else:
            self.gparams = T.grad(self.costs[0],self.params,consider_constant=self.consider_constant)

        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano*j) for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param,param_mom,gparam in zip(self.params,self.params_mom,self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs,self.costs)
        if self.updates_old:
            # Work on a copy so the caller's dictionary (which may belong to
            # the model) is not modified.  The original code read an unbound
            # local here and raised UnboundLocalError.
            self.updates_old = copy.copy(self.updates_old)
        else:
            self.updates_old = OrderedDict()
        self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)

    def _plot_train_costs(self,):
        """Plot the recorded per-epoch training costs against the epoch index."""
        epochs = list(xrange(len(self.train_costs)))
        costs = numpy.array(self.train_costs).reshape(-1)
        plot(epochs,costs)
        xlabel('epoch')
        ylabel('negative log-likelihood')
        title('Training on red wine dataset')

    def train(self,train_set,valid_set=None,learning_rate=0.1,num_epochs=500,save=False,output_folder=None,lr_update=True,
              mom_rate=0.9,update_type='linear',begin_anneal=50,start=2):
        """
        Run the training loop.

        train_set / valid_set: objects whose iterate(True) yields lists of
            input values matching `inputs`; valid_set is optional.
        learning_rate: initial learning rate.
        num_epochs: maximum number of passes over train_set.
        save: if True, pickle the parameters each time the tracked cost improves.
        output_folder: directory for the pickle (current directory if not set).
        lr_update: if True, call update_lr after every epoch.
        mom_rate: momentum rate fed to the compiled function when momentum is enabled.
        update_type, begin_anneal: forwarded to update_lr.
        start: 1-based epoch at which the learning-rate schedule starts.

        Training stops early on a NaN cost, when patience runs out during
        validation, or on KeyboardInterrupt.
        """
        print('Initializing training.')
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        self.stop_train = False
        self.train_costs = []
        self.valid_costs = []
        self.num_epochs = num_epochs
        self.start = start - 1 #subtracting one for zero index.
        try:
            for u in xrange(num_epochs):
                cost = []
                nan_batch = False
                for batch in self.train_set.iterate(True):
                    inputs = batch + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    # Evaluate the costs first so that no update is applied
                    # on a batch that already produces NaN.
                    cost_no_update = self.calc_cost(*batch)
                    # calc_cost returns one value per cost; reduce explicitly
                    # because the truth value of a multi-element array is
                    # ambiguous.
                    if numpy.any(numpy.isnan(cost_no_update)):
                        print('Cost was NaN for a particular batch!')
                        nan_batch = True
                        break
                    cost.append(self.f(*inputs))
                if nan_batch:
                    self._plot_train_costs()
                    if self.custom_grads:
                        savefig('cost_custom.png')
                    else:
                        savefig('cost_theano.png')
                    break
                mean_costs = numpy.mean(cost,axis=0)
                if numpy.isnan(mean_costs[0]):
                    print('Training cost is NaN.')
                    print('Breaking from training early, the last saved set of parameters is still usable!')
                    break
                print('  Epoch %i   ' %(u+1))
                print('***Train Results***')
                for i in xrange(self.num_costs):
                    print("Cost %i: %f"%(i,mean_costs[i]))
                self.train_costs.append(mean_costs)
                if not valid_set:
                    # Track the objective (first cost), as perform_validation
                    # does, so the comparison is a scalar one.
                    this_cost = numpy.absolute(mean_costs)[0]
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()

                if self.stop_train:
                    print('Stopping training early.')
                    break

                if lr_update:
                    # Honour the schedule requested by the caller instead of
                    # always forcing the linear one.
                    self.update_lr(u+1,update_type=update_type,begin_anneal=begin_anneal,
                                   start=self.start,num_iterations=self.num_epochs)
            print('Training completed!')
            self._plot_train_costs()

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self,):
        """
        Evaluate the costs on valid_set, record their mean, and update the
        best-cost / patience bookkeeping based on the first cost.
        """
        cost = [self.calc_cost(*batch) for batch in self.valid_set.iterate(True)]
        mean_costs = numpy.mean(cost,axis=0)
        self.valid_costs.append(mean_costs)
        print('***Validation Results***')
        for i in xrange(self.num_costs):
            print("Cost %i: %f"%(i,mean_costs[i]))

        # Only the first cost (the objective) drives model selection.
        this_cost = numpy.absolute(mean_costs)[0]
        if this_cost < self.best_cost:
            self.patience = 0
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()
        else:
            self.patience+=1
            print('Patience: %d/%d'%(self.patience,self.max_patience))
            if self.patience >= self.max_patience:
                self.stop_train = True


    def save_model(self,):
        """
        Pickle a copy of the current parameter values to
        'best_params.pickle', inside output_folder when one is set
        (the folder is created if missing).
        """
        best_params = [param.get_value().copy() for param in self.params]
        if self.output_folder:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder,'best_params.pickle')
        else:
            save_path = 'best_params.pickle'
        # Binary mode, and a context manager so the handle is always closed.
        with open(save_path,'wb') as handle:
            cPickle.dump(best_params,handle)


    def update_lr(self,count,update_type='annealed',begin_anneal=500.,min_lr=0.01,decay_factor=1.2,start=2,num_iterations=1000):
        """
        Set self.lr for the next epoch from self.init_lr.

        count: number of epochs completed so far.
        update_type:
            'annealed'    - init_lr * min(1, begin_anneal / count)
            'exponential' - init_lr / decay_factor**count, floored at min_lr
            'linear'      - once count >= start, init_lr - count * slope with
                            slope = init_lr / (num_iterations - start),
                            floored at 0 so the rate never turns negative.
        Any other update_type leaves self.lr unchanged.
        """
        if update_type=='annealed':
            scale_factor = float(begin_anneal)/count
            self.lr = self.init_lr*min(1.,scale_factor)
        elif update_type=='exponential':
            new_lr = float(self.init_lr)/(decay_factor**count)
            self.lr = max(new_lr, min_lr)
        elif update_type == 'linear':
            span = num_iterations - start
            # span == 0 (e.g. a single-epoch run) used to raise
            # ZeroDivisionError; there is nothing to anneal in that case.
            if count >= start and span > 0:
                # float() guards against Python 2 integer division when the
                # initial learning rate is an int.
                slope = self.init_lr/float(span)
                self.lr = max(0., self.init_lr - count * slope)
                print('Updated lr:  %s' % (self.lr,))
示例#36
0
文件: SGD.py 项目: chimera0/NIPS-2014
class SGD_Optimiser:
    def __init__(self,
                 params,
                 inputs,
                 costs,
                 updates_old=None,
                 consider_constant=None,
                 momentum=False,
                 patience=20,
                 custom_grads=False,
                 custom_grad_dict=None,
                 state=None,
                 clip_gradients=False,
                 grad_threshold=50.):
        """
        Store the optimiser configuration and compile the training functions.

        params: list containing the parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the objective.
        updates_old: OrderedDict from previous graphs that need to be accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to the grad method. Typically RBM.
                           ``None`` (the default) is treated as an empty list.
        momentum: when true, one zero-initialised velocity shared variable is
                  created per parameter and the momentum update rule is built.
        patience: stored as ``max_patience``; number of non-improving
                  validations tolerated before ``stop_train`` is raised.
        custom_grads: when true, gradients are looked up in
                      ``custom_grad_dict`` by ``param.name`` instead of
                      being derived with ``T.grad``.
        custom_grad_dict: mapping used when ``custom_grads`` is true.
        state: stored on the instance as ``self.state``.
        clip_gradients: when true (and ``custom_grads`` is false) gradients
                        are norm-clipped at ``grad_threshold``.
        grad_threshold: norm above which a gradient is rescaled.
        """
        # Validate first: len() on a non-list cost (e.g. a single symbolic
        # variable) would otherwise fail before this message could be shown.
        assert (
            isinstance(costs, list)
        ), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        self.max_patience = patience
        self.patience = 0
        if self.momentum:
            # One velocity buffer per parameter, same shape, all zeros.
            self.params_mom = []
            for param in self.params:
                param_init = theano.shared(value=numpy.zeros(
                    param.get_value().shape, dtype=theano.config.floatX), )
                self.params_mom.append(param_init)
        self.costs = costs
        self.custom_grads = custom_grads
        self.custom_grad_dict = custom_grad_dict
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # A fresh list per instance instead of a shared mutable default.
        self.consider_constant = ([] if consider_constant is None
                                  else consider_constant)
        self.clip_gradients = clip_gradients
        self.grad_threshold = grad_threshold
        self.state = state
        self.build_train_fn()
        #self.save_model() #saving pre-trained model

    def build_train_fn(self, ):
        """
        Build the gradient graph and compile the two theano functions.

        ``self.calc_cost`` evaluates ``self.costs`` on ``self.inputs``
        without touching the parameters. ``self.f`` takes the inputs plus
        the learning rate (and the momentum coefficient when momentum is
        enabled), returns the costs and applies the parameter updates
        together with any updates supplied as ``updates_old``.
        """
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]
        print('Calculating gradients. This might take a while depending on the model...')
        if self.custom_grads:
            # User-provided gradients, keyed by parameter name.
            self.gparams = [self.custom_grad_dict[param.name]
                            for param in self.params]
        elif self.clip_gradients:
            self.gradient_clipping()
        else:
            self.gparams = T.grad(self.costs[0],
                                  self.params,
                                  consider_constant=self.consider_constant)
        print('Done calculating gradients.')

        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano * j)
                                  for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom,
                                                self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs, self.costs)

        # Merge into a fresh dict so that an updates dict owned by the model
        # is never mutated. (The previous code tried to copy an unbound local
        # name and raised UnboundLocalError whenever updates_old was given.)
        merged_updates = OrderedDict()
        if self.updates_old:
            merged_updates.update(self.updates_old)
        merged_updates.update(updates)
        self.updates_old = merged_updates

        self.f = theano.function(self.grad_inputs,
                                 self.costs,
                                 updates=self.updates_old)

    def gradient_clipping(self, threshold=1.):
        """
        Compute the gradients of the objective and rescale the large ones.

        Each gradient whose L2 norm exceeds ``self.grad_threshold`` is scaled
        so that its norm equals that threshold; the others pass through
        unchanged. The result is stored in ``self.gparams``.

        Note: the ``threshold`` argument is not read by this method; the
        limit always comes from ``self.grad_threshold``.
        """
        print('Including Gradient clipping')
        raw_grads = T.grad(self.costs[0],
                           self.params,
                           consider_constant=self.consider_constant)
        limit = self.grad_threshold

        def _rescale(grad):
            # Symbolic L2 norm of the whole gradient tensor.
            norm = T.sqrt((grad**2).sum())
            return T.switch(norm > limit, (limit / norm) * grad, grad)

        self.gparams = [_rescale(grad) for grad in raw_grads]

    def train(self,
              train_set,
              valid_set=None,
              learning_rate=0.1,
              num_epochs=500,
              save=False,
              output_folder=None,
              lr_update=True,
              mom_rate=0.9,
              update_type='linear',
              begin_anneal=50,
              start=2,
              filename=None):
        print 'Initializing training.'
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        self.stop_train = False
        self.train_costs = []
        self.valid_costs = []
        self.num_epochs = num_epochs
        self.start = start - 1  #subtracting one for zero index.
        self.filename = filename
        try:
            for u in xrange(num_epochs):
                cost = []
                for i in self.train_set.iterate(True):
                    inputs = i + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    cost_no_update = self.calc_cost(
                        *i)[0]  #The first cost in the list is the objective
                    #print cost_no_update
                    if numpy.isnan(cost_no_update):
                        print 'Cost was NaN for a particular batch!'
                        break
                    else:
                        cost.append(self.f(*inputs))
                if numpy.isnan(cost_no_update):
                    break
                mean_costs = numpy.mean(cost, axis=0)
                if numpy.isnan(mean_costs[0]):
                    print 'Training cost is NaN.'
                    print 'Breaking from training early, the last saved set of parameters is still usable!'
                    break
                print '  Epoch %i   ' % (u + 1)
                print '***Train Results***'
                for i in xrange(self.num_costs):
                    print "Cost %i: %f" % (i, mean_costs[i])
                self.train_costs.append(mean_costs)
                if not valid_set:
                    this_cost = numpy.absolute(numpy.mean(cost, axis=0))
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print 'Best Params!'
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()

                if self.stop_train:
                    print 'Stopping training early.'
                    break

                if lr_update:
                    self.update_lr(u + 1,
                                   update_type='linear',
                                   start=self.start,
                                   num_iterations=self.num_epochs)
            print 'Training completed!'

        except KeyboardInterrupt:
            print 'Training interrupted.'

    def perform_validation(self, ):
        cost = []
        for i in self.valid_set.iterate(True):
            cost.append(self.calc_cost(*i))
        mean_costs = numpy.mean(cost, axis=0)
        self.valid_costs.append(mean_costs)
        print '***Validation Results***'
        for i in xrange(self.num_costs):
            print "Cost %i: %f" % (i, mean_costs[i])

        this_cost = numpy.absolute(numpy.mean(
            cost, axis=0))[0]  #The first cost is the objective function
        if this_cost < self.best_cost:
            self.patience = 0
            self.best_cost = this_cost
            print 'Best Params!'
            if self.save:
                self.save_model()
        else:
            self.patience += 1
            print 'Patience: %d/%d' % (self.patience, self.max_patience)
            if self.patience >= self.max_patience:
                self.stop_train = True

    def save_model(self, filename=None):
        print 'Saving model parameters.'
        best_params = [param.get_value().copy() for param in self.params]
        if not self.output_folder:
            if not filename:
                cPickle.dump(best_params, open('best_params.pickle', 'w'))
            else:
                cPickle.dump(best_params, open(filename, 'w'))
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            if not filename:
                save_path = os.path.join(self.output_folder,
                                         'best_params.pickle')
            else:
                save_path = os.path.join(self.output_folder, filename)
            cPickle.dump(best_params, open(save_path, 'w'))

    def update_lr(self,
                  count,
                  update_type='annealed',
                  begin_anneal=500.,
                  min_lr=0.01,
                  decay_factor=1.2,
                  start=2,
                  num_iterations=1000):
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        elif update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor**count)
            if new_lr < min_lr:
                self.lr = min_lr
            else:
                self.lr = new_lr
        elif update_type == 'linear':
            slope = self.init_lr / (
                num_iterations - start
            )  #Ensure this is never zero, num_iterations must be > 2
            if count >= start:
                self.lr = self.init_lr - count * slope
                print 'Updated lr: ', self.lr
示例#37
0
文件: optimizer.py 项目: rohit22/top
    def compile(self):
        """
        Build the update rules for the selected method and compile them.

        ``self.method`` (case-insensitive) selects one of 'sgd', 'rmsprop',
        'adam' or 'adagrad'; anything else raises NotImplementedError.
        ``self.extra_updates``, when not None, is merged on top of the
        optimiser updates. Two theano functions are stored on the instance:
        ``self.f`` returns the cost and applies the updates, ``self.g``
        returns the cost without updating anything (use it for testing).

        Returns ``self``.
        """
        print("$> Compiling optimizer.")

        #TODO: automate "install" of new methods.
        # instead of this switch, should check if the input string is
        # the name of a valid method
        method = self.method.lower()
        if method == 'sgd':
            # NOTE(review): the keyword is spelled ``consider_cosntant``
            # here but ``consider_constant`` in the rmsprop call; it is
            # kept as found - confirm against the signature of top.up.sgd.
            updates = top.up.sgd(self.p,
                                 cost=self.cost,
                                 lr=self.lr,
                                 momentum=self.m,
                                 lr_rate=self.lr_rate,
                                 m_rate=self.m_rate,
                                 consider_cosntant=self.cc)
        elif method == 'rmsprop':
            updates = top.up.rmsprop(self.p,
                                     self.cost,
                                     lr=self.lr,
                                     momentum=self.m,
                                     lr_rate=self.lr_rate,
                                     m_rate=self.m_rate,
                                     consider_constant=self.cc,
                                     grad_clip=self.grad_clip)
        elif method == 'adam':
            updates = top.up.adam(self.p,
                                  self.cost,
                                  lr=self.lr,
                                  grad_clip=self.grad_clip)
        elif method == 'adagrad':
            updates = top.up.adagrad(self.p,
                                     self.cost,
                                     lr=self.lr,
                                     lr_rate=self.lr_rate)
        else:
            raise NotImplementedError("Optimization method not implemented!")

        updates = OrderedDict(updates)
        if self.extra_updates is not None:
            updates.update(self.extra_updates)

        # The empty-input case is still decided separately (the original
        # author reported bugs without this distinction); only the
        # theano.function calls themselves are shared.
        if self.input == []:
            fn_inputs = []
        else:
            if not isinstance(self.input, list):
                self.input = [self.input]
            fn_inputs = self.input

        # Return cost and update params
        self.f = theano.function(fn_inputs,
                                 self.cost,
                                 updates=updates,
                                 givens=self.givens,
                                 allow_input_downcast=True)
        # Return cost without updating params, use this for testing
        self.g = theano.function(fn_inputs,
                                 self.cost,
                                 givens=self.givens,
                                 allow_input_downcast=True)

        return self
 def setup_training(self):
     """
     Compile the theano functions used for training.

     Sets ``fprop_func`` (forward pass), ``training`` (parameter step
     scaled by the running RMS values), ``rmsprop_update`` (refresh of the
     running RMS values), ``dimshuf_func`` (axis permutation of a 4D
     tensor) and ``cost_function`` (cost evaluation) on the instance.
     """
     batch_size = self.mini_batch_size

     default_cost = self.cnn.get_default_cost()
     data_specs = default_cost.get_data_specs(self.cnn)
     mapping = DataSpecsMapping(data_specs)
     spaces = mapping.flatten(data_specs[0], return_tuple=True)
     sources = mapping.flatten(data_specs[1], return_tuple=True)

     # One float32 symbolic batch per (space, source) pair.
     theano_args = tuple(
         space.make_theano_batch(
             name='%s[%s]' % (self.__class__.__name__, source),
             batch_size=batch_size).astype("float32")
         for space, source in safe_zip(spaces, sources))

     y_hat = self.cnn.fprop(theano_args[0])
     self.fprop_func = theano.function([theano_args[0]], y_hat)

     cost = self.cnn.cost(theano_args[1], y_hat)

     # The returned scalers are not used below; the call is kept as is.
     lr_scalers = self.cnn.get_lr_scalers()

     params = list(self.cnn.get_params())
     grads = T.grad(cost, params, disconnected_inputs='ignore')
     gradients = OrderedDict(izip(params, grads))
     rms_vals_dict = OrderedDict(izip(params, self.rms_vals))

     # Gradient step divided by the root of the running mean square.
     stepped_params = [
         param - self.learning_rate *
         (gradients[param] / T.sqrt(rms_vals_dict[param] + 1e-8))
         for param in params]
     updates = OrderedDict()
     updates.update(dict(safe_zip(params, stepped_params)))

     # Exponential moving average of the squared gradients.
     refreshed_rms = [
         (rms_vals_dict[param] * .9) + (T.sqr(gradients[param]) * .1)
         for param in params]
     rmsprop_updates = OrderedDict()
     rmsprop_updates.update(dict(safe_zip(self.rms_vals, refreshed_rms)))

     self.training = theano.function(theano_args, updates=updates,
                                     on_unused_input='ignore')
     self.rmsprop_update = theano.function(theano_args,
                                           updates=rmsprop_updates,
                                           on_unused_input='ignore')

     four_d = T.tensor4()
     self.dimshuf_func = theano.function([four_d],
                                         four_d.dimshuffle(1, 2, 3, 0))

     #self.grads_func = theano.function(theano_args, grads)

     self.cost_function = theano.function(theano_args, cost)
示例#39
0
            n_nit_sot += 1

    # Step 5.5 all other arguments including extra inputs
    other_scan_args = []
    other_inner_args = []

    other_scan_args += [arg for arg in non_seqs
                        if (not isinstance(arg, SharedVariable) and
                            not isinstance(arg, tensor.Constant))]

    # Step 5.6 all shared variables with no update rules
    other_inner_args += [safe_new(arg, '_copy') for arg in non_seqs
                         if (not isinstance(arg, SharedVariable) and
                             not isinstance(arg, tensor.Constant))]

    givens.update(OrderedDict(zip(other_scan_args, other_inner_args)))

    if strict:
        non_seqs_set = set(non_sequences if non_sequences != None else [])

        other_shared_scan_args = [arg.variable for arg
                            in dummy_f.maker.expanded_inputs
                            if (isinstance(arg.variable, SharedVariable) and
                                not arg.update and
                                arg.variable in non_seqs_set)]
        other_shared_inner_args = [safe_new(arg.variable, '_copy') for arg
                            in dummy_f.maker.expanded_inputs
                            if (isinstance(arg.variable, SharedVariable) and
                                not arg.update and
                                arg.variable in non_seqs_set)]
    else:
示例#40
0
文件: dpcn.py 项目: EderSantana/mdpcn
class ConvSparseCoding(ConvElemwise):
    '''
        Parameters for the optimization/feedforward operation:
        lr      : learning rate
        n_steps : number of steps or uptades of the hidden code
        truncate: truncate the gradient after this number (default -1 which 
                  means do not truncate)
    '''
    
    def __init__(self, batch_size, x_axes=['b', 'c', 0, 1], 
                 fprop_code=True, lr=.01, n_steps=10, 
                 truncate=-1, *args, **kwargs):
        """
        Forward the remaining arguments to the parent constructor and store
        the sparse-coding settings on the instance.

        ``x_axes`` is accepted but not read by this constructor.
        """
        super(ConvSparseCoding, self).__init__(*args, **kwargs)

        # Settings of the inner optimisation of the code.
        self.lr = lr
        self.n_steps = n_steps
        self.truncate = truncate

        self.batch_size = batch_size
        self.fprop_code = fprop_code

        # Filled with the updates returned by theano.scan in
        # get_sparse_code.
        self._scan_updates = OrderedDict()
    
    def initialize_x_space(self,rng):
        """
        Initialise the code space ``self.x_space`` and the code ``self.X``.

        X is the name used throughout for the sparse code variables, hence
        x_space describes their dimensions.

        A zero batch of the detector space is (optionally) pooled and
        evaluated to discover the spatial shape of the code space. The code
        itself is drawn from ``rng.normal(0, .001)`` and stored as a shared
        variable named ``<layer_name>_X``.
        """
        n_batch = self.mlp.batch_size
        if n_batch is None:
            n_batch = self.batch_size

        detector_probe = sharedX(
            self.detector_space.get_origin_batch(n_batch))

        if self.pool_type is not None:
            assert self.pool_type in ['max', 'mean']
            # Only the pool shape is passed on; the pool_stride and
            # image_shape arguments were disabled in this code.
            if self.pool_type == 'max':
                probe = max_pool(detector_probe, self.pool_shape)
            elif self.pool_type == 'mean':
                probe = mean_pool(detector_probe, self.pool_shape)
            probe = probe.eval()
        else:
            probe = detector_probe.eval()

        self.x_space = Conv2DSpace(shape=[probe.shape[2], probe.shape[3]],
                                   num_channels=self.output_channels,
                                   axes=('b', 'c', 0, 1))

        code_shape = (n_batch,
                      self.output_channels,
                      self.detector_space.shape[0],
                      self.detector_space.shape[1])
        initial_code = rng.normal(0, .001, size=code_shape)
        self.X = sharedX(initial_code, self.layer_name+'_X')

        logger.info('Code space: {0}'.format(self.x_space.shape))

    @wraps(ConvElemwise.initialize_transformer)
    def initialize_transformer(self, rng):
        """
        This function initializes the transformer of the class. Re-running
        this function will reset the transformer.

        X is the name used throughout for the sparse code variables, hence
        x_space describes their dimensions. The transformer maps the code
        space (``self.x_space``) to the input space, in both the dense
        (``irange``) and the sparse (``sparse_init``) initialisation.

        Parameters
        ----------
        rng : object
            random number generator object.
        """

        if self.irange is not None:
            assert self.sparse_init is None
            self.transformer = conv2d.make_random_conv2D(
                    irange=self.irange,
                    input_space=self.x_space,
                    output_space=self.input_space,
                    kernel_shape=self.kernel_shape,
                    subsample=self.kernel_stride,
                    border_mode=self.border_mode,
                    rng=rng)
        elif self.sparse_init is not None:
            # Same spaces as the dense branch: the attribute is `x_space`
            # (there is no `X_space`), and the output of lmul(X) is compared
            # against the layer input, so it must live in input_space.
            self.transformer = conv2d.make_sparse_random_conv2D(
                    num_nonzero=self.sparse_init,
                    input_space=self.x_space,
                    output_space=self.input_space,
                    kernel_shape=self.kernel_shape,
                    subsample=self.kernel_stride,
                    border_mode=self.border_mode,
                    rng=rng)

            
    def get_local_cost(self, state_below):
        """
        Return the symbolic sparse-coding cost of the current code: summed
        squared reconstruction error of ``state_below`` plus 0.1 times a
        smoothed L1 penalty (``sqrt(X**2 + 1e-6)``) on ``self.X``.
        """
        reconstruction = self.transformer.lmul(self.X)
        squared_error = T.sqr(state_below - reconstruction).sum()
        smooth_l1 = T.sqrt(T.sqr(self.X) + 1e-6).sum()
        return squared_error + .1 * smooth_l1


    @wraps(ConvElemwise.initialize_output_space)
    def initialize_output_space(self):
        # The layer outputs the code space when it propagates the sparse
        # code (fprop_code is True) and the input space otherwise.
        self.output_space = (self.x_space if self.fprop_code is True
                             else self.input_space)

        logger.info('Output space: {0}'.format(self.output_space.shape))
    
    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        """
        Note: this function will reset the parameters!

        Stores ``space`` as the input space, derives the detector space from
        the kernel shape and stride, then (re)creates the code space, the
        transformer, the bias and the output space. Raises
        BadInputSpaceError when ``space`` is not a Conv2DSpace.
        """

        self.input_space = space

        if not isinstance(space, Conv2DSpace):
            message = (self.__class__.__name__ +
                       ".set_input_space "
                       "expected a Conv2DSpace, got " +
                       str(space) + " of type " +
                       str(type(space)))
            raise BadInputSpaceError(message)

        rng = self.mlp.rng

        # Same formula along both spatial axes.
        output_shape = [(self.input_space.shape[axis] +
                         self.kernel_shape[axis])
                        / self.kernel_stride[axis] - 1
                        for axis in (0, 1)]

        self.detector_space = Conv2DSpace(shape=output_shape,
                                          num_channels=self.output_channels,
                                          axes=('b', 'c', 0, 1))

        self.initialize_x_space(rng)
        self.initialize_transformer(rng)

        # The transformer is expected to expose exactly one parameter.
        W, = self.transformer.get_params()
        W.name = self.layer_name + '_W'

        if self.tied_b:
            # One bias per output channel.
            bias_init = (np.zeros((self.detector_space.num_channels)) +
                         self.init_bias)
        else:
            # One bias per detector unit.
            bias_init = self.detector_space.get_origin() + self.init_bias
        self.b = sharedX(bias_init)
        self.b.name = self.layer_name + '_b'

        logger.info('Input shape: {0}'.format(self.input_space.shape))
        logger.info('Detector space: {0}'.format(self.detector_space.shape))

        self.initialize_output_space()


    def _renormW(self):
        A = self.transformer.get_params()[0].get_value(borrow=True)
        Ashape = A.shape
        A = A.reshape((Ashape[0]*Ashape[1],Ashape[2]*Ashape[3]))
        A = np.dot(A.T, np.diag(1./np.sqrt(np.sum(A**2, axis=1)))).T
        A = A.reshape(Ashape)
        self.transformer.get_params()[0].set_value( A )
    
    def get_sparse_code(self, state_below):
        """
        Build the symbolic sparse code inferred for ``state_below``.

        The transformer weights are renormalised (when this method is
        called, i.e. while the graph is built), then ``self.n_steps``
        RMSprop-with-momentum updates of the code are chained with
        theano.scan, starting from the shared variable ``self.X``. The
        updates returned by scan are accumulated in ``self._scan_updates``;
        the code after the last step is stored in ``self.Xout`` and
        returned.
        """

        def _optimization_step(Xt, accum, vt, S):
            # One RMSprop update of the code, run inside scan.
            # TODO (from the original author): move this to a better place;
            # making it a method was problematic because it is unclear how
            # to tell theano.scan that the first argument is a non_sequence.
            rho = .9
            momentum = .9
            lr = self.lr
            reconstruction = self.transformer.lmul(Xt)
            err = (S - reconstruction) ** 2
            l1 = T.sqrt(Xt**2 + 1e-6)
            cost = err.sum() + .1 * l1.sum()
            gX = T.grad(cost, Xt)
            new_accum = rho * accum + (1-rho) * gX**2
            # Gradient step normalised by the running RMS of the gradient.
            scaled_step = lr * gX / T.sqrt(new_accum + 1e-8)
            v = momentum * vt - scaled_step
            X = Xt + momentum * v - scaled_step
            return [X, new_accum, v]

        # Renorm W
        self._renormW()

        initial_accum = T.zeros_like(self.X)
        initial_velocity = T.zeros_like(self.X)
        (code_steps, _, _), scan_updates = theano.scan(
            fn=_optimization_step,
            outputs_info=[self.X, initial_accum, initial_velocity],
            non_sequences=[state_below],
            n_steps=self.n_steps,
            truncate_gradient=self.truncate)

        self._scan_updates.update(scan_updates)

        # Keep only the code reached after the final step.
        self.Xout = code_steps[-1]
        self.state_below = state_below

        return self.Xout
    
    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # Add the updates collected from theano.scan in get_sparse_code to
        # the training updates, overriding entries with the same key.
        for variable, new_value in self._scan_updates.items():
            updates[variable] = new_value

    def get_nonlin_output(self, state_below):
        """
        Return the max-pooled shared code ``self.X`` passed through the
        layer nonlinearity. ``state_below`` is not read here.
        """
        pooled = max_pool(self.X, self.pool_shape)
        return self.nonlin.apply(pooled)


    @wraps(Layer.fprop)
    def fprop(self, state_below):
        # Validate the input, build the sparse-code graph, then output
        # either the pooled nonlinear code or the reconstruction.
        self.input_space.validate(state_below)
        code = self.get_sparse_code(state_below)

        if self.fprop_code == True:
            output = self.get_nonlin_output(state_below)
        else:
            # Fprops the filtered input instead
            output = self.transformer.lmul(code)

        self.output_space.validate(output)
        return output
示例#41
0
class Training(PickleMixin, TheanoMixin):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 name,
                 data,
                 model,
                 optimizer,
                 cost,
                 outputs,
                 debug_print=0,
                 trainlog=None,
                 extension=None):
        """
        Store the training configuration and compile the training function.

        The inputs and the initial updates are taken from ``model``; each
        node's ``lr_scaler`` is handed to the optimizer keyed by node name.
        The compiled function is stored as ``self.cost_fn`` and the
        compilation time is printed. A fresh ``TrainLog`` is created when
        ``trainlog`` is None.
        """
        self.name = name
        self.data = data
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = OrderedDict(model.updates)
        self.extension = extension
        self.debug_print = debug_print

        # Per-node learning-rate scalers, in node order.
        self.optimizer.lr_scalers = OrderedDict(
            (node.name, node.lr_scaler) for node in self.model.nodes)

        started = time.time()
        self.cost_fn = self.build_training_graph()
        print("Elapsed compilation time: %f" % (time.time() - started))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)

        self.trainlog = TrainLog() if trainlog is None else trainlog
        self.endloop = 0

    def build_training_graph(self):
        """
        Derive the gradients, turn them into updates and compile the graph.

        Extensions named 'ext_regularize_pre_grad', 'ext_grad' and
        'ext_regularize_post_grad' are run around the gradient and update
        construction. Returns whatever ``build_theano_graph`` returns.
        """
        self.run_extension('ext_regularize_pre_grad')

        params = self.model.params.values()
        self.grads = OrderedDict(izip(params, T.grad(self.cost, params)))
        self.run_extension('ext_grad')

        # The optimizer turns the gradients into parameter updates, which
        # override any model update registered for the same key.
        param_updates = self.optimizer.get_updates(self.grads)
        self.updates.update(param_updates.items())

        self.run_extension('ext_regularize_post_grad')

        return self.build_theano_graph(self.inputs, self.outputs, self.updates)

    def run(self):
        """Run epochs until ``run_epoch`` returns a false value."""
        logger.info("Entering main loop")
        keep_going = True
        while keep_going:
            keep_going = self.run_epoch()
        logger.info("Terminating main loop")

    def run_epoch(self):
        """
        Process every batch of ``self.data`` once.

        For each batch the monitor and save extensions run first, the
        training function is timed and its result logged, then the schedule
        extension runs. After the epoch the termination extension runs.
        Returns False when ``end_training()`` is true (after a final
        monitor/save pass), True otherwise.
        """
        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            started = time.time()
            batch_result = self.cost_fn(*batch)
            elapsed = time.time() - started
            self.trainlog.monitor['time'].append(elapsed)
            self.trainlog.monitor['update'].append(batch_result)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')

        self.trainlog.epoch_seen += 1
        self.run_extension('ext_term')

        if not self.end_training():
            return True

        # Training is over: give monitoring and saving one last chance.
        self.run_extension('ext_monitor')
        self.run_extension('ext_save')
        return False

    def find_extension(self, name):
        """
        Look up the registered extensions called ``name``.

        Returns a ``(found, extensions)`` pair: ``found`` is 1 and
        ``extensions`` is the list of matches when at least one extension
        has that name, ``(0, [])`` when none has, and ``(0, None)`` when
        ``self.extension`` cannot be searched (it is None / not iterable,
        or an entry has no ``name`` attribute).
        """
        try:
            exts = [
                extension for extension in self.extension
                if extension.name == name
            ]
        except (TypeError, AttributeError):
            # Best effort: no usable extension list means "nothing found".
            # Only these two errors are expected; anything else (including
            # KeyboardInterrupt) is no longer swallowed.
            return (0, None)
        return (1 if len(exts) > 0 else 0), exts

    def run_extension(self, name):
        """Call ``exe(self)`` on every registered extension called ``name``."""
        found, matches = self.find_extension(name)
        if not found:
            return
        for extension in matches:
            extension.exe(self)

    def end_training(self):
        """Return the ``endloop`` flag; a true value ends the main loop."""
        should_stop = self.endloop
        return should_stop
示例#42
0
文件: __init__.py 项目: Beronx86/cle
class Training(PickleMixin, TheanoMixin):
    """
    Compile a training function for a model and drive the training loop,
    calling named extensions at fixed points of the loop.

    Parameters
    ----------
    name : object
        Identifier stored on the instance as `name`.
    data : iterable
        Iterated once per epoch; each item is unpacked as the positional
        arguments of the compiled training function.
    model : object
        Must expose `inputs`, `updates`, `params` (a mapping whose values
        the cost is differentiated against) and `nodes` (each with a
        `name` and an `lr_scaler`).
    optimizer : object
        Receives an `lr_scalers` attribute and must implement
        `get_updates(grads)`.
    cost : object
        Expression differentiated with `T.grad`.
    outputs : object
        Passed through `tolist` and used as the compiled function outputs.
    debug_print : int, optional
        When truthy, the compiled function is dumped with theano's
        `debugprint`.
    trainlog : object, optional
        Log container; a new `TrainLog` is created when None.
    extension : iterable, optional
        Objects with a `name` attribute and an `exe(trainer)` method.
    """
    def __init__(self,
                 name,
                 data,
                 model,
                 optimizer,
                 cost,
                 outputs,
                 debug_print=0,
                 trainlog=None,
                 extension=None):
        self.name = name
        self.data = data
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = OrderedDict()
        self.updates.update(model.updates)
        self.extension = extension
        self.debug_print = debug_print

        # Per-node learning-rate multipliers, handed to the optimizer
        # keyed by node name.
        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.optimizer.lr_scalers = lr_scalers

        # NOTE: the graph is built (and extensions are run) before
        # `trainlog` and `endloop` are assigned below.
        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)
        if trainlog is None:
            self.trainlog = TrainLog()
        else:
            self.trainlog = trainlog
        self.endloop = 0

    def build_training_graph(self):
        """
        Assemble the parameter updates and compile the training function.

        Runs the 'ext_regularize_pre_grad', 'ext_grad' and
        'ext_regularize_post_grad' extensions around the gradient
        computation and the optimizer step, then returns the result of
        `self.build_theano_graph(inputs, outputs, updates)`.
        """
        self.run_extension('ext_regularize_pre_grad')
        self.grads = OrderedDict(izip(self.model.params.values(),
                                      T.grad(self.cost, self.model.params.values())))
        self.run_extension('ext_grad')
        grads = self.optimizer.get_updates(self.grads)

        for key, val in grads.items():
            self.updates[key] = val

        self.run_extension('ext_regularize_post_grad')

        return self.build_theano_graph(self.inputs, self.outputs, self.updates)

    def run(self):
        """Run epochs until `run_epoch` returns a falsy value."""
        logger.info("Entering main loop")
        while self.run_epoch():
            pass
        logger.info("Terminating main loop")

    def run_epoch(self):
        """
        Process every batch of `self.data` once.

        Returns
        -------
        bool
            False if `end_training()` is truthy after the epoch (monitor
            and save extensions are run one final time), True otherwise.
        """
        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            batch_t0 = time.time()
            this_cost = self.cost_fn(*batch)
            self.trainlog.monitor['time'].append(time.time() - batch_t0)
            self.trainlog.monitor['update'].append(this_cost)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')

        self.trainlog.epoch_seen += 1
        self.run_extension('ext_term')

        if self.end_training():
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            return False

        return True

    def find_extension(self, name):
        """
        Collect every registered extension whose `name` equals `name`.

        Returns
        -------
        tuple
            `(1, exts)` when at least one extension matches, `(0, [])`
            when none match, and `(0, None)` when `self.extension` is
            missing, None, not iterable, or contains an object without a
            `name` attribute.
        """
        extensions = getattr(self, 'extension', None)
        if extensions is None:
            return (0, None)

        # Only "no usable extension list" failures mean "nothing found";
        # KeyboardInterrupt, SystemExit and unrelated errors propagate
        # instead of being silently swallowed by a bare except.
        try:
            exts = [ext for ext in extensions if ext.name == name]
        except (TypeError, AttributeError):
            return (0, None)

        return (1 if exts else 0), exts

    def run_extension(self, name):
        """Call `exe(self)` on every extension registered under `name`."""
        tok, exts = self.find_extension(name)
        if tok:
            for ext in exts:
                ext.exe(self)

    def end_training(self):
        """Return the `endloop` flag; truthy means the main loop stops."""
        return self.endloop
示例#43
0
        def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False):
            """
            Compile the update function for exactly one of the two players.

            Exactly one of `learn_discriminator` / `learn_generator` must
            be truthy; it selects whose parameters receive updates. When
            the third flag is truthy, it is additionally asserted that no
            generator parameter appears in the final updates.
            """
            assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator)

            updates = OrderedDict()

            player = model.discriminator if learn_discriminator else model.generator
            cur_params = player.get_params()

            def check():
                # Parameters of the other player must never be updated.
                for candidate in params:
                    if candidate in cur_params:
                        continue
                    assert candidate not in updates

            cur_grads = OrderedDict((p, grads[p]) for p in cur_params)

            # Give anonymous gradients a readable name and verify dtypes.
            for param in grads:
                grad_var = grads[param]
                if grad_var.name is None and cost_value is not None:
                    grad_var.name = ('grad(%(costname)s, %(paramname)s)' %
                                     {'costname': cost_value.name,
                                      'paramname': param.name})
                assert grad_var.dtype == param.dtype

            cur_lr_scalers = OrderedDict(
                (p, lr_scalers[p]) for p in cur_params if p in lr_scalers)

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = 'anon_param' if param.name is None else param.name
                lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
                log.info('\t' + param_name + ': ' + str(lr))

            rule_updates = self.learning_rule.get_updates(
                learning_rate, cur_grads, cur_lr_scalers)
            updates.update(rule_updates)
            check()

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'
            check()

            # The model may rewrite (e.g. constrain) the proposed updates.
            model.modify_updates(updates)
            check()

            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                update.name)

            check()

            if dont_you_fucking_dare_touch_the_generator:
                for param in model.generator.get_params():
                    assert param not in updates

            with log_timing(log, 'Compiling sgd_update'):
                compiled = function(theano_args,
                                    updates=updates,
                                    name='sgd_update',
                                    on_unused_input='ignore',
                                    mode=self.theano_function_mode)
                return compiled
示例#44
0
    def __init__(self, objective, params, inputs = None,
            param_constrainers = None, max_iter = -1,
            lr_scalers = None, verbose = 0, tol = None,
            init_alpha = None, min_init_alpha = 1e-3,
            reset_alpha = True, conjugate = False,
            reset_conjugate = True, gradients = None,
            gradient_updates = None, line_search_mode = None,
            accumulate = False, theano_function_mode=None):
        """
        Compile every function the batch gradient descent driver needs.

        All constructor arguments are first stored verbatim as instance
        attributes; `init_alpha`, `verbose`, `params` and `tol` are then
        re-assigned explicitly further down.

        Parameters
        ----------
        objective : symbolic expression
            Differentiated w.r.t. each parameter and compiled as
            `self.obj`; its `dtype` selects the default `tol`.
        params : iterable
            Parameters to optimise; each must provide `get_value()` and
            a `name` attribute.
        inputs : list, optional
            Inputs of the compiled gradient/objective functions.
        param_constrainers : list or tuple, optional
            Callables, each called once with the dict of parameter
            updates used by `_goto_alpha`.
        max_iter, min_init_alpha, reset_alpha, reset_conjugate
            Only stored as attributes by this method.
        lr_scalers : mapping, optional
            Per-parameter multiplier applied to the step size `alpha`.
        verbose : int, optional
            When truthy, progress messages are logged.
        tol : float, optional
            Defaults to 1e-6 for a float32 objective, 3e-7 otherwise.
        init_alpha : iterable of numbers, optional
            Default depends on `line_search_mode` (see below).
        conjugate : bool, optional
            When truthy, the conjugate-gradient helper functions are
            compiled as well.
        gradients : mapping, optional
            Gradient expressions to use instead of `grad(objective, p)`
            for the parameters it contains.
        gradient_updates : mapping, optional
            Extra updates merged into the gradient function's updates.
        line_search_mode : None or 'exhaustive'
        accumulate : bool, optional
            Use `Accumulator` instead of `function` for the gradient and
            objective functions.
        theano_function_mode : object, optional
            Passed as `mode` to every `function` call.
        """

        # Snapshot of the raw arguments. Because this runs before the
        # None-defaults below are applied, `self.inputs` and
        # `self.param_constrainers` keep the caller's value (possibly None).
        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha  = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        # param -> symbolic gradient, param -> shared buffer holding the
        # last computed gradient, and the updates that fill those buffers.
        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [ param for param in params ]

        for param in params:
            # Prefer a caller-supplied gradient expression when available.
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            # Buffer initialised from the parameter's current value times 0.
            grad_shared = sharedX( param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.info('batch gradient class compiling gradient function')
        t1 = time.time()
        # Function with no outputs: calling it only stores the gradients
        # in the shared buffers via `updates`.
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates = updates)
        else:
            self._compute_grad = function(inputs, updates = updates,
                    mode=self.theano_function_mode,
                    name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.info('done. Took {0}'.format(t2-t1))

        if self.verbose:
            logger.info('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs, obj, mode=self.theano_function_mode,
                    name='BatchGradientDescent.obj')

        if self.verbose:
            logger.info('done')

        # Each parameter gets a cached copy; `_cache_values` snapshots the
        # parameters and `_goto_alpha` sets them to
        # cache - alpha * (optional lr scaler) * stored gradient.
        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name = 'alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        # Constrainers are handed the update dict before it is compiled;
        # presumably they modify it in place -- confirm against callers.
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha], updates=goto_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

        # Norm over all stored gradients; `_normalize_grad` divides every
        # gradient buffer by it and returns the norm.
        norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        # Weighted blend of the new norm and the previous average.
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

        self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            # One "previous gradient" buffer per gradient buffer.
            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

            # Takes the `norm` variable as its input and stores
            # gradient * norm in the old-gradient buffers.
            self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                for g_ in grad_to_old_grad]), mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                # Sum of element-wise products over paired lists of tensors.
                return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

            # The 1e-7 term keeps the denominator away from zero.
            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            #beta_pr is the Polak-Ribiere formula for beta.
            #According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            #but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            #(ie, it is meant to revert to steepest descent when you have traveled far enough that
            #the objective function is behaving non-quadratically enough that the conjugate gradient
            #formulas aren't working anymore)

            #http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            # NOTE(review): `grad` here looks like the gradient function
            # called above (`grad(objective, param)`), not a gradient
            # buffer, so this check appears to be vacuous -- confirm
            # whether `grad_shared` was intended.
            assert grad not in grad_to_old_grad

            # New direction: current gradient plus beta times the old one.
            make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

            # Modes exposing a `record` attribute get a textual trace of
            # the update graph.
            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function([], updates=make_conjugate_updates,
                    mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')


        # Default tolerance depends on the objective's dtype.
        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
示例#45
0
class SGD_Optimizer():
    """
    Stochastic gradient descent over a list of shared parameters, with
    optional momentum, epoch loop, validation and best-parameter saving.
    """

    def __init__(self,
                 params,
                 inputs,
                 costs,
                 updates_old=None,
                 consider_constant=None,
                 momentum=True):
        """
        params: parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the objective.
        updates_old: OrderedDict from previous graphs that need to be accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to the grad method. Typically RBM.
            Defaults to an empty list.
        momentum: when truthy, one momentum buffer is created per parameter
            and the compiled function takes an extra momentum-rate input.
        """
        # Validate before anything relies on `costs` being a list.
        assert (
            isinstance(costs, list)
        ), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        if self.momentum:
            # Zero-initialised velocity with the same shape as each parameter.
            self.params_mom = []
            for param in self.params:
                param_init = theano.shared(value=numpy.zeros(
                    param.get_value().shape, dtype=theano.config.floatX),
                                           name=param.name + '_mom')
                self.params_mom.append(param_init)
        self.costs = costs
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # A fresh list per instance instead of a shared mutable default.
        if consider_constant is None:
            consider_constant = []
        self.consider_constant = consider_constant
        self.build_train_fn()

    def build_train_fn(self, ):
        """
        Compile `self.calc_cost` (costs only) and `self.f` (costs plus
        parameter updates). `self.f` takes the model inputs followed by
        the learning rate and, with momentum, the momentum rate.
        """
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        self.gparams = T.grad(self.costs[0],
                              self.params,
                              consider_constant=self.consider_constant)
        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano * j)
                                  for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom,
                                                self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs, self.costs)
        if self.updates_old:
            # Work on a copy so the caller's dict (possibly owned by the
            # model) is not modified by the SGD updates.
            self.updates_old = copy.copy(self.updates_old)
        else:
            self.updates_old = OrderedDict()
        self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs,
                                 self.costs,
                                 updates=self.updates_old)

    def train(self,
              train_set,
              valid_set=None,
              learning_rate=0.1,
              num_epochs=500,
              save=False,
              output_folder=None,
              lr_update=None,
              mom_rate=0.9):
        """
        Run up to `num_epochs` epochs over `train_set.iterate(True)`.

        Each batch must be a list of input values; the learning rate (and
        momentum rate, if enabled) are appended before calling `self.f`.
        Without a validation set, the absolute mean of the objective
        (`costs[0]`) decides whether the parameters are the best so far;
        otherwise `perform_validation` does. A KeyboardInterrupt stops
        training quietly.
        """
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        try:
            for epoch in range(num_epochs):
                batch_costs = []
                for batch in self.train_set.iterate(True):
                    inputs = batch + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    batch_costs.append(self.f(*inputs))
                mean_costs = numpy.mean(batch_costs, axis=0)
                print('  Epoch %i   ' % (epoch + 1))
                print('***Train Results***')
                for idx in range(self.num_costs):
                    print("Cost %i: %f" % (idx, mean_costs[idx]))

                if not valid_set:
                    # Track the objective (first cost) on the instance so
                    # the best value survives across epochs.
                    this_cost = numpy.absolute(mean_costs[0])
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()

                if lr_update:
                    self.update_lr(epoch + 1, begin_anneal=1)

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self, ):
        """
        Evaluate all costs on `self.valid_set` and save the parameters
        when the second cost (index 1) improves on `self.best_cost`.
        """
        batch_costs = []
        for batch in self.valid_set.iterate(True):
            batch_costs.append(self.calc_cost(*batch))
        mean_costs = numpy.mean(batch_costs, axis=0)
        print('***Validation Results***')
        for idx in range(self.num_costs):
            print("Cost %i: %f" % (idx, mean_costs[idx]))

        this_cost = numpy.absolute(mean_costs)[1]  #Using accuracy as metric
        if this_cost < self.best_cost:
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()

    def save_model(self, ):
        """
        Pickle copies of the current parameter values to
        'best_params.pickle', inside `self.output_folder` when one is set
        (the folder is created if needed).
        """
        best_params = [param.get_value().copy() for param in self.params]
        if not self.output_folder:
            save_path = 'best_params.pickle'
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder, 'best_params.pickle')
        # Context manager guarantees the file is flushed and closed.
        with open(save_path, 'w') as handle:
            cPickle.dump(best_params, handle)

    def update_lr(self,
                  count,
                  update_type='annealed',
                  begin_anneal=500.,
                  min_lr=0.01,
                  decay_factor=1.2):
        """
        Recompute `self.lr` from `self.init_lr`.

        'annealed': init_lr * min(1, begin_anneal / count).
        'exponential': init_lr / decay_factor**count, floored at `min_lr`.
        Any other `update_type` leaves `self.lr` unchanged.
        """
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        elif update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor**count)
            self.lr = max(new_lr, min_lr)
示例#46
0
        def get_func(learn_discriminator, learn_generator):
            """
            Compile the update function for exactly one of the two players.

            Exactly one of the two flags must be truthy; it selects whose
            parameters are looked up in the updates, named and checked.
            Without a learning rule, plain SGD updates are built for all
            of `params`.
            """
            assert (learn_discriminator or learn_generator
                    ) and not (learn_discriminator and learn_generator)

            updates = OrderedDict()

            player = model.discriminator if learn_discriminator else model.generator
            cur_params = player.get_params()

            cur_grads = OrderedDict((p, grads[p]) for p in cur_params)

            # Give anonymous gradients a readable name and verify dtypes.
            for param in grads:
                grad_var = grads[param]
                if grad_var.name is None and cost_value is not None:
                    grad_var.name = ('grad(%(costname)s, %(paramname)s)' %
                                     {
                                         'costname': cost_value.name,
                                         'paramname': param.name
                                     })
                assert grad_var.dtype == param.dtype

            cur_lr_scalers = OrderedDict(
                (p, lr_scalers[p]) for p in cur_params if p in lr_scalers)

            log.info('Parameter and initial learning rate summary:')
            for param in cur_params:
                param_name = 'anon_param' if param.name is None else param.name
                lr = learning_rate.get_value() * cur_lr_scalers.get(param, 1.)
                log.info('\t' + param_name + ': ' + str(lr))

            if self.learning_rule:
                rule_updates = self.learning_rule.get_updates(
                    learning_rate, cur_grads, cur_lr_scalers)
                updates.update(rule_updates)
            else:
                # Use standard SGD updates with fixed learning rate.
                sgd_values = [p - learning_rate *
                              lr_scalers.get(p, 1.) * grads[p]
                              for p in params]
                updates.update(dict(safe_zip(params, sgd_values)))

            for param in cur_params:
                if updates[param].name is None:
                    updates[param].name = 'sgd_update(' + param.name + ')'

            # The model may rewrite (e.g. constrain) the proposed updates.
            model.modify_updates(updates)

            for param in cur_params:
                update = updates[param]
                if update.name is None:
                    update.name = 'censor(sgd_update(' + param.name + '))'
                for update_val in get_debug_values(update):
                    if np.any(np.isinf(update_val)):
                        raise ValueError("debug value of %s contains infs" %
                                         update.name)
                    if np.any(np.isnan(update_val)):
                        raise ValueError("debug value of %s contains nans" %
                                         update.name)

            with log_timing(log, 'Compiling sgd_update'):
                compiled = function(theano_args,
                                    updates=updates,
                                    name='sgd_update',
                                    on_unused_input='ignore',
                                    mode=self.theano_function_mode)
                return compiled
示例#47
0
    def rescale_dropout_fprop(self,
                              state_below,
                              default_input_include_prob=0.5,
                              input_include_probs=None,
                              default_input_scale=2.,
                              input_scales=None,
                              per_example=True):
        """
        Returns the output of the MLP, when applying dropout to the input and
        intermediate layers. Each input to each layer is randomly included or
        excluded for each example. The probability of inclusion is independent
        for each input and each example. Each layer uses
        `default_input_include_prob` unless that layer's name appears as a key
        in input_include_probs, in which case the input inclusion probability
        is given by the corresponding value.

        Each feature is also multiplied by a scale factor. The scale factor for
        each layer's input scale is determined by the same scheme as the input
        probabilities.

        Parameters
        ----------
        state_below : WRITEME
            The input to the MLP
        default_input_include_prob : WRITEME
        input_include_probs : WRITEME
        default_input_scale : WRITEME
        input_scales : WRITEME
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """

        warnings.warn("dropout doesn't use fixed_var_descr so it won't work "
                      "with algorithms that make more than one theano "
                      "function call per batch, such as BGD. Implementing "
                      "fixed_var descr could increase the memory usage "
                      "though.")

        if input_include_probs is None:
            input_include_probs = {}

        if input_scales is None:
            input_scales = {}

        self._validate_layer_names(list(input_include_probs.keys()))
        self._validate_layer_names(list(input_scales.keys()))

        theano_rng = MRG_RandomStreams(max(self.rng.randint(2**15), 1))

        dynamic_scale = OrderedDict()

        for layer in self.layers:
            layer_name = layer.layer_name

            if layer_name in input_include_probs:
                include_prob = input_include_probs[layer_name]
            else:
                include_prob = default_input_include_prob

            if layer_name in input_scales:
                scale = input_scales[layer_name]
            else:
                scale = default_input_scale

            state_below = self.apply_dropout(
                state=state_below,
                include_prob=include_prob,
                theano_rng=theano_rng,
                scale=scale,
                mask_value=layer.dropout_input_mask_value,
                input_space=layer.get_input_space(),
                per_example=per_example)
            if hasattr(layer, 'dynamic_scale'):
                dynamic_scale.update(layer.dynamic_scale(state_below))
            else:
                print 'skipping', layer.layer_name
            state_below = layer.fprop(state_below)

        return state_below, dynamic_scale
示例#48
0
class Training(PickleMixin, TheanoMixin):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 name,
                 data,
                 model,
                 optimizer,
                 cost,
                 outputs,
                 n_steps,
                 debug_print=0,
                 trainlog=None,
                 extension=None,
                 lr_iterations=None,
                 decay_schedule=2,
                 k_speedOfconvergence=40):
        """
        Store the training configuration and compile the training function.

        Parameters
        ----------
        name : object
            Stored as ``self.name``.
        data : iterable
            Iterated once per epoch by ``run_epoch``; every batch must be a
            tuple, because a sampling vector is appended to it with ``+``.
        model : object
            Must expose ``inputs``, ``updates``, ``params`` and ``nodes``
            (each node providing ``name`` and ``lr_scaler``).
        optimizer : object
            Receives the per-node ``lr_scalers`` mapping and is asked for
            the parameter updates via ``get_updates``.
        cost : object
            Expression differentiated with respect to the model parameters
            in ``build_training_graph``.
        outputs : object
            Normalised with ``tolist`` and passed on to the compiled
            function.
        n_steps : int
            Length of the per-batch sampling vector.
        debug_print : optional
            When truthy, the compiled function is dumped with Theano's
            ``debugprint``.
        trainlog : optional
            Existing training log; a fresh ``TrainLog()`` is created when
            this is ``None``.
        extension : iterable, optional
            Objects with a ``name`` attribute and an ``exe(trainer)``
            method, looked up by ``find_extension``.
        lr_iterations : mapping, optional
            Epoch threshold -> learning-rate value, read by ``run_epoch``.
        decay_schedule : optional
            Only stored by this method.
        k_speedOfconvergence : optional
            Stored as ``self.k``; it drives the ``schedRate`` decay that
            ``run_epoch`` computes.
        """
        # The trailing "pickled" notes are carried over from the original
        # author's annotations about which attributes get pickled.
        self.name = name  # pickled: yes
        self.data = data  # pickled: no
        self.model = model  # pickled: yes
        self.optimizer = optimizer  # pickled: no
        self.inputs = model.inputs  # pickled: no
        self.cost = cost  # pickled: yes
        self.outputs = tolist(outputs)  # pickled: no
        # Private copy, so the optimizer updates added while building the
        # graph do not leak into ``model.updates``.
        self.updates = OrderedDict()  # pickled: no
        self.updates.update(model.updates)  # pickled: ???
        self.extension = extension  # pickled: no
        self.debug_print = debug_print  # pickled: no

        lr_scalers = OrderedDict(
            (node.name, node.lr_scaler)
            for node in self.model.nodes)  # pickled: yes
        self.optimizer.lr_scalers = lr_scalers  # pickled: should
        self.nBernoulli = np.ones((n_steps, ))  # pickled: yes

        t0 = time.time()
        self.cost_fn = self.build_training_graph()  # pickled: no but should
        # Single-argument call form: prints the same text under Python 2
        # and is valid syntax under Python 3.
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:  # pickled: no
            from theano.printing import debugprint
            debugprint(self.cost_fn)

        self.trainlog = TrainLog() if trainlog is None else trainlog  # yes
        self.endloop = 0  # pickled: no
        self.lr_iterations = lr_iterations  # pickled: yes
        self.lastBatchlastPoch = 0  # pickled: yes
        self.decay_schedule = decay_schedule  # pickled: yes
        self.k = k_speedOfconvergence  # pickled: yes
        self.schedRate = 1  # pickled: yes
        self.n_steps = n_steps  # pickled: yes

    def restore(self,
                data,
                optimizer,
                cost,
                outputs,
                n_steps,
                debug_print=0,
                trainlog=None,
                extension=None,
                lr_iterations=None,
                decay_schedule=2,
                k_speedOfconvergence=40):
        """
        Re-attach runtime state to an existing trainer and recompile.

        Performs the same setup as ``__init__`` but reuses the ``name`` and
        ``model`` already present on the instance.
        NOTE(review): presumably meant to be called after unpickling a
        trainer -- confirm against the callers.

        Unlike ``__init__``, ``self.updates`` is bound directly to
        ``self.model.updates`` (no copy), so the optimizer updates written
        by ``build_training_graph`` are stored in the model's own mapping.

        Parameters
        ----------
        data : iterable
            Batches (tuples) iterated by ``run_epoch``.
        optimizer : object
            Receives ``lr_scalers`` and provides ``get_updates``.
        cost : object
            Expression differentiated in ``build_training_graph``.
        outputs : object
            Normalised with ``tolist``.
        n_steps : int
            Length of the per-batch sampling vector.
        debug_print : optional
            When truthy, the compiled function is dumped with Theano's
            ``debugprint``.
        trainlog : optional
            Existing training log; a fresh ``TrainLog()`` is created when
            this is ``None``.
        extension : iterable, optional
            Objects with ``name`` and ``exe(trainer)``.
        lr_iterations : mapping, optional
            Epoch threshold -> learning-rate value.
        decay_schedule : optional
            Only stored by this method.
        k_speedOfconvergence : optional
            Stored as ``self.k``.
        """
        self.data = data
        self.optimizer = optimizer
        self.inputs = self.model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        # Deliberately aliased rather than copied (the copying variant was
        # disabled by the original author).
        self.updates = self.model.updates
        self.extension = extension
        self.debug_print = debug_print

        self.optimizer.lr_scalers = OrderedDict(
            (node.name, node.lr_scaler) for node in self.model.nodes)
        self.nBernoulli = np.ones((n_steps, ))

        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        # Single-argument call form: prints the same text under Python 2
        # and is valid syntax under Python 3.
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)

        self.trainlog = TrainLog() if trainlog is None else trainlog
        self.endloop = 0
        self.lr_iterations = lr_iterations
        self.lastBatchlastPoch = 0
        self.decay_schedule = decay_schedule
        self.k = k_speedOfconvergence
        self.schedRate = 1
        self.n_steps = n_steps

    '''
    def restore(self,
                data,
                cost,
                model,
                optimizer,
                k_speedOfconvergence = 40):
        self.data = data
        self.cost = cost
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.cost_fn = self.build_training_graph()
        self.k = k_speedOfconvergence
    '''

    def build_training_graph(self):
        """
        Derive the parameter updates from the cost and compile them.

        Runs the ``ext_regularize_pre_grad``, ``ext_grad`` and
        ``ext_regularize_post_grad`` extensions around the gradient
        computation, stores the gradients in ``self.grads`` (parameter ->
        gradient), merges the optimizer's updates into ``self.updates`` and
        hands inputs, outputs and updates to ``build_theano_graph``.

        Returns
        -------
        object
            Whatever ``self.build_theano_graph`` returns; the callers in
            this class store it as ``self.cost_fn`` and call it per batch.
        """
        self.run_extension('ext_regularize_pre_grad')

        # Materialise the parameters once: the keys of ``self.grads`` and
        # the ``wrt`` argument are then guaranteed to be the same sequence,
        # and ``T.grad`` receives a real list even where ``dict.values()``
        # is a view (Python 3).
        params = list(self.model.params.values())
        self.grads = OrderedDict(zip(params, T.grad(self.cost, params)))

        self.run_extension('ext_grad')
        param_updates = self.optimizer.get_updates(self.grads)

        for key, val in param_updates.items():
            self.updates[key] = val

        self.run_extension('ext_regularize_post_grad')
        print(type(self.inputs), len(self.inputs))
        #self.inputs.append(self.nBernoulli)
        return self.build_theano_graph(self.inputs, self.outputs, self.updates)

    def run(self):
        """
        Run epochs back to back until ``run_epoch`` returns a falsy value.

        A log line is emitted before the first epoch and after the last.
        """
        logger.info("Entering main loop")
        keep_going = self.run_epoch()
        while keep_going:
            keep_going = self.run_epoch()
        logger.info("Terminating main loop")

    def run_epoch(self):
        """
        Train on every batch of ``self.data`` once, then update schedules.

        For each batch the ``ext_monitor`` and ``ext_save`` extensions run
        first, then the compiled ``cost_fn`` is called with the batch plus a
        freshly sampled 0/1 vector of length ``n_steps`` (each entry drawn
        with probability ``schedRate``), and finally ``ext_schedule`` runs.
        The elapsed time and the returned cost are appended to
        ``trainlog.monitor['time']`` and ``trainlog.monitor['update']``.

        After the last batch the epoch counter is incremented, ``schedRate``
        is recomputed as ``k / (k + exp(epoch / k))``, the learning rate is
        updated from ``lr_iterations`` and ``ext_term`` is run.

        Returns
        -------
        bool
            ``False`` when ``end_training()`` is truthy after this epoch
            (``ext_monitor`` and ``ext_save`` are run one last time in that
            case), ``True`` otherwise.
        """
        self.trainlog.lastBatchlastEpoch = self.trainlog.batch_seen

        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            batch_t0 = time.time()
            nBernoulli = np.asarray([
                np.random.binomial(1, self.schedRate)
                for _ in range(self.n_steps)
            ])
            nBernoulli = np.reshape(nBernoulli, (self.n_steps, ))
            # ``batch`` must be a tuple: the sampling vector is appended as
            # the last positional argument of the compiled function.
            batchAux = batch + (nBernoulli, )

            this_cost = self.cost_fn(*batchAux)
            self.trainlog.monitor['time'].append(time.time() - batch_t0)
            self.trainlog.monitor['update'].append(this_cost)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')

        self.trainlog.epoch_seen += 1
        epoch_seen = self.trainlog.epoch_seen
        self.schedRate = self.k / (self.k + exp(epoch_seen / float(self.k)))

        # ``lr_iterations`` defaults to None, so it has to be guarded.  When
        # several thresholds have already been passed, the highest one wins
        # regardless of the mapping's iteration order.
        if self.lr_iterations:
            passed = [limit for limit in self.lr_iterations
                      if limit < epoch_seen]
            if passed:
                self.optimizer.lr.set_value(self.lr_iterations[max(passed)])

        print("Epoch: {} - seched rate: {}".format(self.trainlog.epoch_seen,
                                                   self.schedRate))
        self.run_extension('ext_term')  # may change the value of endloop

        if self.end_training():
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            return False

        return True

    def find_extension(self, name):
        """
        Collect the registered extensions whose ``name`` equals `name`.

        Parameters
        ----------
        name : object
            Value compared against each extension's ``name`` attribute.

        Returns
        -------
        tuple
            ``(1, matches)`` when at least one extension matches,
            ``(0, [])`` when none does, and ``(0, None)`` when no usable
            extension container is set (``self.extension`` is missing,
            ``None`` or not iterable) or an entry has no ``name``.
        """
        registered = getattr(self, 'extension', None)
        if registered is None:
            return (0, None)

        # Only the failures the lookup is meant to tolerate are caught:
        # a non-iterable container or entries without ``name``.  Anything
        # else (KeyboardInterrupt, bugs inside an extension) propagates
        # instead of being silently swallowed.
        try:
            exts = [
                extension for extension in registered
                if extension.name == name
            ]
        except (TypeError, AttributeError):
            return (0, None)

        return (1 if exts else 0), exts

    def run_extension(self, name):
        """
        Execute every registered extension called `name`, in order.

        Each match returned by ``find_extension`` has its ``exe`` method
        invoked with this trainer as the only argument; nothing happens
        when the lookup reports no match.
        """
        found, matches = self.find_extension(name)
        if not found:
            return
        for extension in matches:
            extension.exe(self)

    def end_training(self):
        """
        Return the current ``endloop`` flag.

        ``run_epoch`` treats a truthy value as the signal to stop training.
        """
        stop_flag = self.endloop
        return stop_flag