Example #1
    def compile_run_fn(self):
        """
        Compile and set the `f_run` function used for `run()` to compute the model's outputs given its inputs.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = raise_to_list(self.get_outputs()),
                                      updates = self.get_updates(),
                                      name    = 'f_run')
        """
        if not hasattr(self, 'f_run'):
            log.debug("Compiling f_run...")
            t = time.time()
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                  outputs = outputs,
                                  updates = self.get_updates(),
                                  name    = 'f_run')
            log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
        else:
            log.warn('f_run already exists!')
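The pattern above compiles the Theano function lazily and caches it on the instance. A minimal usage sketch follows, under the assumption of a Model subclass exposing get_inputs()/get_outputs()/get_updates() as in the snippet; `model` and `data_batch` are hypothetical placeholders, not names from the example:

# Hypothetical usage sketch (placeholder names, not part of the example above)
model.compile_run_fn()               # compiles and caches self.f_run on first call
outputs = model.f_run(data_batch)    # positional args follow raise_to_list(model.get_inputs())
model.compile_run_fn()               # calling again is a no-op that only logs a warning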
Example #2
    def compile_run_fn(self):
        """
        Compile and set the `f_run` function used for `run()` to compute the model's outputs given its inputs.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = self.get_outputs(),
                                      updates = self.get_updates(),
                                      name    = 'f_run')

        Returns
        -------
        Theano function
            The compiled theano function for running the model.
        """
        if not getattr(self, 'f_run', None):
            log.debug("Compiling f_run...")
            t = time.time()
            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=self.get_outputs(),
                                  updates=self.get_updates(),
                                  name='f_run')
            log.debug("Compilation done. Took %s",
                      make_time_units_string(time.time() - t))
        else:
            log.debug('f_run already exists!')

        return self.f_run
Example #3
    def generate(self, initial=None, n_steps=None):
        """
        Generate visible inputs from the model for `n_steps` and starting at recurrent hidden state `initial`.

        Parameters
        ----------
        initial : tensor
            Recurrent hidden state to start generation from.
        n_steps : int
            Number of generation steps to do.

        Returns
        -------
        tuple(array_like, array_like)
            The generated inputs and the ending recurrent hidden states.
        """
        # compile the generate function!
        if not hasattr(self, 'f_generate'):
            log.debug("compiling f_generate...")
            self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                       outputs=[self.x_ts, self.u_t],
                                       updates=self.updates_generate)
            log.debug("compilation done!")

        # use `is not None` checks: `or` fails on array-valued `initial`
        initial = initial if initial is not None else self.u0.eval()
        n_steps = n_steps if n_steps is not None else self.generate_n_steps
        return self.f_generate(initial, n_steps)
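A hedged usage sketch of generate(); `rnn` is a placeholder for a trained instance, and both arguments are optional because they fall back to self.u0 and self.generate_n_steps:

# Hypothetical usage sketch (placeholder names)
samples, final_hiddens = rnn.generate(n_steps=50)       # first call compiles f_generate
more_samples, _ = rnn.generate(initial=final_hiddens)   # continue from the previous hidden state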
Example #4
File: model.py  Project: ml-lab/OpenDeep
    def compile_run_fn(self):
        """
        Compile and set the `f_run` function used for `run()` to compute the model's outputs given its inputs.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = raise_to_list(self.get_outputs()),
                                      updates = self.get_updates(),
                                      name    = 'f_run')
        """
        if not hasattr(self, 'f_run'):
            log.debug("Compiling f_run...")
            t = time.time()
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=outputs,
                                  updates=self.get_updates(),
                                  name='f_run')
            log.debug("Compilation done. Took %s",
                      make_time_units_string(time.time() - t))
        else:
            log.warn('f_run already exists!')
Example #5
    def generate(self, initial=None, n_steps=None):
        """
        Generate visible inputs from the model for `n_steps` and starting at recurrent hidden state `initial`.

        Parameters
        ----------
        initial : tensor
            Recurrent hidden state to start generation from.
        n_steps : int
            Number of generation steps to do.

        Returns
        -------
        tuple(array_like, array_like)
            The generated inputs and the ending recurrent hidden states.
        """
        # compile the generate function!
        if not hasattr(self, 'f_generate'):
            log.debug("compiling f_generate...")
            self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                       outputs=[self.x_ts, self.u_t],
                                       updates=self.updates_generate)
            log.debug("compilation done!")

        # use `is not None` checks: `or` fails on array-valued `initial`
        initial = initial if initial is not None else self.u0.eval()
        n_steps = n_steps if n_steps is not None else self.generate_n_steps
        return self.f_generate(initial, n_steps)
Example #6
    def compile_run_fn(self):
        """
        Compile and set the `f_run` function used for `run()` to compute the model's outputs given its inputs.

        It sets the `self.f_run` attribute to the f_run function.

        .. note::
            The run function defaults like so::

                self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                      outputs = self.get_outputs(),
                                      updates = self.get_updates(),
                                      name    = 'f_run')

        Returns
        -------
        Theano function
            The compiled theano function for running the model.
        """
        if not getattr(self, 'f_run', None):
            log.debug("Compiling f_run...")
            t = time.time()
            self.f_run = function(inputs  = raise_to_list(self.get_inputs()),
                                  outputs = self.get_outputs(),
                                  updates = self.get_updates(),
                                  name    = 'f_run')
            log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
        else:
            log.debug('f_run already exists!')

        return self.f_run
Example #7
def _compile_csl_fn():
    """
    BUG HERE: not handling chains properly (still has the bug, I don't see it).
    This takes too much GPU memory.

    means: N (# of chains) x K (samples per chain) x D (data dim)
    minibatch: M (# of examples) x D (data dim)
    M x N matrix where each element is the LL of one example against one chain.

    This function is for computing CSL over parallel chains of minibatches.

    Returns
    -------
    theano function
        Function computing M * N matrix where each element is LL of one example against one chain.
    """
    # when means is a 3D tensor (N, K, D)
    # When there are N chains, each chain having K samples of dimension D
    log.debug('building theano fn for Bernoulli CSL')
    means = T.tensor3('chains')
    minibatch = T.matrix('inputs')

    # how many chains CSL average over
    N = 5
    # minibatch size
    M = 10
    # data dim
    D = 784
    minibatch.tag.test_value = as_floatX(numpy.random.binomial(1, 0.5, size=(M, D)))
    # chain length
    K = 100
    means.tag.test_value = as_floatX(numpy.random.uniform(size=(N, K, D)))

    # computing LL

    # the length of each chain
    sample_size = means.shape[1]

    _minibatch = minibatch.dimshuffle(0, 'x', 'x', 1)
    _means = means.dimshuffle('x', 0, 1, 2)

    A = T.log(sample_size)
    B = _minibatch * T.log(_means) + (1. - _minibatch) * T.log(1. - _means)
    C = B.sum(axis=3)
    D = log_sum_exp_theano(C, axis=2)
    E = D - A
    # G = E.mean(axis=1)
    f = function(
        inputs=[minibatch, means],
        outputs=E,
        name='CSL_independent_bernoulli_fn'
    )
    return f
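To make the shapes and the log-sum-exp step concrete, here is a self-contained NumPy sketch of the same per-chain Bernoulli CSL term that the Theano graph above builds. It is an independent re-check of the math, not code from the project; the epsilon clipping is an added assumption to keep the logs finite:

import numpy

def bernoulli_csl_per_chain(minibatch, means, eps=1e-10):
    """minibatch: (M, D) binary data; means: (N, K, D) chain samples.
    Returns the (M, N) matrix of log((1/K) * sum_k p(x_m | mu_{n,k}))."""
    means = numpy.clip(means, eps, 1.0 - eps)   # assumption: clip to avoid log(0)
    x = minibatch[:, None, None, :]             # (M, 1, 1, D)
    mu = means[None, :, :, :]                   # (1, N, K, D)
    log_p = (x * numpy.log(mu) + (1.0 - x) * numpy.log(1.0 - mu)).sum(axis=3)  # (M, N, K)
    # numerically stable log-sum-exp over the K samples of each chain, minus log K
    m = log_p.max(axis=2, keepdims=True)
    lse = m[:, :, 0] + numpy.log(numpy.exp(log_p - m).sum(axis=2))
    return lse - numpy.log(means.shape[1])

rng = numpy.random.RandomState(0)
x = rng.binomial(1, 0.5, size=(10, 784)).astype('float32')   # M=10, D=784
mu = rng.uniform(size=(5, 100, 784)).astype('float32')       # N=5, K=100
print(bernoulli_csl_per_chain(x, mu).shape)                   # -> (10, 5)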
Example #8
def _compile_csl_fn():
    """
    BUG HERE: not handling chains properly (still has the bug, I don't see it).
    This takes too much GPU memory.

    means: N (# of chains) x K (samples per chain) x D (data dim)
    minibatch: M (# of examples) x D (data dim)
    M x N matrix where each element is the LL of one example against one chain.

    This function is for computing CSL over parallel chains of minibatches.

    Returns
    -------
    theano function
        Function computing M * N matrix where each element is LL of one example against one chain.
    """
    # when means is a 3D tensor (N, K, D)
    # When there are N chains, each chain having K samples of dimension D
    log.debug('building theano fn for Bernoulli CSL')
    means = T.tensor3('chains')
    minibatch = T.matrix('inputs')

    # how many chains CSL average over
    N = 5
    # minibatch size
    M = 10
    # data dim
    D = 784
    minibatch.tag.test_value = as_floatX(
        numpy.random.binomial(1, 0.5, size=(M, D)))
    # chain length
    K = 100
    means.tag.test_value = as_floatX(numpy.random.uniform(size=(N, K, D)))

    # computing LL

    # the length of each chain
    sample_size = means.shape[1]

    _minibatch = minibatch.dimshuffle(0, 'x', 'x', 1)
    _means = means.dimshuffle('x', 0, 1, 2)

    A = T.log(sample_size)
    B = _minibatch * T.log(_means) + (1. - _minibatch) * T.log(1. - _means)
    C = B.sum(axis=3)
    D = log_sum_exp_theano(C, axis=2)
    E = D - A
    # G = E.mean(axis=1)
    f = function(inputs=[minibatch, means],
                 outputs=E,
                 name='CSL_independent_bernoulli_fn')
    return f
Example #9
    def run(self, input):
        """
        This method returns the Prototype's output (run through the `f_run` function) given an input. The input
        corresponds to the unique inputs of the models in the Prototype as calculated by `get_inputs()`, and the
        outputs are computed similarly from `get_outputs()`.

        To avoid re-compiling the theano function created for run, check `hasattr(self, 'f_run')` (or
        something similar) first.

        Parameters
        ----------
        input: array_like
            Theano/numpy tensor-like object that is the input into the model's computation graph.

        Returns
        -------
        array_like
            Theano/numpy tensor-like object that is the output of the model's computation graph.
        """
        # set the noise switches off for running! we assume unseen data is noisy anyway :)
        old_switch_vals = []
        if len(self.get_switches()) > 0:
            log.debug("Turning off %s noise switches, resetting them after run!", str(len(self.get_switches())))
            old_switch_vals = [switch.get_value() for switch in self.get_switches()]
            [switch.set_value(0.) for switch in self.get_switches()]

        # make sure the input is raised to a list - we are going to splat it!
        input = raise_to_list(input)
        # first check if we already made an f_run function
        if hasattr(self, 'f_run'):
            output = self.f_run(*input)
        # otherwise, compile it!
        else:
            inputs = raise_to_list(self.get_inputs())
            outputs = raise_to_list(self.get_outputs())
            if outputs is not None and len(outputs) == 1:
                outputs = outputs[0]
            updates = self.get_updates()
            t = time.time()
            log.info("Compiling f_run...")
            self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
            log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
            output = self.f_run(*input)

        # reset any switches to how they were!
        if len(self.get_switches()) > 0:
            [switch.set_value(val) for switch, val in zip(self.get_switches(), old_switch_vals)]

        return output
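A hedged usage sketch of run(); `proto` stands in for an already-constructed Prototype and `batch` for a numpy array (or list of arrays) matching the graph's inputs, neither of which comes from the code above:

# Hypothetical usage sketch (placeholder names)
predictions = proto.run(batch)   # first call compiles f_run; noise switches are zeroed, then restored
predictions = proto.run(batch)   # later calls reuse the cached f_run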
Example #10
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with
    a vector mu, mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function computing the Bernoulli CSL log likelihood.
    """
    #
    log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    mu = mu[None, :, :]
    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)

    k = mu.shape[1]
    D = mu.shape[2]

    # there are two terms in the log(p(x|mu))

    term_1 = -T.log(k)
    c = T.sum(x.dimshuffle(0, 'x', 1) * inner_1 +
              (1. - x.dimshuffle(0, 'x', 1)) * inner_2,
              axis=2)
    debug = c.sum(axis=1)
    term_2 = log_sum_exp_theano(c, axis=1)

    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
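Written out, the estimate the function above computes is the Monte-Carlo average over the K mean vectors mu_k of independent Bernoulli likelihoods, evaluated in log space. This restates term_1 + term_2 from the code rather than adding anything to it:

\log \hat{p}(x)
    = \log \frac{1}{K} \sum_{k=1}^{K} \prod_{d=1}^{D} \mu_{k,d}^{\,x_d} \, (1 - \mu_{k,d})^{1 - x_d}
    = -\log K + \operatorname{logsumexp}_{k} \sum_{d=1}^{D} \big[ x_d \log \mu_{k,d} + (1 - x_d) \log(1 - \mu_{k,d}) \big]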
Example #11
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with
    a vector mu, mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function computing the Bernoulli CSL log likelihood.
    """
    #
    log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    mu = mu[None, :, :]
    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)

    k = mu.shape[1]
    D = mu.shape[2]

    # there are two terms in the log(p(x|mu))

    term_1 = -T.log(k)
    c = T.sum(x.dimshuffle(0, 'x', 1) * inner_1 +
              (1. - x.dimshuffle(0, 'x', 1)) * inner_2,
              axis=2)
    debug = c.sum(axis=1)
    term_2 = log_sum_exp_theano(c, axis=1)

    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
Example #12
    def generate(self, initial=None, n_steps=None):
        """
        Generate visible inputs from the model for n_steps, starting at recurrent hidden state initial.

        :param initial: recurrent hidden state to start generation from
        :type initial: tensor

        :param n_steps: number of generation steps to do
        :type n_steps: int

        :return: the generated inputs and the ending recurrent hidden state
        :rtype: matrix, matrix
        """
        # compile the generate function!
        if not hasattr(self, 'f_generate'):
            self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                       outputs=[self.v_ts, self.u_t],
                                       updates=self.updates_generate)

        # use `is not None` checks: `or` fails on array-valued `initial`
        initial = initial if initial is not None else self.u0.eval()
        n_steps = n_steps if n_steps is not None else self.generate_n_steps
        return self.f_generate(initial, n_steps)
    def build_computation_graph(self):
        #################
        # Build the GSN #
        #################
        log.debug("Building GSN graphs...")

        # GSN for training - with noise specified in initialization
        # if there is no hiddens_hook, build the GSN normally using the input X
        if not self.hiddens_flag:
            p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)

        # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
        # generative from the hiddens
        else:
            p_X_chain, _, = self.build_gsn(hiddens=self.hiddens,
                                           add_noise=self.add_noise,
                                           reverse=True)

        # GSN for prediction - same as above but no noise
        # deal with hiddens_hook exactly as above.
        if not self.hiddens_flag:
            p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
        else:
            p_X_chain_recon, recon_hiddens = self.build_gsn(
                hiddens=self.hiddens, add_noise=False, reverse=True)

        ####################
        # Costs and output #
        ####################
        log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
        # use the noisy ones for training cost
        costs = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain
        ]
        self.show_cost = costs[-1]  # for a monitor to show progress
        # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH
        cost = numpy.sum(costs)

        # use the non-noisy graph for prediction
        gsn_costs_recon = [
            self.cost_function(output=rX, target=self.X, **self.cost_args)
            for rX in p_X_chain_recon
        ]
        # another monitor, same as self.show_cost but on the non-noisy graph.
        self.monitor = gsn_costs_recon[-1]
        # this should be considered the main output of the computation, the sample after the
        # last walkback from the non-noisy graph.
        output = p_X_chain_recon[-1]
        # these should be considered the model's hidden representation - the hidden representation after
        # the last walkback from the non-noisy graph.
        hiddens = recon_hiddens

        train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
        train_mse = T.mean(train_mse)

        mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
        mse = T.mean(mse)

        monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                                ('recon_cost', self.monitor), ('mse', mse),
                                ('train_mse', train_mse)])

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.matrix("X_sampling")
        self.network_state_input = [X_sample] + [
            T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)
        ]

        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []

        # ONE update
        log.debug("Performing one walkback in network state sampling.")
        self.update_layers(self.network_state_output,
                           visible_pX_chain,
                           add_noise=True,
                           reverse=False)

        #####################################################
        #     Create the run and monitor functions      #
        #####################################################
        log.debug("Compiling functions...")
        t = time.time()

        # doesn't make sense to have this if there is a hiddens_hook
        if not self.hiddens_flag:
            # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
            # computation graph
            log.debug("f_run...")
            self.f_run = function(inputs=[self.X],
                                  outputs=output,
                                  name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X],
                                outputs=self.input_noise(self.X),
                                name='gsn_f_noise')

        # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
        # or visualization)
        log.debug("f_sample...")
        if self.layers == 1:
            self.f_sample = function(inputs=[X_sample],
                                     outputs=visible_pX_chain[-1],
                                     name='gsn_f_sample_single_layer')
        else:
            # WHY IS THERE A WARNING????
            # because the first odd layers are not used -> directly computed FROM THE EVEN layers
            # unused input = warn
            self.f_sample = function(inputs=self.network_state_input,
                                     outputs=self.network_state_output +
                                     visible_pX_chain,
                                     name='gsn_f_sample')

        log.debug("GSN compiling done. Took %s",
                  make_time_units_string(time.time() - t))

        return cost, monitors, output, hiddens
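A hedged sketch of how the compiled GSN functions above are typically used together at prediction time; `gsn` and `batch` are placeholders, and it assumes the model was built without a hiddens_hook so that f_run exists:

# Hypothetical usage sketch (placeholder names)
noisy = gsn.f_noise(batch)   # corrupt the clean inputs, as the helper's comment above suggests
recon = gsn.f_run(noisy)     # reconstruction: last sample of the non-noisy walkback chain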
Example #14
    def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        additional_cost : theano expression or list(theano expression), optional
            Any additional cost expressions to use during training (things like regularization). These will be summed
            with the existing cost.
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #####################################################
        # handle additional costs (normally regularization) #
        #####################################################
        # Create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        # deal with any other additional costs (like regularization, etc.)
        if additional_cost is not None:
            additional_costs = raise_to_list(additional_cost)
            if len(additional_costs) > 1:
                additional_cost = T.sum(additional_costs)

        #########################
        # gradients and updates #
        #########################
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            if len(train_costs) > 1 and additional_cost is not None:
                log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
                warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
            # TODO: additional_cost will double count with gradients during layer-wise pretraining.
            # Need to somehow make w.r.t. params appropriate for the individual training costs.
            gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
            # clip gradients if we want.
            gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
            # append to list
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
        train_functions = []
        for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + list(self.train_monitors_dict.values()),
                               name='f_learn_%d' % i)

            log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

            self.STOP = False
            self.epoch_counter = 0
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
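A hedged usage sketch of the training entry point above; the AdaDelta name and the constructor arguments are assumptions for illustration (any Optimizer subclass initialized with a model, as the error message above suggests, would do):

# Hypothetical usage sketch (names and constructor arguments are assumptions)
optimizer = AdaDelta(model=model, dataset=dataset)
optimizer.train()   # loops over epochs until early stopping; KeyboardInterrupt saves and exits gracefully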
Example #15
    def _build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227,
                                                      227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((
            self.batch_size,
            96,
            27,
            27,
        ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13),
                         convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13),
                         convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer6.get_noise_switch()

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)

        # Add this layer's parameters!
        self.params += fc_layer7.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer7.get_noise_switch()

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096,
                                                   fc_layer7.get_outputs()),
                                      output_size=1000,
                                      **softmax_config)

        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()
        self.targets = softmax_layer8.get_targets()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8.negative_log_likelihood()
        cost = softmax_layer8.negative_log_likelihood()
        errors = softmax_layer8.errors()
        train_errors = softmax_layer8.errors()

        self.monitors = OrderedDict([('cost', cost), ('errors', errors),
                                     ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_run...")
        # use the actual argmax from the classification
        self.f_run = function(inputs=[self.x],
                              outputs=softmax_layer8.get_argmax_prediction())
        log.debug("compilation took %s",
                  make_time_units_string(time.time() - t))
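A hedged sketch of calling the compiled prediction function above; `net` is a placeholder for a built instance, and it assumes flag_datalayer is off so the graph expects 227x227 bc01 crops directly:

# Hypothetical usage sketch (placeholder names; random data only illustrates the expected shape)
import numpy
batch = numpy.random.rand(net.batch_size, 3, 227, 227).astype('float32')   # bc01 minibatch
class_ids = net.f_run(batch)   # argmax over the 1000-way softmax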
Example #16
    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
Example #17
    def _build_computation_graph(self):
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(input=self.x,
                                          image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                          cropsize=227,
                                          rand=self.rand,
                                          flag_rand=self.rand_crop)
        else:
            layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                        filter_shape=(96, 3, 11, 11),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27, ), convpool_layer1.get_outputs()),
                                        filter_shape=(256, 96, 5, 5),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        local_response_normalization=True)
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                        filter_shape=(384, 256, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=1,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                        filter_shape=(384, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.1,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                        filter_shape=(256, 384, 3, 3),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        local_response_normalization=False)
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }
        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer6.get_switches()

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                          output_size=4096,
                          noise='dropout',
                          noise_level=0.5,
                          **fc_config)

        # Add this layer's parameters!
        self.params += fc_layer7.get_params()
        # Add the dropout noise switch
        self.noise_switches += fc_layer7.get_switches()

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                      output_size=1000,
                                      **softmax_config)

        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()
        self.targets = softmax_layer8.get_targets()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8.negative_log_likelihood()
        cost = softmax_layer8.negative_log_likelihood()
        errors = softmax_layer8.errors()
        train_errors = softmax_layer8.errors()

        self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_run...")
        # use the actual argmax from the classification
        self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("compilation took %s", make_time_units_string(time.time() - t))
Example #18
    def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        additional_cost : theano expression or list(theano expression), optional
            Any additional cost expressions to use during training (things like regularization). These will be summed
            with the existing cost.
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #####################################################
        # handle additional costs (normally regularization) #
        #####################################################
        # Create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        # deal with any other additional costs (like regularization, etc.)
        if additional_cost is not None:
            additional_costs = raise_to_list(additional_cost)
            if len(additional_costs) > 1:
                additional_cost = T.sum(additional_costs)

        #########################
        # gradients and updates #
        #########################
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            if len(train_costs) > 1 and additional_cost is not None:
                log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
                warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
            # TODO: additional_cost will double count with gradients during layer-wise pretraining.
            # Need to somehow make w.r.t. params appropriate for the individual training costs.
            gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
            # clip gradients if we want.
            gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
            # append to list
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
        train_functions = []
        for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                     str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + list(self.train_monitors_dict.values()),
                               name='f_learn_%d' % i)

            log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created during __init__()
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

            self.STOP = False
            self.epoch_counter = 0
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))
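A minimal sketch of how the `additional_cost` hook above might be used. The `model` and `optimizer` names are hypothetical (and `get_params()` is assumed to return a list of shared variables); the L2 term is simply summed with the model's own train cost inside train().

import theano.tensor as T

# Hypothetical: an L2 weight penalty passed through additional_cost.
# `model` and `optimizer` are assumed to already exist; names are illustrative only.
l2_penalty = 0.0001 * sum(T.sum(p ** 2) for p in model.get_params())
optimizer.train(additional_cost=l2_penalty)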
Example #19
0
    def train(self, monitor_channels=None, train_outservice=None, plot=None):
        """
        This method performs the training.
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can interrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(list(self.params.values()), gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed  = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict  = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict  = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            for best_param, param_value in self.best_params.items():
                self.params[best_param].set_value(param_value, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def build_computation_graph(self):
        #################
        # Build the GSN #
        #################
        log.debug("Building GSN graphs...")

        # GSN for training - with noise specified in initialization
        # if there is no hiddens_hook, build the GSN normally using the input X
        if not self.hiddens_flag:
            p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)

        # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
        # generative from the hiddens
        else:
            p_X_chain, _ = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

        # GSN for prediction - same as above but no noise
        # deal with hiddens_hook exactly as above.
        if not self.hiddens_flag:
            p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
        else:
            p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

        ####################
        # Costs and output #
        ####################
        log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
        # use the noisy ones for training cost
        costs          = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
        self.show_cost = costs[-1]  # for a monitor to show progress
        cost           = numpy.sum(costs)  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

        # use the non-noisy graph for prediction
        gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain_recon]
        # another monitor, same as self.show_cost but on the non-noisy graph.
        self.monitor    = gsn_costs_recon[-1]
        # this should be considered the main output of the computation, the sample after the
        # last walkback from the non-noisy graph.
        output     = p_X_chain_recon[-1]
        # these should be considered the model's hidden representation - the hidden representation after
        # the last walkback from the non-noisy graph.
        hiddens    = recon_hiddens

        train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
        train_mse = T.mean(train_mse)

        mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
        mse = T.mean(mse)

        monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                                ('recon_cost', self.monitor),
                                ('mse', mse),
                                ('train_mse', train_mse)])

        ############
        # Sampling #
        ############
        # the input to the sampling function
        X_sample = T.matrix("X_sampling")
        self.network_state_input = [X_sample] + [T.matrix("H_sampling_"+str(i+1)) for i in range(self.layers)]
       
        # "Output" state of the network (noisy)
        # initialized with input, then we apply updates
        self.network_state_output = [X_sample] + self.network_state_input[1:]
        visible_pX_chain = []
    
        # ONE update
        log.debug("Performing one walkback in network state sampling.")
        self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

        #######################################################
        #        Create the run and monitor functions         #
        #######################################################
        log.debug("Compiling functions...")
        t = time.time()

        # doesn't make sense to have this if there is a hiddens_hook
        if not self.hiddens_flag:
            # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
            # computation graph
            log.debug("f_run...")
            self.f_run = function(inputs  = [self.X],
                                  outputs = output,
                                  name    = 'gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs  = [self.X],
                                outputs = self.input_noise(self.X),
                                name    = 'gsn_f_noise')

        # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
        # or visualization)
        log.debug("f_sample...")
        if self.layers == 1: 
            self.f_sample = function(inputs  = [X_sample],
                                     outputs = visible_pX_chain[-1],
                                     name    = 'gsn_f_sample_single_layer')
        else:
            # Theano will emit an unused-input warning here: the odd layers are not
            # actually used as inputs because they are computed directly from the even layers.
            self.f_sample = function(inputs  = self.network_state_input,
                                     outputs = self.network_state_output + visible_pX_chain,
                                     name    = 'gsn_f_sample')

        log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

        return cost, monitors, output, hiddens
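A short usage sketch for the compiled GSN functions, assuming a trained instance `gsn` of the class above and a numpy batch `x_batch` (both hypothetical): f_noise applies the same input corruption used during training, and f_run reconstructs through the non-noisy graph.

# Hypothetical names: `gsn` is a built/trained instance of the class above,
# `x_batch` a float32 numpy array matching self.X.
noisy_x = gsn.f_noise(x_batch)        # apply input corruption
reconstruction = gsn.f_run(noisy_x)   # denoise through the non-noisy GSN graph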