Пример #1
0
def project_to_feature_space(rbm, train_set):
    """
    Projects a train set through one layer, or through several layers in
    sequence, and returns the resulting hidden unit activations.

    Parameters
    ----------

    rbm : object or iterable of objects
        a single layer, or an iterable of layers. Each layer has to provide
        an `activation_h` method. An object without `__iter__` is treated as
        a single layer.

    train_set : GPU_Batch or array-like
        the data to project. Anything which is not a GPU_Batch is wrapped
        with `GPU_Batch(train_set, 1)`.

    Returns
    -------

    the activations of the last layer, with the results of all batches
    stacked along the first axis (None, if no batch was yielded; the
    unmodified train_set, if `rbm` is an empty iterable)

    """
    if not hasattr(rbm, '__iter__'):
        rbm = (rbm, )

    curr_train_set = train_set

    for curr_rbm in rbm:
        X = T.matrix()
        func = theano.function([X], curr_rbm.activation_h(X))

        if isinstance(curr_train_set, GPU_Batch):
            gpu_batch = curr_train_set
        else:
            gpu_batch = GPU_Batch(curr_train_set, 1)

        # Collect the per-batch activations and stack them once at the end.
        # This avoids comparing an ndarray against None (which is an
        # elementwise comparison and makes the `if` ambiguous) and avoids
        # re-copying the accumulated array for every batch.
        batch_acts = [
            func(ensure_ndarray(curr_batch)) for curr_batch in gpu_batch
        ]

        # The output of this layer is the input of the next one.
        if not batch_acts:
            curr_train_set = None
        elif len(batch_acts) == 1:
            # A single batch is passed on as is (no copy, no reshaping).
            curr_train_set = batch_acts[0]
        else:
            curr_train_set = np.vstack(batch_acts)

    return curr_train_set
Пример #2
0
    def load_files(self, filenames, clear=False):
        """Load data via `self.file_loader` and store it internally.

        Every (instance_set, label) pair yielded by the file loader is
        converted by each viewpoint's `raw_to_repr` method; the results are
        joined with `np.hstack` and appended to `self.data`, the label is
        appended to `self.labels`.

        Parameters
        ----------

        filenames : object
            the filenames to read data from; handed to `self.file_loader`
            unchanged

        clear : bool
            if True, discard data that was loaded by earlier calls to
            `load_files` before loading

        """
        if clear:
            self.data, self.labels = [], []

        for instance_set, label in self.file_loader(filenames):
            per_viewpoint = [
                ensure_ndarray(viewpoint.raw_to_repr(instance_set, label))
                for viewpoint in self.viewpoints
            ]

            self.data.append(np.hstack(per_viewpoint))
            self.labels.append(label)
Пример #3
0
    def convert_files(self, filenames, folder_tmp):
        """
        Converts the items yielded by the file loader to input representation
        (using the viewpoints' 'raw_to_repr' methods) and saves each result as
        a '.lrn2.npy' file in folder_tmp. If self.use_labels is set, a
        '_labels.lrn2.npy' file is written alongside. Items whose data file
        already exists are not converted again.

        Returns the list of data file paths, one entry per loaded item.
        """
        LOGGER.info("converting files...")
        converted = []
        loaded = self.file_loader(filenames)
        for idx, (instance_set, label) in enumerate(loaded):
            # The output name is derived from the input file name; if that
            # fails for any reason, the label is used instead.
            try:
                stem = os.path.basename(filenames[idx][0].split('\t')[0])
                stem = stem.split('.')[0]
            except Exception as err:
                # A TypeError is tolerated silently, everything else is
                # logged; both cases fall back to the label.
                if not isinstance(err, TypeError):
                    LOGGER.exception(err)
                stem = str(label).split('.')[0]

            data_path = os.path.join(folder_tmp, stem + ".lrn2.npy")
            labels_path = os.path.join(folder_tmp, stem + "_labels.lrn2.npy")

            if self.verbose:
                LOGGER.info("processing file {0}...".format(data_path))

            if os.path.isfile(data_path):
                if self.verbose:
                    LOGGER.info(
                        "Skipping conversion - file {0} already exists.".
                        format(data_path))
                converted.append(data_path)
                continue

            per_viewpoint = [
                ensure_ndarray(viewpoint.raw_to_repr(instance_set, label))
                for viewpoint in self.viewpoints
            ]

            # With a single viewpoint its data is stored as is.
            if len(self.viewpoints) > 1:
                repr_data = np.hstack(per_viewpoint)
            else:
                repr_data = per_viewpoint[0]

            if self.use_labels:
                # one label entry per row of the converted data
                labels = [label] * len(repr_data)

            if self.verbose:
                LOGGER.info("saving file {0}...".format(data_path))

            np.save(data_path, repr_data)
            if self.use_labels:
                np.save(labels_path, labels)

            converted.append(data_path)

        return converted
Пример #4
0
def get_similarity_matrix(rbm, train_set):
    """
    Projects a (ordered, sequential) train set into the feature space and
    calculates a similarity matrix on the resulting hidden unit activations.

    Parameters
    ----------

    rbm : object
        the layer used for the projection; has to provide `activation_h`

    train_set : object
        the data to project; converted with `ensure_gpu_batch`

    Returns
    -------

    the result of `similarity_matrix` on the activations, converted to an
    array and multiplied by -1

    """
    X = T.matrix()
    project = theano.function([X], rbm.activation_h(X))

    gpu_batch = ensure_gpu_batch(train_set)

    # Collect the activations per batch and join them once. Comparing the
    # accumulated ndarray with `== None` is elementwise and must not be used
    # as a condition.
    batch_acts = [
        project(ensure_ndarray(curr_batch)) for curr_batch in gpu_batch
    ]

    if not batch_acts:
        # Nothing was yielded; as before, None is handed on.
        h_act = None
    elif len(batch_acts) == 1:
        h_act = batch_acts[0]
    else:
        # Batches are stacked along the first axis (as done in
        # project_to_feature_space). The former np.hstack joined them along
        # the second axis instead.
        h_act = np.vstack(batch_acts)

    sim_matrix = similarity_matrix(h_act, h_act)

    return np.array(sim_matrix) * -1
Пример #5
0
def get_diff(rbm, train_set):
    """
    Calculates the difference of consecutive hidden unit activations based
    on an (ordered) train set which represents sequential data.

    Parameters
    ----------

    rbm : object
        the layer used for the projection; has to provide `activation_h`

    train_set : object
        the data to project; converted with `ensure_gpu_batch`

    Returns
    -------

    list with the norm (`np.linalg.norm`) of the difference between each
    activation and its successor along the first axis; the list is empty
    if there are fewer than two activations

    """
    X = T.matrix()
    project = theano.function([X], rbm.activation_h(X))

    gpu_batch = ensure_gpu_batch(train_set)

    # Collect the activations per batch and join them once. Comparing the
    # accumulated ndarray with `== None` is elementwise and must not be used
    # as a condition.
    batch_acts = [
        project(ensure_ndarray(curr_batch)) for curr_batch in gpu_batch
    ]

    if not batch_acts:
        # No activations, hence no consecutive pairs.
        return []

    if len(batch_acts) == 1:
        h_act = batch_acts[0]
    else:
        # Batches are stacked along the first axis, which is the axis the
        # differences below are taken over. The former np.hstack joined them
        # along the second axis instead.
        h_act = np.vstack(batch_acts)

    diff = [
        np.linalg.norm(h_act[i] - h_act[i + 1])
        for i in range(h_act.shape[0] - 1)
    ]

    return diff
Пример #6
0
    def get_data_callback(self, batch_nr, batch_size=10, kind="data"):
        """
        Assembles the batch with number batch_nr from the opened files.

        Starting at position batch_nr * batch_size of self.idx, up to
        batch_size blocks of length self.get_block_size() are fetched with
        self.get_block and passed through self.preprocess.

        Returns None (and sets self.last_perc to -1) if batch_nr is greater
        than the number of batches, an empty list if self.convolutional is
        set and the batch is not full, otherwise the blocks converted with
        ensure_ndarray.
        """
        assert kind in ("data",
                        "labels"), "'kind' has to be 'data' or 'labels'"
        assert len(
            self.files_open) > 0, "Call 'open_files()' before accessing data."

        n_batches = self.n_batches(batch_size)
        if self.verbose:
            LOGGER.debug("Batch {0}/{1}".format(batch_nr, n_batches))

        block_size = self.get_block_size()
        assert n_batches > 0, (
            "Instance count ({0}) has to be > batch size * step_width ({1} * {2}). Hint: You need to define a batch-size for the max-pooling layer."
            .format(self.instance_count, batch_size, self.step_width))

        if self.idx is None:
            self.reset_idx()

        # Past the last batch: signal the end to the caller.
        if batch_nr > n_batches:
            self.last_perc = -1
            return None

        blocks = []
        pos = batch_nr * batch_size
        while len(blocks) < batch_size and pos < len(self.idx):
            start = self.idx[pos]
            raw_block = self.get_block(start, start + block_size, kind=kind)
            blocks.append(self.preprocess(raw_block))
            pos += 1

        if self.convolutional and len(blocks) < batch_size:
            # Return only full batches
            return []
        return ensure_ndarray(blocks)
Пример #7
0
def train(net,
          data,
          batch_size=200,
          epochs=500,
          learning_rate=1e-4,
          reduce_lr=False,
          momentum=0.0,
          validation=None,
          out_dir='.',
          img_interval=-1,
          dump_interval=-1,
          tile_fun=lambda x: x,
          exclude=[],
          plot_zero_epoch=True,
          grad_clip=None,
          grad_norm_clip=None,
          mode='default',
          nan_protection=False,
          **kwargs):
    """
    Trains a single layer or a stack with backpropagation using the cost of the
    input net.

    Parameters
    ----------

    net : FFBase (or derivatives)
        the net (or stack) to train

    data : dict of array-likes or None
        Dictionary of 2D or 4D arrays to set the values of the variables of the cost
        (has to match the order of self.variables).
        Convention: data['input'] is main input data, data['target'] is main target data
        If data == None, the net has to have a callback function for the
        notification event 'get_data'.

    batch_size : int
        the mini-batch size for training. if data == None, batch_size does
        not matter.

    epochs : int
        number of epochs to train the net

    learning_rate : float
        learning rate (initial, can be reduced during training by setting
        reduce_lr = True)

    reduce_lr : boolean, optional
        If True, learning rate will be reduced linearly towards 0 during
        training

    momentum : float, optional
        the momentum

    validation : dict, optional
        validation set; dict of 2D or 4D arrays.
        data to which variables will be bound to.

    out_dir : string
        the output folder, where training logs and plots will be written to.
        If None, no plots are sent.

    img_interval : int, optional
        interval of epochs where images should be plotted to the output
        folder. Values <= 0 disable the periodic plots (same convention as
        dump_interval).

    dump_interval : int, optional
        interval of epochs where the whole network should be dumped to the
        output folder. Values <= 0 disable dumping.

    tile_fun : function, optional
        takes an input matrix and creates a list of matrixes with the shape of
        the desired plots (how to tile the input data). If None, dummy_tiler
        is used.

    exclude : list (of parameters)
        1D list of parameters to exclude from parameter updates (matched by
        identity). The list is only read, never modified.

    plot_zero_epoch : boolean, optional
        If True (and out_dir is not None), a plot is sent before the first
        training step when training starts in epoch 0.

    grad_clip, grad_norm_clip, mode, nan_protection : optional
        handed to the Optimizer unchanged

    **kwargs :
        accepted, but not used by this function

    Returns
    -------

    the (trained) net

    """

    assert data is not None or len(net.callbacks[Notifier.GET_DATA]) > 0, \
        "Either set the data parameter, and/or register a 'get_data' callback."

    LOGGER.info("\nTrain {0}: {1}...".format(net.name, type(net)))
    net.notify(Notifier.TRAINING_START)

    if tile_fun is None:
        tile_fun = dummy_tiler

    # Parameters are excluded by identity; the ids are collected once
    # instead of once per parameter.
    excluded_ids = set(id(e) for e in exclude)
    params = [p for p in net.params if id(p) not in excluded_ids]

    valid = net.validate_() if hasattr(net, 'validate_') else None

    lr = learning_rate
    opt = Optimizer(net.cost(),
                    params,
                    net.variables,
                    data,
                    batch_size,
                    lr=lr,
                    momentum=momentum,
                    notifier=net,
                    grad_clip=grad_clip,
                    grad_norm_clip=grad_norm_clip,
                    validate=valid,
                    mode=mode,
                    nan_protection=nan_protection)

    net.optimizer = opt

    curr_epoch = net.epochs_trained if isinstance(net, Monitor) else 0

    LOGGER.info("Training starts in epoch {0}.".format(curr_epoch))

    # Bound up front, so that the interrupt handler below can tell whether a
    # batch for plotting is available at all.
    data_batch = None

    try:
        for curr_epoch in range(curr_epoch, epochs):
            # The first batch is only used for plotting.
            if data is not None:
                data_batch = OrderedDict()
                for key in data.keys():
                    data_batch[key] = ensure_ndarray(data[key][:batch_size])
            else:
                data_batch = net.notify(Notifier.GET_DATA, 0)
                # zip instead of keys()[i]: key views cannot be indexed on
                # Python 3.
                data_batch = OrderedDict(
                    zip(net.variables.keys(), data_batch))

            if curr_epoch == 0 and out_dir is not None and plot_zero_epoch:
                send_plot_command(net, out_dir, tile_fun, curr_epoch,
                                  data_batch)

            start_time = time.time()

            cost_curr = opt.train()

            if isinstance(net, Monitor):
                net.monitor_cost(cost_curr)

            # reduce learning rate
            if reduce_lr:
                opt.learning_rate = lr - curr_epoch * (lr / epochs)

            end_time = time.time()
            elapsed_epoch = end_time - start_time

            LOGGER.info(
                "finished epoch {0}/{3} in {1:.2f} seconds (lr: {4:.3e}; cost: {2:.4f})"
                .format(curr_epoch + 1, elapsed_epoch, cost_curr, epochs,
                        float(opt.learning_rate)))

            if validation is not None and hasattr(net, 'validate'):
                cost_valid = validate(net, validation, batch_size)
                LOGGER.info("Validation set cost = {0}".format(cost_valid))
                if isinstance(net, Monitor):
                    net.monitor_cost_val(cost_valid)

            if dump_interval > 0 and curr_epoch % dump_interval == 0:
                try:
                    net.save(
                        os.path.join(
                            out_dir, "net_{0}_backup_{1}.pyc.bz".format(
                                net.name, curr_epoch)))
                except AttributeError:
                    LOGGER.warning("Net could not be saved. Derive from "
                                   "brick SerializeStack or SerializeLayer.")

            # Without the `> 0` guard the default of -1 matches every epoch
            # (x % -1 == 0) and 0 raises a ZeroDivisionError.
            if (img_interval > 0 and curr_epoch > 0
                    and curr_epoch % img_interval == 0
                    and out_dir is not None):
                send_plot_command(net, out_dir, tile_fun, curr_epoch,
                                  data_batch)

            if isinstance(net, Notifier):
                net.notify(Notifier.EPOCH_FINISHED,
                           curr_epoch=curr_epoch,
                           epochs=epochs)

    except KeyboardInterrupt:
        LOGGER.info("Training interrupted in epoch {0}, {1}".format(
            curr_epoch, net.name))
        # Plot the current state, if there is an output folder and a batch
        # was fetched before the interrupt.
        if out_dir is not None and data_batch is not None:
            send_plot_command(net, out_dir, tile_fun, curr_epoch, data_batch)

    net.notify(Notifier.TRAINING_STOP)

    return net