import os
import time
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

# Note: lrn2-internal helpers used below (GPU_Batch, ensure_gpu_batch,
# ensure_ndarray, similarity_matrix, Optimizer, Monitor, Notifier,
# send_plot_command, validate, dummy_tiler, LOGGER) are assumed to be
# imported from their respective modules.


def project_to_feature_space(rbm, train_set):
    """
    Projects a train set through one or more RBMs and returns the hidden
    unit activations of the topmost layer.
    """
    # Accept a single RBM or an iterable (stack) of RBMs
    if not hasattr(rbm, '__iter__'):
        rbm = (rbm,)
    curr_train_set = train_set
    for curr_rbm in rbm:
        X = T.matrix()
        func = theano.function([X], curr_rbm.activation_h(X))
        if isinstance(curr_train_set, GPU_Batch):
            gpu_batch = curr_train_set
        else:
            gpu_batch = GPU_Batch(curr_train_set, 1)
        h_act_app = None
        for curr_batch in gpu_batch:
            curr_h_act = func(ensure_ndarray(curr_batch))
            if h_act_app is None:
                h_act_app = curr_h_act
            else:
                # Stack batch results along the instance axis
                h_act_app = np.vstack((h_act_app, curr_h_act))
        # The activations of this layer are the input to the next one
        curr_train_set = h_act_app
    return curr_train_set
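# Usage sketch (illustrative, not part of the library API): assumes
# `rbm1` and `rbm2` are already-trained RBMs whose `activation_h` maps
# visible batches to hidden activations, and `train_set` is a 2D ndarray
# of shape (n_instances, n_visible). Passing a tuple projects through
# the stack layer by layer:
#
#     features = project_to_feature_space((rbm1, rbm2), train_set)
#     # -> top-layer activations, shape (n_instances, n_hidden_top)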
def load_files(self, filenames, clear=False):
    """Load data from a list of filenames, and store the data internally.

    Parameters
    ----------

    filenames : list
        a list of filenames to read data from

    clear : bool
        if True, discard data that was loaded by earlier calls to
        `load_files`
    """
    if clear:
        self.data = []
        self.labels = []

    for instance_set, label in self.file_loader(filenames):
        vp_repr_data = []
        for vp in self.viewpoints:
            vp_repr_data.append(
                ensure_ndarray(vp.raw_to_repr(instance_set, label)))
        # Concatenate the viewpoint representations horizontally
        fn_bin_data = np.hstack(vp_repr_data)
        self.data.append(fn_bin_data)
        self.labels.append(label)
def convert_files(self, filenames, folder_tmp):
    """
    Converts files in raw format to files in input representation, using
    the viewpoints' `raw_to_repr` method.
    """
    LOGGER.info("converting files...")
    files_all = []
    for i, (instance_set, label) in enumerate(self.file_loader(filenames)):
        # Derive the output filename from the input filename; fall back
        # to the label if the entry is not a string (e.g. a tuple)
        try:
            fn = os.path.basename(filenames[i][0].split('\t')[0])
            fn = fn.split('.')[0]
        except TypeError:
            fn = str(label).split('.')[0]
        except Exception as e:
            LOGGER.exception(e)
            fn = str(label).split('.')[0]

        file_path = os.path.join(folder_tmp, fn + ".lrn2.npy")
        file_path_labels = os.path.join(folder_tmp, fn + "_labels.lrn2.npy")
        if self.verbose:
            LOGGER.info("processing file {0}...".format(file_path))
        if not os.path.isfile(file_path):
            vp_bin_data = []
            for vp in self.viewpoints:
                curr_data = ensure_ndarray(
                    vp.raw_to_repr(instance_set, label))
                vp_bin_data.append(curr_data)
            if len(self.viewpoints) > 1:
                fn_repr_data = np.hstack(vp_bin_data)
            else:
                fn_repr_data = vp_bin_data[0]
            if self.use_labels:
                labels = [label] * len(fn_repr_data)
            if self.verbose:
                LOGGER.info("saving file {0}...".format(file_path))
            np.save(file_path, fn_repr_data)
            if self.use_labels:
                np.save(file_path_labels, labels)
            files_all.append(file_path)
        else:
            if self.verbose:
                LOGGER.info("Skipping conversion - file {0} already exists."
                            .format(file_path))
            files_all.append(file_path)
    return files_all
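# Round-trip sketch (illustrative; `ds` stands for an instance of the
# enclosing dataset class, `files` for its raw input file list, and
# "/tmp/lrn2" for a hypothetical scratch folder):
#
#     paths = ds.convert_files(files, "/tmp/lrn2")
#     data = [np.load(p) for p in paths]  # one representation per file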
def get_similarity_matrix(rbm, train_set):
    """
    Projects an (ordered, sequential) train set into the feature space and
    calculates a similarity matrix on the resulting hidden unit activations.
    """
    X = T.matrix()
    project = theano.function([X], rbm.activation_h(X))
    gpu_batch = ensure_gpu_batch(train_set)
    h_act = None
    for curr_batch in gpu_batch:
        if h_act is None:
            h_act = project(ensure_ndarray(curr_batch))
        else:
            # Stack batch activations along the instance axis
            h_act = np.vstack((h_act, project(ensure_ndarray(curr_batch))))
    sim_matrix = similarity_matrix(h_act, h_act)
    # Negate so that lower values correspond to more similar instances
    return np.array(sim_matrix) * -1
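# Usage sketch (illustrative; assumes a trained `rbm` and an ordered,
# sequential `train_set`). Since the matrix is negated, lower values
# mean more similar hidden representations:
#
#     sim = get_similarity_matrix(rbm, train_set)
#     most_similar_to_0 = np.argmin(sim[0, 1:]) + 1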
def get_diff(rbm, train_set):
    """
    Calculates the difference between consecutive hidden unit activations,
    based on an (ordered) train set which represents sequential data.
    """
    X = T.matrix()
    project = theano.function([X], rbm.activation_h(X))
    gpu_batch = ensure_gpu_batch(train_set)
    h_act = None
    for curr_batch in gpu_batch:
        if h_act is None:
            h_act = project(ensure_ndarray(curr_batch))
        else:
            # Stack batch activations along the instance axis
            h_act = np.vstack((h_act, project(ensure_ndarray(curr_batch))))
    diff = [np.linalg.norm(h_act[i] - h_act[i + 1])
            for i in range(h_act.shape[0] - 1)]
    return diff
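# Usage sketch (illustrative): the returned distances form a simple
# novelty curve over the sequence. The mean threshold below is an
# arbitrary choice for demonstration, not part of the library:
#
#     diff = get_diff(rbm, train_set)
#     peaks = [i for i, d in enumerate(diff) if d > np.mean(diff)]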
def get_data_callback(self, batch_nr, batch_size=10, kind="data"): assert kind in ("data", "labels"), "'kind' has to be 'data' or 'labels'" assert len( self.files_open) > 0, "Call 'open_files()' before accessing data." n_batches = self.n_batches(batch_size) if self.verbose: LOGGER.debug("Batch {0}/{1}".format(batch_nr, n_batches)) block_size = self.get_block_size() assert n_batches > 0, "Instance count ({0}) has to be > batch size * step_width ({1} * {2}). Hint: You need to define a batch-size for the max-pooling layer." \ .format(self.instance_count, batch_size, self.step_width) ngrams = [] if self.idx is None: self.reset_idx() if batch_nr <= n_batches: i = batch_nr * batch_size while len(ngrams) < batch_size and i < len(self.idx): ngrams.append( self.preprocess( self.get_block(self.idx[i], self.idx[i] + block_size, kind=kind))) i += 1 else: self.last_perc = -1 return None if self.convolutional and len(ngrams) < batch_size: # Return only full batches return [] return ensure_ndarray(ngrams)
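# Callback contract sketch (illustrative; `ds` is an instance of the
# enclosing class, with files opened beforehand). The method returns
# None once `batch_nr` runs past the last batch, which ends the loop:
#
#     ds.open_files()
#     batch_nr = 0
#     while True:
#         batch = ds.get_data_callback(batch_nr, batch_size=10)
#         if batch is None:
#             break
#         batch_nr += 1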
def train(net, data, batch_size=200, epochs=500, learning_rate=1e-4,
          reduce_lr=False, momentum=0.0, validation=None, out_dir='.',
          img_interval=-1, dump_interval=-1, tile_fun=lambda x: x,
          exclude=[], plot_zero_epoch=True, grad_clip=None,
          grad_norm_clip=None, mode='default', nan_protection=False,
          **kwargs):
    """
    Trains a single layer or a stack with backpropagation, using the cost
    of the input net.

    Parameters
    ----------

    net : FFBase (or derivatives)
        the net (or stack) to train

    data : dict of array-likes or None
        dictionary of 2D or 4D arrays to set the values of the variables
        of the cost (has to match the order of `net.variables`).
        Convention: data['input'] is the main input data, data['target']
        is the main target data. If data is None, the net has to have a
        callback function for the notification event 'get_data'.

    batch_size : int
        the mini-batch size for training. If data is None, batch_size
        is ignored.

    epochs : int
        number of epochs to train the net

    learning_rate : float
        initial learning rate (can be reduced during training by setting
        reduce_lr=True)

    reduce_lr : bool, optional
        if True, the learning rate will be reduced linearly to 0 during
        training

    momentum : float, optional
        the momentum

    validation : dict, optional
        validation set; dict of 2D or 4D arrays to which the cost
        variables will be bound

    out_dir : string
        the output folder, where training logs and plots will be written

    img_interval : int, optional
        interval of epochs at which images should be plotted to the
        output folder (-1 disables plotting)

    dump_interval : int, optional
        interval of epochs at which the whole network should be dumped to
        the output folder (-1 disables dumping)

    tile_fun : function, optional
        takes an input matrix and creates a list of matrices with the
        shape of the desired plots (how to tile the input data)

    exclude : list (of parameters)
        1D list of parameters to exclude from parameter updates

    plot_zero_epoch : bool, optional
        if True, plot images once before the first training epoch

    grad_clip : float, optional
        gradient clipping value, passed on to the Optimizer (None
        disables clipping)

    grad_norm_clip : float, optional
        gradient norm clipping value, passed on to the Optimizer (None
        disables norm clipping)

    mode : string, optional
        optimization mode, passed on to the Optimizer

    nan_protection : bool, optional
        if True, enables the Optimizer's NaN protection

    Returns
    -------

    the (trained) net
    """
    assert data is not None or len(net.callbacks[Notifier.GET_DATA]) > 0, \
        "Either set the data parameter, and/or register a 'get_data' callback."
LOGGER.info("\nTrain {0}: {1}...".format(net.name, type(net))) net.notify(Notifier.TRAINING_START) if tile_fun is None: tile_fun = dummy_tiler params = [p for p in net.params if id(p) not in [id(e) for e in exclude]] valid = net.validate_() if hasattr(net, 'validate_') else None lr = learning_rate opt = Optimizer(net.cost(), params, net.variables, data, batch_size, lr=lr, momentum=momentum, notifier=net, grad_clip=grad_clip, grad_norm_clip=grad_norm_clip, validate=valid, mode=mode, nan_protection=nan_protection) net.optimizer = opt curr_epoch = net.epochs_trained if isinstance(net, Monitor) else 0 LOGGER.info("Training starts in epoch {0}.".format(curr_epoch)) try: for curr_epoch in range(curr_epoch, epochs): if data is not None: data_batch = OrderedDict() for key in data.keys(): data_batch[key] = ensure_ndarray(data[key][:batch_size]) else: data_batch = net.notify(Notifier.GET_DATA, 0) data_batch = OrderedDict( [[net.variables.keys()[i], data_batch[i]] for i in range(len(data_batch))]) if curr_epoch == 0 and out_dir is not None and plot_zero_epoch: send_plot_command(net, out_dir, tile_fun, curr_epoch, data_batch) start_time = time.time() cost_curr = opt.train() if isinstance(net, Monitor): net.monitor_cost(cost_curr) # reduce learning rate if reduce_lr: opt.learning_rate = lr - curr_epoch * (lr / epochs) end_time = time.time() elapsed_epoch = end_time - start_time LOGGER.info( "finished epoch {0}/{3} in {1:.2f} seconds (lr: {4:.3e}; cost: {2:.4f})" .format(curr_epoch + 1, elapsed_epoch, cost_curr, epochs, float(opt.learning_rate))) if validation is not None and hasattr(net, 'validate'): cost_valid = validate(net, validation, batch_size) LOGGER.info("Validation set cost = {0}".format(cost_valid)) if isinstance(net, Monitor): net.monitor_cost_val(cost_valid) if dump_interval > 0 and curr_epoch % dump_interval == 0: try: net.save( os.path.join( out_dir, "net_{0}_backup_{1}.pyc.bz".format( net.name, curr_epoch))) except AttributeError: LOGGER.warning("Net could not be saved. Derive from " "brick SerializeStack or SerializeLayer.") pass if curr_epoch % img_interval == 0 and out_dir is not None and curr_epoch > 0: send_plot_command(net, out_dir, tile_fun, curr_epoch, data_batch) if isinstance(net, Notifier): net.notify(Notifier.EPOCH_FINISHED, curr_epoch=curr_epoch, epochs=epochs) except KeyboardInterrupt: LOGGER.info("Training interrupted in epoch {0}, {1}".format( curr_epoch, net.name)) send_plot_command(net, out_dir, tile_fun, curr_epoch, data_batch) net.notify(Notifier.TRAINING_STOP) return net