def create_h5_dataset(self, dataset, path):
    """ Create h5 file for the given dataset """
    pprint('\nCreating %s' % dataset + ' h5 data set using fast, lossless '
           + 'compression (lzf)...', end='')
    # read train and test data from original dataset
    nparray_2D_train, label_train, label_names = \
        self._read_set_from(dataset, 'train', path)
    nparray_2D_test, label_test = \
        self._read_set_from(dataset, 'test', path)[0:2]
    # store in h5 file using lzf compression
    h5file = h5py.File(self._h5path, 'w-')
    h5file.create_dataset('/train/data', data=nparray_2D_train,
                          compression='lzf')
    h5file.create_dataset('/train/label', data=label_train,
                          compression='lzf')
    if (nparray_2D_test is not None):
        h5file.create_dataset('/test/data', data=nparray_2D_test,
                              compression='lzf')
        h5file.create_dataset('/test/label', data=label_test,
                              compression='lzf')
    h5file.create_dataset('label_names', data=label_names,
                          compression='lzf')
    h5file.close()
    pprint(' Done.')
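# Illustrative sketch (an assumption, not part of the original code): how a
# data set file written by create_h5_dataset() above could be read back with
# h5py. The group names '/train/data', '/train/label', '/test/...' and
# 'label_names' match the layout created there; the function name and the
# h5path argument are placeholders.
def _example_read_h5_dataset(h5path):
    import h5py
    with h5py.File(h5path, 'r') as f:
        train_data = f['/train/data'][...]
        train_label = f['/train/label'][...]
        label_names = f['label_names'][...]
        # the test group is only present if test data was available
        test_data = f['/test/data'][...] if 'test' in f else None
    return train_data, train_label, label_names, test_data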
def initialize_multilayer(self, nmultilayer, config, dataset):
    """
    Initialize given MultiLayer according to config.

    The Layers get initialized with the data depending on their
    InputSource.
    """
    # Set if theano and/or theano.scan should be used for learning
    # iterations
    try:
        self._theano = config['config']['Theano']
    except:
        self._theano = False
    try:
        self._theanoscan = config['config']['Theano']*config['config']['Scan']
    except:
        self._theanoscan = False
    try:
        mini_batch_size = config['config']['mini_batch_size']
    except:
        mini_batch_size = 1
    if self._theano and not self._theanoscan and (mini_batch_size > 1):
        pprint("WARNING: theano (without scan) doesn't support " +
               "mini-batches yet. mini_batch_size set to 1.")
        mini_batch_size = 1
    self.MultiLayer[nmultilayer]._mini_batch_size = mini_batch_size
    try:
        scan_batch_size = config['config']['scan_batch_size']
    except:
        scan_batch_size = None
    self.MultiLayer[nmultilayer]._scan_batch_size = scan_batch_size
    # Set the Data for the InputLayer of the MultiLayer depending on
    # InputSource (DataSet or another MultiLayer)
    InputSource = config['model']['MultiLayer'+str(nmultilayer+1)]\
        ['InputLayer']['InputSource']
    self.MultiLayer[nmultilayer].Layer[0].set_inputsource(InputSource)
    if (InputSource == 'DataSet'):
        # If the InputSource is "DataSet" get the Training Data +
        # Label from the given DataSet
        Y = dataset.get_train_data()
        Label = dataset.get_train_label()
    elif (InputSource[0][0:10] == 'MultiLayer'):
        # If the InputSource is a "MultiLayer[#]" get as Training
        # Data the Output of that MultiLayer.
        if not self._theanoscan:
            Y = np.empty(
                shape=(dataset.get_train_data().shape[0],
                       config['model'][InputSource[0]][InputSource[1]]['C']),
                dtype='float32')
            for i in xrange(dataset.get_train_data().shape[0]):
                Y[i, :] = self.output(dataset.get_train_data()[i],
                                      int(InputSource[0][10])-1)
        else:
            # TODO: Implement Scan-Splitting for big data sets
            inputdata = []
            weights = {}
            activations = {}
            ml = self.MultiLayer[int(InputSource[0][10])-1]
            if ml._scan_batch_size is None:
                nbatches = 1
                scan_batch_size = dataset.get_train_data().shape[0]
            else:
                nbatches = int(np.ceil(
                    dataset.get_train_data().shape[0]
                    / float(ml._scan_batch_size)))
                scan_batch_size = ml._scan_batch_size
            for layer in ml.Layer:
                if layer.__class__.__name__ == 'InputLayer':
                    data = dataset.get_train_data()
                    label = dataset.get_train_label()
                    layer.set_input(data, label, shuffle=False)
                    layer.normalize_inputs()
                    data = layer.get_input_data().astype('float32')
                    # TODO: use layer.output() !
                    N = data.shape[0]
                    D = data.shape[1]
                    data = data.reshape((1, N, D))
                    inputdata.append(data)
                elif layer.__class__.__name__ == 'ProcessingLayer':
                    weights[layer._layermodel.W_t.name] = \
                        layer.get_weights().astype('float32')
                    activations[layer._layermodel.s_t.name] = np.zeros(
                        (N, weights[layer._layermodel.W_t.name].shape[0]),
                        dtype='float32')
            # TODO: reconstruct this from layers as in Train
            # (done for outputs_info and non_sequences):
            Y = np.empty(
                shape=(dataset.get_train_data().shape[0],
                       config['model'][InputSource[0]][InputSource[1]]['C']),
                dtype='float32')
            for nbatch in xrange(nbatches):
                sequences = [inputdata[0][:, nbatch*scan_batch_size:
                                          (nbatch+1)*scan_batch_size, :]]
                outputs_info = [
                    activations[item.name][nbatch*scan_batch_size:
                                           (nbatch+1)*scan_batch_size, :]
                    for layer in ml.Layer
                    for item in layer.outputs_info(mode='test')]
                non_sequences = [
                    weights[item.name]
                    for layer in ml.Layer
                    for item in layer.non_sequences(mode='test')]
                args = sequences + outputs_info + non_sequences
                Y[nbatch*scan_batch_size:(nbatch+1)*scan_batch_size] = \
                    ml._activation_scan(*args)[0]
        Label = dataset.get_train_label()
    self.MultiLayer[nmultilayer].set_iterations(
        config['model']['MultiLayer'+str(nmultilayer+1)]['Iterations'])
    plcount = 0
    # if the number of datapoints changes on different layers/
    # multilayers the next line must be changed
    ndatapoints = Y.shape[0]
    self.MultiLayer[nmultilayer].set_iteration(0)
    for nlayer in xrange(self.MultiLayer[nmultilayer].number_of_layers()):
        #--- Initialize Input Layer ---
        if (self.MultiLayer[nmultilayer].Layer[nlayer].__class__.__name__
                == 'InputLayer'):
            InputSource = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['InputLayer']['InputSource']
            A = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['InputLayer']['A']
            if (A == 'Default'):
                if (InputSource[0][0:10] == 'MultiLayer'):
                    A = config['model'][InputSource[0]][InputSource[1]]\
                        ['C'] + 1
                    # if you change this formula remember to also
                    # change it in Output->WriteSetting (!)
                else:
                    A = None
            try:
                theanoscan = config['config']['Theano']*config['config']['Scan']
            except:
                theanoscan = False
            self.MultiLayer[nmultilayer].initialize_inputlayer(
                Y, Label, A, nlayer, theanoscan)
        #--- Processing Layer ---
        elif (self.MultiLayer[nmultilayer].Layer[nlayer].__class__.__name__
                == 'ProcessingLayer'):
            plcount += 1
            InputSource = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['ProcessingLayer'+str(plcount)]['InputSource']
            if (type(InputSource) == str):
                InputSource = (InputSource,)
            C = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['ProcessingLayer'+str(plcount)]['C']
            if (C == 'Default'):
                try:
                    C = len(config['dataset']['classes'])
                except:
                    C = config['dataset']['nclasses']
            epsilon = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['ProcessingLayer'+str(plcount)]['epsilon']
            try:
                if (epsilon == 'Default'):
                    # if you change this formula remember to also change
                    # it in Output->WriteSetting (!)
                    if (config['model']['MultiLayer'+str(nmultilayer+1)]
                            ['ProcessingLayer'+str(plcount)]['Model']
                            == 'MM-LabeledOnly'):
                        epsilon = min(
                            C/2. *
                            1./config['dataset']['training_label_size'], 1.)
                    else:
                        epsilon = C/2. * \
                            1./config['dataset']['training_data_size']
                elif (epsilon[0] == 'factor'):
                    if (config['model']['MultiLayer'+str(nmultilayer+1)]
                            ['ProcessingLayer'+str(plcount)]['Model']
                            == 'MM-LabeledOnly'):
                        epsilon = min(
                            epsilon[1] *
                            C/float(config['dataset']['training_label_size']),
                            1.)
                    else:
                        epsilon = min(
                            epsilon[1] *
                            C/float(config['dataset']['training_data_size']),
                            1.)
            except:
                pass
            Model = config['model']['MultiLayer'+str(nmultilayer+1)]\
                ['ProcessingLayer'+str(plcount)]['Model']
            L = self.MultiLayer[nmultilayer].Layer[0].get_input_label()
            if (InputSource[0] == 'InputLayer'):
                Y = self.MultiLayer[nmultilayer].Layer[0].get_input_data()
                D = Y.shape[1:]
            elif (InputSource[0][0:15] == 'ProcessingLayer'):
                D = (config['model']['MultiLayer'+str(nmultilayer+1)]
                     [InputSource[0]]['C'],)
                Y = None
            else:
                D = None
                Y = None
                print "ERROR: %s" % 'model',
                print "| MultiLayer%d" % (nmultilayer+1),
                print "| ProcessingLayer%d:" % nlayer,
                print "Invalid InputSource"
            """
            # obsolete
            # for recurrent model: number of neurons in following layer
            try:
                K = config['model']['MultiLayer'+str(nmultilayer+1)]\
                    ['ProcessingLayer'+str(plcount+1)]['C']
                if (K == 'Default'):
                    K = len(config['dataset']['classes'])
            except:
                K = None
            """
            # optional arguments:
            try:
                A = config['model']['MultiLayer'+str(nmultilayer+1)]\
                    ['ProcessingLayer'+str(plcount)]['A']
                if (A == 'Default'):
                    # if you change this formula remember to also change
                    # it in Output->WriteSetting (!)
                    if (InputSource[0][0:15] == 'ProcessingLayer'):
                        A = D + 1
                    else:
                        A = None
            except:
                A = None
            try:
                threshold = config['model']['MultiLayer'+str(nmultilayer+1)]\
                    ['ProcessingLayer'+str(plcount)]['threshold']
            except:
                threshold = None
            h5path = None
            h5file = None
            try:
                InitMethod = config['model']['MultiLayer'+str(nmultilayer+1)]\
                    ['ProcessingLayer'+str(plcount)]['Initialization']
                if isinstance(InitMethod, tuple):
                    InitMethod = InitMethod[0]
                if (InitMethod == 'h5'):
                    h5path = config['model']\
                        ['MultiLayer'+str(nmultilayer+1)]\
                        ['ProcessingLayer'+str(plcount)]\
                        ['Initialization'][1]
                    # expected h5 file name format:
                    # "Run[i]M[j]L[k].h5"
                    # e.g. "Run3M1L2.h5" for weights of 1st MultiLayer,
                    # 2nd Layer, the 3rd Run (counting starts with 1).
                    # For each Run, there must be an individual h5 file
                    # present.
                    try:
                        h5file = config['model']\
                            ['MultiLayer'+str(nmultilayer+1)]\
                            ['ProcessingLayer'+str(plcount)]\
                            ['Initialization'][2]
                    except:
                        h5file = "Run%dM%dL%d.h5" % (
                            self._run+1, nmultilayer+1, nlayer)
            except:
                InitMethod = None
            try:
                Theano = config['config']['Theano']
            except:
                Theano = False
            try:
                Scan = config['config']['Scan']
            except:
                Scan = False
            Parameters = {
                'C': C,
                'A': A,
                'epsilon': epsilon,
                'threshold': threshold
            }
            self.MultiLayer[nmultilayer].initialize_processinglayer(
                Model, Parameters, InputSource, nlayer, Theano,
                Scan, D, Y, L, InitMethod, h5path, h5file)
    if (np.ceil(float(config['dataset']['training_data_size']) /
                MPI.COMM_WORLD.Get_size()) > ndatapoints):
        self.MultiLayer[nmultilayer]._blankstep = True
    if self._theanoscan:
        self.MultiLayer[nmultilayer].compile_theano_functions()
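# Illustrative sketch (an assumption, not part of the original code): writing
# a weight file that matches the "Run[i]M[j]L[k].h5" naming convention the
# 'h5' initialization above expects, e.g. "Run3M1L2.h5" for the 3rd run, 1st
# MultiLayer, 2nd Layer. The function name and the dataset key 'weights' are
# placeholders; the key actually read by the loader is not shown in this
# excerpt.
def _example_write_init_weights(path, run, nmultilayer, nlayer, W):
    import os
    import h5py
    fname = "Run%dM%dL%d.h5" % (run, nmultilayer, nlayer)
    with h5py.File(os.path.join(path, fname), 'w') as f:
        f.create_dataset('weights', data=W)
    return fname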
def train(self, nmultilayer, output, config, dataset):
    ml = self.MultiLayer[nmultilayer]
    ml.next_run()
    pprint('%d.%d.2.1 - Visualize Weights' % (
        self._run+1, nmultilayer+1), end='')
    output.visualize_all_weights(self, nmultilayer, config)
    pprint(' (%0.2f MB)' % mem_usage())
    # if (len(self.MultiLayer) == 1):
    # TODO: for greedy model: output Likelihood of up to the
    # current MultiLayer
    pprint('%d.%d.2.2 - LogLikelihood' % (
        self._run+1, nmultilayer+1), end='')
    output.write_loglikelihood(self, nmultilayer)
    pprint(' (%0.2f MB)' % mem_usage())
    # variables for stopping criterion
    # TODO: generalize for all MultiLayers
    try:
        STOPPING_CRITERION = config.get()\
            ['model']['MultiLayer1']['StoppingCriterion']
    except:
        STOPPING_CRITERION = False
    if STOPPING_CRITERION:
        try:
            mvngwidth = int(config.get()\
                ['model']['MultiLayer1']['MovingWidth'])
        except:
            pprint('WARNING (model.Model::Train): No width for ' +
                   'moving average was given. It will be set to %d' % 20)
            mvngwidth = 20
        loglikelihood = np.asarray([], dtype=np.float32)
        mvng_avg, mvng_std = 0., 0.
        max_mvng_avg = float('-inf')
        last_weights = []
    else:
        loglikelihood = np.asarray([None])
    STOP = False
    MPI.COMM_WORLD.Barrier()
    pprint('%d.%d.2.3 - Training Iterations' % (self._run+1, nmultilayer+1))
    for niteration in xrange(ml.get_iterations()):
        pprint('Iteration: %*d' % (
            int(math.log10(ml.get_iterations()))+1,
            ml.get_iteration()+1), end='')
        # pprint('2.2.3.1 - Convergence', end='')
        output.conv_pre(ml)
        # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        MPI.COMM_WORLD.Barrier()
        # pprint('2.2.3.2 - Learning Iteration', end='')
        ml.learning_iteration(self._theanoscan)
        # pprint(' - Memory usage: %s (Mb)\n' % mem_usage())
        MPI.COMM_WORLD.Barrier()
        # experimental: varying learning rates
        # if ((niteration % 1 == 0) and
        #         (ml.Layer[1]._epsilon >= 0.0008)):
        #     ml.Layer[1]._epsilon *= 0.98
        #     MPI.COMM_WORLD.Barrier()
        # to save learned weights every iteration
        # output.save_weights(self.MultiLayer[nmultilayer])
        # to save posterior distribution of training data every iteration
        # if nmultilayer == self.number_of_multilayers()-1:
        #     output.save_posterior(self, config, dataset)
        # Stopping criterion
        if (len(self.MultiLayer) == 1):
            # TODO: for greedy model: output Likelihood of up to the
            # current MultiLayer
            if STOPPING_CRITERION:
                t0 = time.time()
                loglikelihood = np.append(loglikelihood,
                                          self.loglikelihood())
                if (niteration >= mvngwidth):
                    # save only the last #mvngwidth values
                    loglikelihood = loglikelihood[1:]
                pprint(' | Log-Likelihood: %f (%f s)' % (
                    loglikelihood[-1], time.time()-t0), end='')
            if STOPPING_CRITERION:
                # save only the last #mvngwidth/2-1 weights
                # for centered moving average
                last_weights.append([ml.Layer[nlayer].get_weights()
                                     for nlayer in xrange(1, len(ml.Layer))])
                if (niteration > mvngwidth/2):
                    last_weights = last_weights[1:]
                # calculate moving average over last #mvngwidth values
                if (niteration >= mvngwidth-1):
                    mvng_avg = np.mean(loglikelihood)
                    mvng_std = np.std(loglikelihood)
                    pprint(' | Moving Average (%d): %f +- %f' % (
                        (niteration+1 - mvngwidth/2), mvng_avg, mvng_std),
                        end='')
                    if (mvng_avg > max_mvng_avg):
                        max_mvng_avg = mvng_avg
                    elif (mvng_avg < max_mvng_avg - mvng_std):
                        # if the moving average drops below the maximum
                        # moving average by more than the moving standard
                        # deviation, stop the learning iteration and revert
                        # back to the point of the centered moving average
                        for nlayer in xrange(1, len(ml.Layer)):
                            ml.Layer[nlayer].set_weights(
                                last_weights[0][nlayer-1])
                        stopping_iteration = niteration+1 - mvngwidth/2
                        pprint('\nStopping criterion met at iteration %d.' %
                               stopping_iteration)
                        STOP = True
                        ml.set_iteration(stopping_iteration)
                        ml.set_iterations(stopping_iteration)
        # abort on numerical error (nan in any of the weights)
        if any([np.any(np.isnan(ml.Layer[nlayer].get_weights()))
                for nlayer in xrange(1, len(ml.Layer))]):
            pprint('\nNumerical error: try to decrease learning ' +
                   'rate and/or mini-batch size.')
            STOP = True
            try:
                ml.set_iteration(stopping_iteration)
                ml.set_iterations(stopping_iteration)
            except:
                pass
        this_loglikelihood = output.write_loglikelihood(
            self, nmultilayer, loglikelihood[-1])
        # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        if (len(self.MultiLayer) == 1):
            # pprint('2.2.3.7 - Test Error', end='')
            output.write_online_results(self, config, dataset,
                                        this_loglikelihood)
            # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        # pprint('2.2.3.4 - Visualize Weights', end='')
        output.visualize_all_weights(self, nmultilayer, config)
        # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        # pprint('2.2.3.5 - Convergence 2', end='')
        output.conv_post(self.MultiLayer[nmultilayer])
        # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        # pprint('2.2.3.6 - Visualize Convergence', end='')
        output.visualize_convergence(self, nmultilayer)
        # pprint(' - Memory usage: %s (Mb)' % mem_usage())
        if STOP:
            break
    pprint('')  # linebreak
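# Minimal sketch (an assumption, not the original implementation) of the
# stopping rule used in train() above: keep a window of the last `width`
# log-likelihood values, track the best moving average seen so far, and stop
# once the current moving average falls below that maximum by more than the
# moving standard deviation. The function name is a placeholder; it returns
# the centered stopping iteration or None.
def _example_should_stop(loglikelihoods, width=20):
    import numpy as np
    best = float('-inf')
    for end in xrange(width, len(loglikelihoods) + 1):
        window = np.asarray(loglikelihoods[end - width:end])
        avg, std = window.mean(), window.std()
        if avg > best:
            best = avg
        elif avg < best - std:
            # revert point: centre of the current window
            return end - width/2
    return None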
def learning_iteration(self, theanoscan):
    t0 = time.time()
    if not theanoscan:
        logn = int(math.log10(self.Layer[0].imagecount()))
        pprint(' -- Image %*d' % (logn+1, 0), end='')
        for i in xrange(self.Layer[0].imagecount()):
            if (((i+1) % 1000 == 0) or
                    ((i+1) == self.Layer[0].imagecount())):
                pprint('\b'*(logn+1) + '%*d' % (logn+1, i+1), end='')
            self.learningstep()
        if self._blank_step:
            self.blank_step()
        self._niteration += 1
        pprint('\b \b'*(10+logn+1), end='')
        """
        from mpi4py import MPI
        import time
        if (MPI.COMM_WORLD.Get_rank() == 0):
            if (self._nrun == 1):
                print '%d Processes' % MPI.COMM_WORLD.Get_size()
                print 'C\tL1 total\tL1 comm\tL1 calls\tL2 ',
                print 'total\tL2 comm\tL2 calls'
            print '%d\t%f\t%f\t%f\t%f\t%f\t%f' % (
                self.Layer[1].GetNumberOfNeurons(),
                self.Layer[1].elapsed, self.Layer[1].comm_time,
                self.Layer[1].ncomm,
                self.Layer[2].elapsed, self.Layer[2].comm_time,
                self.Layer[2].ncomm
            )
        self.Layer[1].elapsed = time.time() - time.time()
        self.Layer[2].elapsed = time.time() - time.time()
        self.Layer[1].comm_time = time.time() - time.time()
        self.Layer[2].comm_time = time.time() - time.time()
        self.Layer[1].ncomm = 0
        self.Layer[2].ncomm = 0
        """
    else:
        inputs = {}
        weights = {}
        parameters = {}
        mbs = self._mini_batch_size
        if self._scan_batch_size is None:
            nbatches = 1
            scan_batch_size = self.Layer[0].get_input_data().shape[0]
            # don't set self._scan_batch_size, because for testing
            # the data shape will be different
        else:
            nbatches = int(np.ceil(
                self.Layer[0].get_input_data().shape[0]
                / float(self._scan_batch_size)))
            scan_batch_size = self._scan_batch_size
        for nbatch in xrange(nbatches):
            for layer in self.Layer:
                if layer.__class__.__name__ == 'InputLayer':
                    layer.shuffle()
                    try:
                        data = layer.get_input_data().astype('float32')[
                            nbatch*scan_batch_size:
                            (nbatch+1)*scan_batch_size]
                        label = layer.get_input_label().astype('int32')[
                            nbatch*scan_batch_size:
                            (nbatch+1)*scan_batch_size]
                    except:
                        data = layer.get_input_data().astype('float32')[
                            nbatch*scan_batch_size:]
                        label = layer.get_input_label().astype('int32')[
                            nbatch*scan_batch_size:]
                    # if mbs > 1: (...)
                    data = np.append(
                        data,
                        np.zeros(((mbs - data.shape[0] % mbs) % mbs,
                                  data.shape[1]), dtype='float32'),
                        axis=0)
                    data = data.reshape(
                        (data.shape[0]/mbs, mbs, data.shape[1]))
                    label = np.append(
                        label,
                        (-1)*np.ones((mbs - label.shape[0] % mbs) % mbs,
                                     dtype='int32'),
                        axis=0)
                    label = label.reshape((label.shape[0]/mbs, mbs))
                    inputs[layer.s_t.name] = data
                    inputs[layer.L_t.name] = label
                elif layer.__class__.__name__ == 'ProcessingLayer':
                    weights[layer._layermodel.W_t.name] = \
                        layer.get_weights().astype('float32')
                    # TODO: generalize parameters in Layer
                    parameters.update(dict(zip(
                        [item.name
                         for item in layer._layermodel.parameters_t],
                        [layer.parameters[item.name.rpartition('_')[0]]
                         for item in layer._layermodel.parameters_t])))
                    # parameters[layer._layermodel.epsilon_t.name] = \
                    #     np.asarray(layer.get_learningrate(),
                    #                dtype='float32')
            sequences = [inputs[item.name] for layer in self.Layer
                         for item in layer.sequences(mode='train')]
            outputs_info = [weights[item.name] for layer in self.Layer
                            for item in layer.outputs_info(mode='train')]
            non_sequences = [parameters[item.name] for layer in self.Layer
                             for item in layer.non_sequences(mode='train')]
            args = sequences + outputs_info + non_sequences
            learnedweights = self._learningiteration_scan(*args)
            for n in xrange(len(learnedweights)):
                self.Layer[n+1].set_weights(learnedweights[n])
        self._niteration += 1
    pprint(' (%f s)' % (time.time() - t0), end='')
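# Minimal sketch (an assumption, not part of the original code) of the
# padding/reshape step used in the theano.scan branch of learning_iteration()
# above: data is zero-padded and labels are padded with -1 so the number of
# samples is divisible by the mini-batch size, then both are reshaped to
# (nsteps, mini_batch_size, ...). The function name is a placeholder.
def _example_pad_to_minibatches(data, label, mbs):
    import numpy as np
    pad = (mbs - data.shape[0] % mbs) % mbs
    data = np.append(data,
                     np.zeros((pad, data.shape[1]), dtype='float32'),
                     axis=0)
    label = np.append(label, (-1)*np.ones(pad, dtype='int32'), axis=0)
    return (data.reshape((data.shape[0]/mbs, mbs, data.shape[1])),
            label.reshape((label.shape[0]/mbs, mbs)))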
# Copyright (C) 2015, Dennis Forster <*****@*****.**>
#
# LICENSE: THE SOFTWARE IS PROVIDED "AS IS" UNDER THE
# ACADEMIC FREE LICENSE (AFL) v3.0.
#

"""Module docstring. """

from utils.sysfunctions import mem_usage
from utils.parallel import pprint

pprint('Start (RAM Usage: %0.2f MB)' % mem_usage())

# system imports
pprint('0.0.0 - System Imports', end='')
import sys
from mpi4py import MPI
pprint(' (%0.2f MB)' % mem_usage())

# custom imports
pprint('0.0.1 - Custom Imports')
from classes.model.Model import Model
from classes.input.DataSet import DataSet
from classes.config.Config import Config
from classes.output.Output import Output

# MPI definitions
pprint('0.1 - MPI Definitions', end='')
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
def pick_new_subset(self, config, dset, run=None):
    """
    1. From the labeled data points the given number of data points to
       use for training or testing is selected - these can be either
       chosen completely randomly, or evenly distributed between
       classes (and only randomly inside of each class); all remaining
       labeled data points are declared as unlabeled.
    2. Then (for training) the remaining data points are chosen from
       the unlabeled data points.
    """
    h5file = h5py.File(self._h5path, 'r')
    if (dset == 'train'):
        label = np.asarray(h5file['/train/label'].value, dtype=int)
        label_size = min(config['dataset']['training_label_size'],
                         len(self._indexlist['full_train_labeled']))
        data_size = min(config['dataset']['training_data_size'],
                        len(self._indexlist['full_train_labeled']) +
                        len(self._indexlist['full_train_unlabeled']))
        if (label_size > data_size):
            label_size = data_size
            pprint("ERROR: more training labels than training data. " +
                   "These values are now set as follows:")
            pprint("training_data_size: %d" % data_size)
            pprint("training_label_size: %d" % label_size)
    elif (dset == 'test'):
        label = np.asarray(h5file['/test/label'].value, dtype=int)
        data_size = min(config['dataset']['test_size'],
                        len(self._indexlist['full_test']))
        label_size = data_size
    h5file.close()
    # TODO: change data_size value in saved setting accordingly
    # TODO: change label_size value in saved setting accordingly
    try:
        EVENLABELS = config['dataset']['EVENLABELS']
    except:
        EVENLABELS = False
    try:
        indexlist = config['dataset']['indexlist']
    except:
        indexlist = None
    try:
        labellist = config['dataset']['labellist']
    except:
        labellist = None
    if ((indexlist is not None) and (labellist is not None)):
        pprint('WARNING (classes.input.DataSet): when dataset.indexlist ' +
               'is given, dataset.labellist will be ignored.')
    if (indexlist is None):
        if (labellist is None):
            # draw labeled data
            labeled_index = self.draw_indexlist(label, self._classes,
                                                label_size, EVENLABELS)
        else:
            try:
                filename = labellist
                h5file = h5py.File(filename, 'r')
            except:
                filename = labellist + 'Run' + str(run+1) + 'Data.h5'
                h5file = h5py.File(filename, 'r')
            if (dset == 'train'):
                try:
                    labeled_index = h5file['train/labeled'].value
                    pprint('WARNING: labels set according to labellist - ' +
                           'label_size will be ignored.')
                    # TODO: draw labels according to label_size from
                    # labellist
                except:
                    pprint('WARNING (classes.input.DataSet): ' +
                           'training indexlist not found in provided ' +
                           'labellist, it will be drawn randomly')
                    labeled_index = self.draw_indexlist(
                        label, self._classes, label_size, EVENLABELS)
            elif (dset == 'test'):
                try:
                    labeled_index = h5file['test'].value
                except:
                    pprint('WARNING (classes.input.DataSet): ' +
                           'testlist not found in provided labellist, it ' +
                           'will be drawn randomly')
                    labeled_index = self.draw_indexlist(
                        label, self._classes, label_size, EVENLABELS)
        unlabeled_index = np.asarray([], dtype=int)
        if (label_size < data_size):
            # set remaining data points as unlabeled
            remaining = [idx for idx
                         in self._indexlist['full_train_labeled']
                         if idx not in labeled_index]
            label[remaining] = int(-1)
            # draw additional unlabeled data
            unlabeled_index = self.draw_indexlist(
                label, [-1], data_size-label_size, False)
    else:
        try:
            h5file = h5py.File(indexlist, 'r')
        except:
            h5file = h5py.File(indexlist + 'Run' + str(run+1) + 'Data.h5',
                               'r')
        if (dset == 'train'):
            labeled_index = h5file['train/labeled'].value
            unlabeled_index = h5file['train/unlabeled'].value
        elif (dset == 'test'):
            labeled_index = h5file['test'].value
        h5file.close()
    if (dset == 'train'):
        self._indexlist['train_labeled'] = labeled_index
        self._indexlist['train_unlabeled'] = unlabeled_index
    elif (dset == 'test'):
        self._indexlist['test'] = labeled_index
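# Illustrative sketch (an assumption, not the original draw_indexlist()): what
# drawing a labeled subset could look like when EVENLABELS is set, as
# described in the docstring of pick_new_subset() above - the subset is split
# as evenly as possible across classes and drawn randomly only within each
# class. The function name is a placeholder.
def _example_draw_even_indexlist(label, classes, size):
    import numpy as np
    picked = []
    for c in classes:
        candidates = np.flatnonzero(label == c)
        n = min(size/len(classes), len(candidates))  # Python 2 int division
        picked.extend(np.random.choice(candidates, n, replace=False))
    # fill up randomly from the remaining labeled points if rounding
    # left a gap
    if len(picked) < size:
        rest = np.setdiff1d(np.flatnonzero(np.in1d(label, classes)), picked)
        picked.extend(np.random.choice(rest, size - len(picked),
                                       replace=False))
    return np.asarray(picked, dtype=int)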