# parzen print 'Evaluating parzen window' import likelihood_estimation_parzen likelihood_estimation_parzen.main(0.20,'mnist') # Inpainting print 'Inpainting' test_X = test_X.get_value() numpy.random.seed(2) test_idx = numpy.arange(len(test_Y)) for Iter in range(10): numpy.random.shuffle(test_idx) test_X = test_X[test_idx] test_Y = test_Y[test_idx] digit_idx = [(test_Y==i).argmax() for i in range(10)] inpaint_list = [] for idx in digit_idx: DIGIT = test_X[idx:idx+1] V_inpaint, H_inpaint = inpainting(DIGIT) inpaint_list.append(V_inpaint) INPAINTING = numpy.vstack(inpaint_list) plot_inpainting = PIL.Image.fromarray(tile_raster_images(INPAINTING, (root_N_input,root_N_input), (10,50)))
# ---------------------------------------------------------------------------
# experiment(state, channel): full GSN training/sampling/inpainting driver.
# NOTE(review): this is the PYTHON 2 variant (print statements, cPickle,
# integer '/' division).  A second, Python-3-ported definition of the same
# function appears later in this file and shadows this one at import time --
# confirm which copy is meant to survive before editing behaviour.
# NOTE(review): source lines were whitespace-mangled by extraction; they are
# kept byte-identical below, with only comment lines added between them.
# -- section 1: optional config reload (test mode), config save, dataset
#    loading (MNIST / MNIST_binary / TFD), shuffle, Theano shareds + symbols.
#    'channel' is never used in the visible body -- presumably a jobman hook;
#    TODO confirm against the caller. ----------------------------------------
def experiment(state, channel): if state.test_model and 'config' in os.listdir('.'): print 'Loading local config file' config_file = open('config', 'r') config = config_file.readlines() try: config_vals = config[0].split('(')[1:][0].split(')')[:-1][0].split(', ') except: config_vals = config[0][3:-1].replace(': ','=').replace("'","").split(', ') config_vals = filter(lambda x:not 'jobman' in x and not '/' in x and not ':' in x and not 'experiment' in x, config_vals) for CV in config_vals: print CV if CV.startswith('test'): print 'Do not override testing switch' continue try: exec('state.'+CV) in globals(), locals() except: exec('state.'+CV.split('=')[0]+"='"+CV.split('=')[1]+"'") in globals(), locals() else: # Save the current configuration # Useful for logs/experiments print 'Saving config' f = open('config', 'w') f.write(str(state)) f.close() print state # Load the data, train = train+valid, and shuffle train # Targets are not used (will be misaligned after shuffling train if state.dataset == 'MNIST': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'MNIST_binary': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist_binary(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'TFD': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_tfd(state.data_path) N_input = train_X.shape[1] root_N_input = numpy.sqrt(N_input) numpy.random.seed(1) numpy.random.shuffle(train_X) train_X = theano.shared(train_X) valid_X = theano.shared(valid_X) test_X = theano.shared(test_X) # Theano variables and RNG X = T.fmatrix() # Input of the graph index = T.lscalar() # index to minibatch MRG = RNG_MRG.MRG_RandomStreams(1) # Network and training specifications K = state.K # number of hidden layers N = state.N # number of walkbacks layer_sizes = [N_input] + [state.hidden_size] * K # layer sizes, from h0 to hK (h0 is the 
# -- section 2: hyperparameter shareds (learning rate / annealing / momentum),
#    Glorot-style weight init, optional reload of newest params_epoch_*.pkl,
#    and noise utility closures (dropout / gaussian / masking).
#    NOTE(review): root_N_input stays a numpy float here (no int cast, unlike
#    the later py3 copy) and cPickle.load uses text mode 'r' -- pickles should
#    be opened 'rb'; confirm on the target platform. -------------------------
visible layer) learning_rate = theano.shared(cast32(state.learning_rate)) # learning rate annealing = cast32(state.annealing) # exponential annealing coefficient momentum = theano.shared(cast32(state.momentum)) # momentum term # PARAMETERS : weights list and bias list. # initialize a list of weights and biases based on layer_sizes weights_list = [get_shared_weights(layer_sizes[i], layer_sizes[i+1], numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i+1] )), 'W') for i in range(K)] bias_list = [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)] if state.test_model: # Load the parameters of the last epoch # maybe if the path is given, load these specific attributes param_files = filter(lambda x:'params' in x, os.listdir('.')) max_epoch_idx = numpy.argmax([int(x.split('_')[-1].split('.')[0]) for x in param_files]) params_to_load = param_files[max_epoch_idx] PARAMS = cPickle.load(open(params_to_load,'r')) [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[:len(weights_list)], weights_list)] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[len(weights_list):], bias_list)] # Util functions def dropout(IN, p = 0.5): noise = MRG.binomial(p = p, n = 1, size = IN.shape, dtype='float32') OUT = (IN * noise) / cast32(p) return OUT def add_gaussian_noise(IN, std = 1): print 'GAUSSIAN NOISE : ', std noise = MRG.normal(avg = 0, std = std, size = IN.shape, dtype='float32') OUT = IN + noise return OUT def corrupt_input(IN, p = 0.5): # salt and pepper? masking? 
# -- section 3: salt_and_pepper corruption (a=keep mask, b=salt value, so
#    IN*a + (a==0)*b) and the odd/even alternating layer-update loops used by
#    the walkback scheme. ----------------------------------------------------
noise = MRG.binomial(p = p, n = 1, size = IN.shape, dtype='float32') IN = IN * noise return IN def salt_and_pepper(IN, p = 0.2): # salt and pepper noise print 'DAE uses salt and pepper noise' a = MRG.binomial(size=IN.shape, n=1, p = 1 - p, dtype='float32') b = MRG.binomial(size=IN.shape, n=1, p = 0.5, dtype='float32') c = T.eq(a,0) * b return IN * a + c # Odd layer update function # just a loop over the odd layers def update_odd_layers(hiddens, noisy): for i in range(1, K+1, 2): print i if noisy: simple_update_layer(hiddens, None, i) else: simple_update_layer(hiddens, None, i, add_noise = False) # Even layer update # p_X_chain is given to append the p(X|...) at each update (one update = odd update + even update) def update_even_layers(hiddens, p_X_chain, noisy): for i in range(0, K+1, 2): print i if noisy: simple_update_layer(hiddens, p_X_chain, i) else: simple_update_layer(hiddens, p_X_chain, i, add_noise = False) # The layer update function # hiddens : list containing the symbolic theano variables [visible, hidden1, hidden2, ...] # layer_update will modify this list inplace # p_X_chain : list containing the successive p(X|...) 
# -- section 4: simple_update_layer -- one in-place update of hiddens[i] from
#    its neighbours (W.T upward, W downward), optional pre/post-activation
#    gaussian noise, sigmoid on the visible layer, and p(X|...) collection
#    when i == 0.  Statement order here IS the algorithm; do not reorder. ----
at each update # update_layer will append to this list # add_noise : pre and post activation gaussian noise def simple_update_layer(hiddens, p_X_chain, i, add_noise=True): # Compute the dot product, whatever layer post_act_noise = 0 if i == 0: hiddens[i] = T.dot(hiddens[i+1], weights_list[i].T) + bias_list[i] elif i == K: hiddens[i] = T.dot(hiddens[i-1], weights_list[i-1]) + bias_list[i] else: # next layer : layers[i+1], assigned weights : W_i # previous layer : layers[i-1], assigned weights : W_(i-1) hiddens[i] = T.dot(hiddens[i+1], weights_list[i].T) + T.dot(hiddens[i-1], weights_list[i-1]) + bias_list[i] # Add pre-activation noise if NOT input layer if i==1 and state.noiseless_h1: print '>>NO noise in first layer' add_noise = False # pre activation noise if i != 0 and add_noise: print 'Adding pre-activation gaussian noise' hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # ACTIVATION! if i == 0: print 'Sigmoid units' hiddens[i] = T.nnet.sigmoid(hiddens[i]) else: print 'Hidden units' hiddens[i] = hidden_activation(hiddens[i]) # post activation noise if i != 0 and add_noise: print 'Adding post-activation gaussian noise' hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # build the reconstruction chain if i == 0: # if input layer -> append p(X|...) p_X_chain.append(hiddens[i]) # sample from p(X|...) 
# -- section 5: resample/corrupt the visible layer after its update, then
#    build the training graph: activation choice, corrupted input, zero-init
#    hiddens, and N walkback updates accumulating p_X_chain. -----------------
if state.input_sampling: print 'Sampling from input' sampled = MRG.binomial(p = hiddens[i], size=hiddens[i].shape, dtype='float32') else: print '>>NO input sampling' sampled = hiddens[i] # add noise sampled = salt_and_pepper(sampled, state.input_salt_and_pepper) # set input layer hiddens[i] = sampled def update_layers(hiddens, p_X_chain, noisy = True): print 'odd layer update' update_odd_layers(hiddens, noisy) print print 'even layer update' update_even_layers(hiddens, p_X_chain, noisy) ''' F PROP ''' if state.act == 'sigmoid': print 'Using sigmoid activation' hidden_activation = T.nnet.sigmoid elif state.act == 'rectifier': print 'Using rectifier activation' hidden_activation = lambda x : T.maximum(cast32(0), x) elif state.act == 'tanh': hidden_activation = lambda x : T.tanh(x) ''' Corrupt X ''' X_corrupt = salt_and_pepper(X, state.input_salt_and_pepper) ''' hidden layer init ''' hiddens = [X_corrupt] p_X_chain = [] print "Hidden units initialization" for w,b in zip(weights_list, bias_list[1:]): # init with zeros print "Init hidden units at zero before creating the graph" hiddens.append(T.zeros_like(T.dot(hiddens[-1], w))) # The layer update scheme print "Building the graph :", N,"updates" for i in range(N): update_layers(hiddens, p_X_chain) # COST AND GRADIENTS print 'Cost w.r.t p(X|...) 
at every step in the graph' #COST = T.mean(T.nnet.binary_crossentropy(reconstruction, X)) COST = [T.mean(T.nnet.binary_crossentropy(rX, X)) for rX in p_X_chain] #COST = [T.mean(T.sqr(rX-X)) for rX in p_X_chain] show_COST = COST[-1] COST = numpy.sum(COST) #COST = T.mean(COST) params = weights_list + bias_list gradient = T.grad(COST, params) gradient_buffer = [theano.shared(numpy.zeros(x.get_value().shape, dtype='float32')) for x in params] m_gradient = [momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient)] g_updates = [(p, p - learning_rate * mg) for (p, mg) in zip(params, m_gradient)] b_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(g_updates + b_updates) f_cost = theano.function(inputs = [X], outputs = show_COST) indexed_batch = train_X[index * state.batch_size : (index+1) * state.batch_size] sampled_batch = MRG.binomial(p = indexed_batch, size = indexed_batch.shape, dtype='float32') f_learn = theano.function(inputs = [index], updates = updates, givens = {X : indexed_batch}, outputs = show_COST) f_test = theano.function(inputs = [X], outputs = [X_corrupt] + hiddens[0] + p_X_chain, on_unused_input = 'warn') ############# # Denoise some numbers : show number, noisy number, reconstructed number ############# import random as R R.seed(1) random_idx = numpy.array(R.sample(range(len(test_X.get_value())), 100)) numbers = test_X.get_value()[random_idx] f_noise = theano.function(inputs = [X], outputs = salt_and_pepper(X, state.input_salt_and_pepper)) noisy_numbers = f_noise(test_X.get_value()[random_idx]) # Recompile the graph without noise for reconstruction function hiddens_R = [X] p_X_chain_R = [] for w,b in zip(weights_list, bias_list[1:]): # init with zeros hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w))) # The layer update scheme for i in range(N): update_layers(hiddens_R, p_X_chain_R, noisy=False) f_recon = theano.function(inputs = [X], outputs = p_X_chain_R[-1]) ############ # Sampling # ############ # 
# -- section 6: sampling machinery.  NOTE(review): in f_test above,
#    '[X_corrupt] + hiddens[0] + p_X_chain' concatenates a list with a single
#    tensor (hiddens[0]); it looks like it should be '+ hiddens +' -- confirm.
#    Below: f_sample2 maps a full network state to its one-step update; the
#    K==1 fast path samples with f_sample_simple + numpy binomial resampling.
update_odd_layers/update_even_layers closures defined earlier drive this.
the input to the sampling function network_state_input = [X] + [T.fmatrix() for i in range(K)] # "Output" state of the network (noisy) # initialized with input, then we apply updates #network_state_output = network_state_input network_state_output = [X] + network_state_input[1:] visible_pX_chain = [] # ONE update update_layers(network_state_output, visible_pX_chain, noisy=True) if K == 1: f_sample_simple = theano.function(inputs = [X], outputs = visible_pX_chain[-1]) # WHY IS THERE A WARNING???? # because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn f_sample2 = theano.function(inputs = network_state_input, outputs = network_state_output + visible_pX_chain, on_unused_input='warn') def sample_some_numbers_single_layer(): x0 = test_X.get_value()[:1] samples = [x0] x = f_noise(x0) for i in range(399): x = f_sample_simple(x) samples.append(x) x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32') x = f_noise(x) return numpy.vstack(samples) def sampling_wrapper(NSI): out = f_sample2(*NSI) NSO = out[:len(network_state_output)] vis_pX_chain = out[len(network_state_output):] return NSO, vis_pX_chain def sample_some_numbers(N=400): # The network's initial state init_vis = test_X.get_value()[:1] noisy_init_vis = f_noise(init_vis) network_state = [[noisy_init_vis] + [numpy.zeros((1,len(b.get_value())), dtype='float32') for b in bias_list[1:]]] visible_chain = [init_vis] noisy_h0_chain = [noisy_init_vis] for i in range(N-1): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def plot_samples(epoch_number): to_sample = time.time() if K == 1: # one layer 
# -- section 7: sample plotting + inpainting (left half resampled, right half
#    clamped to the digit via reset_vis) + parameter pickling helper. --------
model V = sample_some_numbers_single_layer() else: V, H0 = sample_some_numbers() img_samples = PIL.Image.fromarray(tile_raster_images(V, (root_N_input,root_N_input), (20,20))) fname = 'samples_epoch_'+str(epoch_number)+'.png' img_samples.save(fname) print 'Took ' + str(time.time() - to_sample) + ' to sample 400 numbers' ############## # Inpainting # ############## def inpainting(digit): # The network's initial state # NOISE INIT init_vis = cast32(numpy.random.uniform(size=digit.shape)) #noisy_init_vis = f_noise(init_vis) #noisy_init_vis = cast32(numpy.random.uniform(size=init_vis.shape)) # INDEXES FOR VISIBLE AND NOISY PART noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input/2)) fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input/2)) # function to re-init the visible to the same noise # FUNCTION TO RESET HALF VISIBLE TO DIGIT def reset_vis(V): V[0][fixed_idx] = digit[0][fixed_idx] return V # INIT DIGIT : NOISE and RESET HALF TO DIGIT init_vis = reset_vis(init_vis) network_state = [[init_vis] + [numpy.zeros((1,len(b.get_value())), dtype='float32') for b in bias_list[1:]]] visible_chain = [init_vis] noisy_h0_chain = [init_vis] for i in range(49): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # reset half the digit net_state_out[0] = reset_vis(net_state_out[0]) vis_pX_chain[0] = reset_vis(vis_pX_chain[0]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def save_params(n, params): print 'saving parameters...' 
# -- section 8: training loop.  NOTE(review): the valid/test loops slice
#    '[i * 100 : (i+1) * batch_size]' -- a hard-coded 100 stride mixed with
#    batch_size, only consistent when batch_size == 100; looks like a bug.
#    NOTE(review): numpy.clip(0.9, 0.001, mean) passes the scalar as 'a' and
#    the array as 'a_max' (clip signature is (a, a_min, a_max)) -- verify the
#    intended per-pixel init.  'OrderedDict(g_updates + b_updates)' relies on
#    py2 zip returning a list. ----------------------------------------------
save_path = 'params_epoch_'+str(n)+'.pkl' f = open(save_path, 'wb') try: cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL) finally: f.close() # TRAINING n_epoch = state.n_epoch batch_size = state.batch_size STOP = False counter = 0 train_costs = [] valid_costs = [] test_costs = [] if state.vis_init: bias_list[0].set_value(logit(numpy.clip(0.9,0.001,train_X.get_value().mean(axis=0)))) if state.test_model: # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting print 'Testing : skip training' STOP = True while not STOP: counter += 1 t = time.time() print counter,'\t', #train train_cost = [] for i in range(len(train_X.get_value(borrow=True)) / batch_size): #train_cost.append(f_learn(train_X[i * batch_size : (i+1) * batch_size])) #training_idx = numpy.array(range(i*batch_size, (i+1)*batch_size), dtype='int32') train_cost.append(f_learn(i)) train_cost = numpy.mean(train_cost) train_costs.append(train_cost) print 'Train : ',trunc(train_cost), '\t', #valid valid_cost = [] for i in range(len(valid_X.get_value(borrow=True)) / 100): valid_cost.append(f_cost(valid_X.get_value()[i * 100 : (i+1) * batch_size])) valid_cost = numpy.mean(valid_cost) #valid_cost = 123 valid_costs.append(valid_cost) print 'Valid : ', trunc(valid_cost), '\t', #test test_cost = [] for i in range(len(test_X.get_value(borrow=True)) / 100): test_cost.append(f_cost(test_X.get_value()[i * 100 : (i+1) * batch_size])) test_cost = numpy.mean(test_cost) test_costs.append(test_cost) print 'Test : ', trunc(test_cost), '\t', if counter >= n_epoch: STOP = True print 'time : ', trunc(time.time() - t), print 'MeanVisB : ', trunc(bias_list[0].get_value().mean()), print 'W : ', [trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list] if (counter % 5) == 0: # Checking reconstruction reconstructed = f_recon(noisy_numbers) # Concatenate stuff stacked = numpy.vstack([numpy.vstack([numbers[i*10 : (i+1)*10], noisy_numbers[i*10 : (i+1)*10], 
reconstructed[i*10 : (i+1)*10]]) for i in range(10)]) number_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (root_N_input,root_N_input), (10,30))) #epoch_number = reduce(lambda x,y : x + y, ['_'] * (4-len(str(counter)))) + str(counter) number_reconstruction.save('number_reconstruction'+str(counter)+'.png') #sample_numbers(counter, 'seven') plot_samples(counter) #save params save_params(counter, params) # ANNEAL! new_lr = learning_rate.get_value() * annealing learning_rate.set_value(new_lr) # Save state.train_costs = train_costs state.valid_costs = valid_costs state.test_costs = test_costs # if test # 10k samples print 'Generating 10,000 samples' samples, _ = sample_some_numbers(N=10000) f_samples = 'samples.npy' numpy.save(f_samples, samples) print 'saved digits' # parzen print 'Evaluating parzen window' import likelihood_estimation_parzen likelihood_estimation_parzen.main(0.20,'mnist') # Inpainting print 'Inpainting' test_X = test_X.get_value() numpy.random.seed(2) test_idx = numpy.arange(len(test_Y)) for Iter in range(10): numpy.random.shuffle(test_idx) test_X = test_X[test_idx] test_Y = test_Y[test_idx] digit_idx = [(test_Y==i).argmax() for i in range(10)] inpaint_list = [] for idx in digit_idx: DIGIT = test_X[idx:idx+1] V_inpaint, H_inpaint = inpainting(DIGIT) inpaint_list.append(V_inpaint) INPAINTING = numpy.vstack(inpaint_list) plot_inpainting = PIL.Image.fromarray(tile_raster_images(INPAINTING, (root_N_input,root_N_input), (10,50))) fname = 'inpainting_'+str(Iter)+'.png' #fname = os.path.join(state.model_path, fname) plot_inpainting.save(fname) if False and __name__ == "__main__": os.system('eog inpainting.png') if __name__ == '__main__': import ipdb; ipdb.set_trace() return
# ---------------------------------------------------------------------------
# experiment(state, channel): Python-3 port of the experiment() defined
# earlier in this file.  Being the later definition, THIS is the one bound at
# import time.  NOTE(review): source lines were whitespace-mangled by
# extraction; they are kept byte-identical below, with only comments added.
# -- section 1: config reload/save, dataset loading, Theano shareds/symbols.
#    NOTE(review): 'exec(...) in globals(), locals()' is leftover Python 2
#    exec-statement syntax; in py3 it parses as a tuple '(exec(..) in
#    globals(), locals())' and the scope dicts are never passed to exec --
#    confirm the config override still takes effect.
#    NOTE(review): numpy.random.seed(1) is commented out here (unlike the
#    earlier copy), so the train shuffle is unseeded/non-reproducible. --------
def experiment(state, channel): if state.test_model and 'config' in os.listdir('.'): print('Loading local config file') config_file = open('config', 'r') config = config_file.readlines() try: config_vals = config[0].split('(')[1:][0].split(')')[:-1][0].split( ', ') except: config_vals = config[0][3:-1].replace(': ', '=').replace("'", "").split(', ') config_vals = filter( lambda x: not 'jobman' in x and not '/' in x and not ':' in x and not 'experiment' in x, config_vals) for CV in config_vals: print(CV) if CV.startswith('test'): print('Do not override testing switch') continue try: exec('state.' + CV) in globals(), locals() except: exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals() else: # Save the current configuration # Useful for logs/experiments print('Saving config') f = open('config', 'w') f.write(str(state)) f.close() print(state) # Load the data, train = train+valid, and shuffle train # Targets are not used (will be misaligned after shuffling train if state.dataset == 'MNIST': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'MNIST_binary': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_mnist_binary(state.data_path) train_X = numpy.concatenate((train_X, valid_X)) elif state.dataset == 'TFD': (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = load_tfd(state.data_path) N_input = train_X.shape[1] root_N_input = int(numpy.sqrt(N_input)) # numpy.random.seed(1) numpy.random.shuffle(train_X) train_X = theano.shared(train_X) valid_X = theano.shared(valid_X) test_X = theano.shared(test_X) # Theano variables and RNG X = T.fmatrix() # Input of the graph index = T.lscalar() # index to minibatch MRG = RNG_MRG.MRG_RandomStreams(1) # Network and training specifications K = state.K # number of hidden layers N = state.N # number of walkbacks layer_sizes = [ N_input ] + [state.hidden_size ] * K # layer 
# -- section 2: hyperparameter shareds, Glorot-style weight init, optional
#    reload of newest params_epoch_*.pkl (now correctly 'rb' + pk.load with
#    encoding='bytes'), and noise utility closures. --------------------------
sizes, from h0 to hK (h0 is the visible layer) learning_rate = theano.shared(cast32(state.learning_rate)) # learning rate annealing = cast32(state.annealing) # exponential annealing coefficient momentum = theano.shared(cast32(state.momentum)) # momentum term # PARAMETERS : weights list and bias list. # initialize a list of weights and biases based on layer_sizes weights_list = [ get_shared_weights( layer_sizes[i], layer_sizes[i + 1], numpy.sqrt(6. / (layer_sizes[i] + layer_sizes[i + 1])), 'W') for i in range(K) ] bias_list = [get_shared_bias(layer_sizes[i], 'b') for i in range(K + 1)] if state.test_model: # Load the parameters of the last epoch # maybe if the path is given, load these specific attributes param_files = list( filter(lambda x: 'params' in x, os.listdir('.')) ) # https://stackoverflow.com/questions/15876259/typeerror-filter-object-is-not-subscriptable max_epoch_idx = numpy.argmax( [int(x.split('_')[-1].split('.')[0]) for x in param_files]) params_to_load = param_files[max_epoch_idx] with open(params_to_load, 'rb') as f: PARAMS = pk.load(f, encoding='bytes') [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[:len(weights_list)], weights_list) ] [ p.set_value(lp.get_value(borrow=False)) for lp, p in zip(PARAMS[len(weights_list):], bias_list) ] # Util functions def dropout(IN, p=0.5): noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32') OUT = (IN * noise) / cast32(p) return OUT def add_gaussian_noise(IN, std=1): print('GAUSSIAN NOISE : ', std) noise = MRG.normal(avg=0, std=std, size=IN.shape, dtype='float32') OUT = IN + noise return OUT def corrupt_input(IN, p=0.5): # salt and pepper? masking? 
# -- section 3: salt_and_pepper corruption (a=keep mask, b=salt value, so
#    IN*a + (a==0)*b) and the odd/even alternating layer-update loops used by
#    the walkback scheme. ----------------------------------------------------
noise = MRG.binomial(p=p, n=1, size=IN.shape, dtype='float32') IN = IN * noise return IN def salt_and_pepper(IN, p=0.2): # salt and pepper noise print('DAE uses salt and pepper noise') a = MRG.binomial(size=IN.shape, n=1, p=1 - p, dtype='float32') b = MRG.binomial(size=IN.shape, n=1, p=0.5, dtype='float32') c = T.eq(a, 0) * b return IN * a + c # Odd layer update function # just a loop over the odd layers def update_odd_layers(hiddens, noisy): for i in range(1, K + 1, 2): print(i) if noisy: simple_update_layer(hiddens, None, i) else: simple_update_layer(hiddens, None, i, add_noise=False) # Even layer update # p_X_chain is given to append the p(X|...) at each update (one update = odd update + even update) def update_even_layers(hiddens, p_X_chain, noisy): for i in range(0, K + 1, 2): print(i) if noisy: simple_update_layer(hiddens, p_X_chain, i) else: simple_update_layer(hiddens, p_X_chain, i, add_noise=False) # The layer update function # hiddens : list containing the symbolic theano variables [visible, hidden1, hidden2, ...] # layer_update will modify this list inplace # p_X_chain : list containing the successive p(X|...) 
# -- section 4: simple_update_layer -- one in-place update of hiddens[i] from
#    its neighbours (W.T upward, W downward), optional pre/post-activation
#    gaussian noise, sigmoid on the visible layer, and p(X|...) collection
#    when i == 0.  Statement order here IS the algorithm; do not reorder. ----
at each update # update_layer will append to this list # add_noise : pre and post activation gaussian noise def simple_update_layer(hiddens, p_X_chain, i, add_noise=True): # Compute the dot product, whatever layer post_act_noise = 0 if i == 0: hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + bias_list[i] elif i == K: hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i] else: # next layer : layers[i+1], assigned weights : W_i # previous layer : layers[i-1], assigned weights : W_(i-1) hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + T.dot( hiddens[i - 1], weights_list[i - 1]) + bias_list[i] # Add pre-activation noise if NOT input layer if i == 1 and state.noiseless_h1: print('>>NO noise in first layer') add_noise = False # pre activation noise if i != 0 and add_noise: print('Adding pre-activation gaussian noise') hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # ACTIVATION! if i == 0: print('Sigmoid units') hiddens[i] = T.nnet.sigmoid(hiddens[i]) else: print('Hidden units') hiddens[i] = hidden_activation(hiddens[i]) # post activation noise if i != 0 and add_noise: print('Adding post-activation gaussian noise') hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma) # build the reconstruction chain if i == 0: # if input layer -> append p(X|...) p_X_chain.append(hiddens[i]) # sample from p(X|...) 
# -- section 5: resample/corrupt the visible layer after its update, then
#    build the training graph: activation choice, corrupted input, zero-init
#    hiddens, N walkback updates accumulating p_X_chain.  NOTE(review): the
#    bare 'print' inside update_layers is a py2 blank-line print; in py3 it
#    just references the builtin and prints nothing -- harmless but dead. ----
if state.input_sampling: print('Sampling from input') sampled = MRG.binomial(p=hiddens[i], size=hiddens[i].shape, dtype='float32') else: print('>>NO input sampling') sampled = hiddens[i] # add noise sampled = salt_and_pepper(sampled, state.input_salt_and_pepper) # set input layer hiddens[i] = sampled def update_layers(hiddens, p_X_chain, noisy=True): print('odd layer update') update_odd_layers(hiddens, noisy) print print('even layer update') update_even_layers(hiddens, p_X_chain, noisy) ''' F PROP ''' if state.act == 'sigmoid': print('Using sigmoid activation') hidden_activation = T.nnet.sigmoid elif state.act == 'rectifier': print('Using rectifier activation') hidden_activation = lambda x: T.maximum(cast32(0), x) elif state.act == 'tanh': hidden_activation = lambda x: T.tanh(x) ''' Corrupt X ''' X_corrupt = salt_and_pepper(X, state.input_salt_and_pepper) ''' hidden layer init ''' hiddens = [X_corrupt] p_X_chain = [] print("Hidden units initialization") for w, b in zip(weights_list, bias_list[1:]): # init with zeros print("Init hidden units at zero before creating the graph") hiddens.append(T.zeros_like(T.dot(hiddens[-1], w))) # The layer update scheme print("Building the graph :", N, "updates") for i in range(N): update_layers(hiddens, p_X_chain) # COST AND GRADIENTS print('Cost w.r.t p(X|...) 
at every step in the graph') #COST = T.mean(T.nnet.binary_crossentropy(reconstruction, X)) COST = [T.mean(T.nnet.binary_crossentropy(rX, X)) for rX in p_X_chain] #COST = [T.mean(T.sqr(rX-X)) for rX in p_X_chain] show_COST = COST[-1] COST = numpy.sum(COST) #COST = T.mean(COST) params = weights_list + bias_list print('======== COST:', COST) print('======== params:', params) gradient = T.grad(COST, params) gradient_buffer = [ theano.shared(numpy.zeros(x.get_value().shape, dtype='float32')) for x in params ] m_gradient = [ momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient) ] g_updates = [(p, p - learning_rate * mg) for (p, mg) in zip(params, m_gradient)] b_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(g_updates + list(b_updates)) f_cost = theano.function(inputs=[X], outputs=show_COST) indexed_batch = train_X[index * state.batch_size:(index + 1) * state.batch_size] sampled_batch = MRG.binomial(p=indexed_batch, size=indexed_batch.shape, dtype='float32') f_learn = theano.function(inputs=[index], updates=updates, givens={X: indexed_batch}, outputs=show_COST) f_test = theano.function(inputs=[X], outputs=[X_corrupt] + hiddens[0] + p_X_chain, on_unused_input='warn') ############# # Denoise some numbers : show number, noisy number, reconstructed number ############# import random as R R.seed(1) random_idx = numpy.array(R.sample(range(len(test_X.get_value())), 100)) numbers = test_X.get_value()[random_idx] f_noise = theano.function(inputs=[X], outputs=salt_and_pepper( X, state.input_salt_and_pepper)) noisy_numbers = f_noise(test_X.get_value()[random_idx]) # Recompile the graph without noise for reconstruction function hiddens_R = [X] p_X_chain_R = [] for w, b in zip(weights_list, bias_list[1:]): # init with zeros hiddens_R.append(T.zeros_like(T.dot(hiddens_R[-1], w))) # The layer update scheme for i in range(N): update_layers(hiddens_R, p_X_chain_R, noisy=False) f_recon = theano.function(inputs=[X], 
# -- section 6: sampling machinery.  NOTE(review): f_test above still
#    concatenates '[X_corrupt] + hiddens[0] + p_X_chain' where hiddens[0] is
#    a single tensor, not a list -- same suspect spot as the earlier copy;
#    likely intended '+ hiddens +'.  The two debug prints '======== COST/
#    params' above look like leftover instrumentation. -----------------------
outputs=p_X_chain_R[-1]) ############ # Sampling # ############ # the input to the sampling function network_state_input = [X] + [T.fmatrix() for i in range(K)] # "Output" state of the network (noisy) # initialized with input, then we apply updates #network_state_output = network_state_input network_state_output = [X] + network_state_input[1:] visible_pX_chain = [] # ONE update update_layers(network_state_output, visible_pX_chain, noisy=True) if K == 1: f_sample_simple = theano.function(inputs=[X], outputs=visible_pX_chain[-1]) # WHY IS THERE A WARNING???? # because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn f_sample2 = theano.function(inputs=network_state_input, outputs=network_state_output + visible_pX_chain, on_unused_input='warn') def sample_some_numbers_single_layer(): x0 = test_X.get_value()[:1] samples = [x0] x = f_noise(x0) for i in range(399): x = f_sample_simple(x) samples.append(x) x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32') x = f_noise(x) return numpy.vstack(samples) def sampling_wrapper(NSI): out = f_sample2(*NSI) NSO = out[:len(network_state_output)] vis_pX_chain = out[len(network_state_output):] return NSO, vis_pX_chain def sample_some_numbers(N=400): # The network's initial state init_vis = test_X.get_value()[:1] noisy_init_vis = f_noise(init_vis) network_state = [[noisy_init_vis] + [ numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:] ]] visible_chain = [init_vis] noisy_h0_chain = [noisy_init_vis] for i in range(N - 1): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def 
plot_samples(epoch_number): to_sample = time.time() if K == 1: # one layer model V = sample_some_numbers_single_layer() else: V, H0 = sample_some_numbers() img_samples = PIL.Image.fromarray( tile_raster_images(V, (root_N_input, root_N_input), (20, 20))) fname = 'samples_epoch_' + str(epoch_number) + '.png' img_samples.save(fname) print('Took ' + str(time.time() - to_sample) + ' to sample 400 numbers') ############## # Inpainting # ############## def inpainting(digit): # The network's initial state # NOISE INIT init_vis = cast32(numpy.random.uniform(size=digit.shape)) #noisy_init_vis = f_noise(init_vis) #noisy_init_vis = cast32(numpy.random.uniform(size=init_vis.shape)) # INDEXES FOR VISIBLE AND NOISY PART noise_idx = (numpy.arange(N_input) % root_N_input < (root_N_input / 2)) fixed_idx = (numpy.arange(N_input) % root_N_input > (root_N_input / 2)) # function to re-init the visible to the same noise # FUNCTION TO RESET HALF VISIBLE TO DIGIT def reset_vis(V): V[0][fixed_idx] = digit[0][fixed_idx] return V # INIT DIGIT : NOISE and RESET HALF TO DIGIT init_vis = reset_vis(init_vis) network_state = [[init_vis] + [ numpy.zeros((1, len(b.get_value())), dtype='float32') for b in bias_list[1:] ]] visible_chain = [init_vis] noisy_h0_chain = [init_vis] for i in range(49): # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # reset half the digit net_state_out[0] = reset_vis(net_state_out[0]) vis_pX_chain[0] = reset_vis(vis_pX_chain[0]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain) def save_params(n, params): print('saving parameters...') save_path = 'params_epoch_' + str(n) + '.pkl' f = open(save_path, 'wb') try: pk.dump(params, f, protocol=pk.HIGHEST_PROTOCOL) 
# -- section 8: training loop + post-training sampling/parzen/inpainting.
#    NOTE(review): the valid/test loops still slice
#    '[i * 100:(i + 1) * batch_size]' -- hard-coded 100 stride mixed with
#    batch_size, only consistent when batch_size == 100; same bug as the
#    earlier copy.  NOTE(review): numpy.clip(0.9, 0.001, mean) passes the
#    scalar as 'a' and the array as 'a_max' (signature (a, a_min, a_max)).
#    NOTE(review): the 'print(x, '\t', )' forms are mechanical ports of py2
#    trailing-comma prints; in py3 each emits its own newline, so the
#    one-line-per-epoch log becomes multi-line -- confirm intended output.
#    NOTE(review): 'import ipdb; ipdb.set_trace()' at the end is a debugger
#    leftover (third-party dep) guarded by __name__ inside the function. -----
finally: f.close() # TRAINING n_epoch = state.n_epoch batch_size = state.batch_size STOP = False counter = 0 train_costs = [] valid_costs = [] test_costs = [] if state.vis_init: bias_list[0].set_value( logit(numpy.clip(0.9, 0.001, train_X.get_value().mean(axis=0)))) if state.test_model: # If testing, do not train and go directly to generating samples, parzen window estimation, and inpainting print('Testing : skip training') STOP = True while not STOP: counter += 1 t = time.time() print( counter, '\t', ) #train train_cost = [] for i in range(len(train_X.get_value(borrow=True)) // batch_size): #train_cost.append(f_learn(train_X[i * batch_size : (i+1) * batch_size])) #training_idx = numpy.array(range(i*batch_size, (i+1)*batch_size), dtype='int32') train_cost.append(f_learn(i)) train_cost = numpy.mean(train_cost) train_costs.append(train_cost) print( 'Train : ', trunc(train_cost), '\t', ) #valid valid_cost = [] for i in range(len(valid_X.get_value(borrow=True)) // 100): valid_cost.append( f_cost(valid_X.get_value()[i * 100:(i + 1) * batch_size])) valid_cost = numpy.mean(valid_cost) #valid_cost = 123 valid_costs.append(valid_cost) print( 'Valid : ', trunc(valid_cost), '\t', ) #test test_cost = [] for i in range(len(test_X.get_value(borrow=True)) // 100): test_cost.append( f_cost(test_X.get_value()[i * 100:(i + 1) * batch_size])) test_cost = numpy.mean(test_cost) test_costs.append(test_cost) print( 'Test : ', trunc(test_cost), '\t', ) if counter >= n_epoch: STOP = True print( 'time : ', trunc(time.time() - t), ) print( 'MeanVisB : ', trunc(bias_list[0].get_value().mean()), ) print('W : ', [ trunc(abs(w.get_value(borrow=True)).mean()) for w in weights_list ]) if (counter % 5) == 0: # Checking reconstruction reconstructed = f_recon(noisy_numbers) # Concatenate stuff stacked = numpy.vstack([ numpy.vstack([ numbers[i * 10:(i + 1) * 10], noisy_numbers[i * 10:(i + 1) * 10], reconstructed[i * 10:(i + 1) * 10] ]) for i in range(10) ]) number_reconstruction = PIL.Image.fromarray( 
tile_raster_images(stacked, (root_N_input, root_N_input), (10, 30))) #epoch_number = reduce(lambda x,y : x + y, ['_'] * (4-len(str(counter)))) + str(counter) number_reconstruction.save('number_reconstruction' + str(counter) + '.png') #sample_numbers(counter, 'seven') plot_samples(counter) #save params save_params(counter, params) # ANNEAL! new_lr = learning_rate.get_value() * annealing learning_rate.set_value(new_lr) # Save state.train_costs = train_costs state.valid_costs = valid_costs state.test_costs = test_costs # if test # 10k samples print('Generating 10,000 samples') samples, _ = sample_some_numbers(N=10000) f_samples = 'samples.npy' numpy.save(f_samples, samples) print('saved digits') # parzen print('Evaluating parzen window') import likelihood_estimation_parzen likelihood_estimation_parzen.main(0.20, 'mnist') # Inpainting print('Inpainting') test_X = test_X.get_value() numpy.random.seed(2) test_idx = numpy.arange(len(test_Y)) for Iter in range(10): numpy.random.shuffle(test_idx) test_X = test_X[test_idx] test_Y = test_Y[test_idx] digit_idx = [(test_Y == i).argmax() for i in range(10)] inpaint_list = [] for idx in digit_idx: DIGIT = test_X[idx:idx + 1] V_inpaint, H_inpaint = inpainting(DIGIT) inpaint_list.append(V_inpaint) INPAINTING = numpy.vstack(inpaint_list) plot_inpainting = PIL.Image.fromarray( tile_raster_images(INPAINTING, (root_N_input, root_N_input), (10, 50))) fname = 'inpainting_' + str(Iter) + '.png' #fname = os.path.join(state.model_path, fname) plot_inpainting.save(fname) if False and __name__ == "__main__": os.system('eog inpainting.png') if __name__ == '__main__': import ipdb ipdb.set_trace() return