def End2end_Early_stopping(self, numpy_rng, dataset, n_validate, data_name,
                           batch_size, end2end_lr, algo, norm, patience, validation):
    train_X, test_X, actual = dataset
    valid_x = train_X.get_value()[:n_validate]
    train_x = train_X.get_value()[n_validate:]
    #train_x = train_x[:100]

    # Compute tm and vm before the optimization process
    t = theano.shared(numpy.asarray(train_x, dtype=theano.config.floatX), borrow=True)
    v = theano.shared(numpy.asarray(valid_x, dtype=theano.config.floatX), borrow=True)

    # Use downhill for training the network
    opt = downhill.build(algo=algo,
                         params=self.params,
                         loss=self.end2end_cost,
                         inputs=[self.x])
    train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
    valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

    # Monitoring before the optimization process
    stop_ep = 0
    for tm1, vm1 in opt.iterate(train,
                                valid,
                                patience=patience,
                                validate_every=validation,
                                min_improvement=1e-3,
                                #learning_rate=end2end_lr,
                                momentum=0.0,
                                nesterov=False):
        stop_ep = stop_ep + 1

        # ******* Classification results after end-to-end training *******
        # if ((stop_ep % 1 == 0) and (stop_ep > 0)):
        #     lof, cen, dis, kde, svm05, svm01, ae = self.Compute_AUC_Hidden(
        #         train_X, test_X, actual, norm, data_name)
        #     a = [stop_ep, lof, cen, dis, kde, svm05, svm01, ae]
        #     monitor = np.append(monitor, a)

        if (stop_ep >= 1000):
            break

    # Plotting AUC and saving to a csv file
    # monitor = np.reshape(monitor, (-1, 8))
    # Plotting_Monitor(monitor, 0.4, 1.0, data_name, path)
    # np.savetxt(path + data_name + "_monitor_auc.csv", monitor, delimiter=",", fmt='%f')

    return [stop_ep, vm1['loss'], tm1['loss']]
def End2end_Early_stopping(self, numpy_rng, dataset, n_validate, data_name,
                           batch_size, end2end_lr, algo, norm, patience, validation):
    train_X, test_X, actual = dataset
    valid_x = train_X.get_value()[:n_validate]
    train_x = train_X.get_value()[n_validate:]

    # Compute tm and vm before the optimization process
    # Training the network with downhill
    # Available algorithms: 'adadelta', 'adagrad' (default 0.01), 'adam',
    # 'esgd', 'nag', 'rmsprop', 'rprop', 'sgd'
    opt = downhill.build(algo=algo,
                         params=self.params,
                         loss=self.end2end_cost,
                         inputs=[self.x])
    train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
    valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

    # ***** Monitor before optimization *****
    stop_ep = 0
    RE = np.empty([0, 3])
    for tm, vm in opt.iterate(train,                      # 5, 5, 1e-2, 0.9
                              valid,
                              patience=patience,          # 10
                              validate_every=validation,  # 5
                              min_improvement=1e-3,       # 1e-3
                              #learning_rate=end2end_lr,  # 1e-4
                              momentum=0.0,
                              nesterov=False):
        stop_ep = stop_ep + 1
        re = np.column_stack([stop_ep, vm['loss'], tm['loss']])
        RE = np.append(RE, re)
        if (stop_ep >= 1000):
            break

    RE = np.reshape(RE, (-1, 3))
    Plotting_End2End_RE(RE, stop_ep, 0.0, 0.4, data_name, path)
    np.savetxt(path + data_name + "_training_error1.csv", RE, delimiter=",", fmt='%f')
    np.set_printoptions(precision=6, suppress=True)
    print("\n ", RE[stop_ep - 1])
    return RE[stop_ep - 1]
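# A standalone sketch of the build/Dataset/iterate early-stopping pattern used in
# the two methods above, assuming only downhill's public API. The toy linear
# autoencoder here (x, W, the loss) is illustrative and not part of the original
# code, which relies on class internals (self.params, self.end2end_cost).
import numpy as np
import theano
import theano.tensor as TT
import downhill

x = TT.matrix('x')
W = theano.shared(0.01 * np.random.randn(10, 10).astype('float32'), name='W')
loss = TT.sqr(TT.dot(TT.dot(x, W), W.T) - x).mean()

data = np.random.randn(200, 10).astype('float32')
train = downhill.Dataset(data[:150], batch_size=32)
valid = downhill.Dataset(data[150:], batch_size=50)

opt = downhill.build('adadelta', loss=loss, params=[W], inputs=[x])
for tm, vm in opt.iterate(train, valid,
                          patience=10,
                          validate_every=5,
                          min_improvement=1e-3,
                          max_updates=1000):
    pass  # iteration stops once the validation loss stops improving
print('final validation loss:', vm['loss'])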
def test_batch_size(self):
    ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=10, rng=4)
    assert len(ds._batches) == 4
    assert ds._batches[0][0].shape == (10, 2)
    assert ds._batches[1][0].shape == (10, 2)
    assert ds._batches[2][0].shape == (10, 2)
    assert ds._batches[3][0].shape == (10, 2)
    ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=11, rng=4)
    assert len(ds._batches) == 4
    assert ds._batches[0][0].shape == (11, 2)
    assert ds._batches[1][0].shape == (11, 2)
    assert ds._batches[2][0].shape == (7, 2)
    assert ds._batches[3][0].shape == (11, 2)
def test_batch_size(self):
    ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=10, rng=4)
    assert len(ds._slices) == 4
    assert_size(ds, 0, 10)
    assert_size(ds, 1, 10)
    assert_size(ds, 2, 10)
    assert_size(ds, 3, 10)
    ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=11, rng=4)
    assert len(ds._slices) == 4
    assert_size(ds, 0, 11)
    assert_size(ds, 1, 11)
    assert_size(ds, 2, 7)
    assert_size(ds, 3, 11)
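# A minimal usage sketch of the behavior the two tests above exercise, assuming
# only downhill's public API: Dataset slices an array into shuffled mini-batches
# along axis 0, and iterating over it yields one tuple of arrays per batch.
import numpy as np
import downhill

ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=10, rng=4)
for (batch,) in ds:
    print(batch.shape)  # expect (10, 2) for each of the 4 batches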
def pretrain_Early_stopping(self, numpy_rng, train_set, n_validate, data_name,
                            batch_size, pre_lr, corruptions):
    RE = np.empty([10000, self.n_layers])
    stop_epoch = np.empty([self.n_layers])
    for i in range(self.n_layers):
        cost, updates = self.dA_layers[i].get_cost_updates(corruptions[i], pre_lr)
        if (i == 0):
            train_x1 = train_set.get_value()
        else:
            train_x1 = self.get_hidden_i(train_set, i - 1)
        valid_x = train_x1[:n_validate]
        train_x = train_x1[n_validate:]

        # Available algorithms: 'adadelta', 'adagrad' (default 0.01), 'adam',
        # 'esgd', 'nag', 'rmsprop', 'rprop', 'sgd'
        opt = downhill.build(algo='sgd',
                             params=self.dA_layers[i].params,
                             loss=cost)
        train = downhill.Dataset(train_x, batch_size=batch_size, rng=numpy_rng)
        valid = downhill.Dataset(valid_x, batch_size=len(valid_x), rng=numpy_rng)

        epoch = 0
        re = np.empty([10000])
        for tm1, vm1 in opt.iterate(train,
                                    valid,
                                    patience=100,          # 100
                                    validate_every=5,      # 5
                                    min_improvement=1e-3,  # 4
                                    learning_rate=pre_lr,  # 1e-2
                                    momentum=0.0,
                                    nesterov=False):
            re[epoch] = tm1['loss']
            epoch = epoch + 1
            if (epoch == 200):
                break

        RE[:, i] = re
        stop_epoch[i] = epoch

    print(' + Stopping epoch:', stop_epoch)
    Plotting_Pre_RE1(RE, stop_epoch, self.n_layers, 0.0, 0.1, batch_size, data_name, path)
def fit(self, train, entities, relations, param):
    self.n, self.m, self.l, self.k = entities, relations, entities, param.k
    self.setup(param)
    train, inputs = self.minibatch(train, param)

    opt = downhill.build(param.sgd,
                         loss=self.loss_opt,
                         inputs=inputs,
                         monitor_gradients=True)
    train = downhill.Dataset(train, name='train')

    it = 0
    for _ in opt.iterate(train, None,
                         max_updates=param.epoch,
                         validate_every=10,
                         patience=5,
                         max_gradient_norm=1,
                         learning_rate=param.lr):
        it += 1
        if it >= param.epoch:
            break
def fit(self, train_triples, valid_triples, hparams, n=0, m=0, l=0, scorer=None):
    # Set input dimensions:
    if n == 0:
        # No dimensions given; useful for transparent prediction of entities/rels not seen in train
        self.set_dims(train_triples, hparams)
    else:
        self.n, self.m, self.l, self.k = n, m, l, hparams.embedding_size

    # Define the downhill loss corresponding to the input dimensions
    self.setup_params_for_train(train_triples, valid_triples, hparams)

    # Get the loss inputs:
    train_vals, train_symbs, valid_vals = self.get_loss_args_and_symb_vars(
        train_triples, valid_triples, hparams)

    opt = downhill.build(hparams.learning_rate_policy,
                         loss=self.loss_to_opt,
                         inputs=train_symbs,
                         monitor_gradients=True)
    train_vals = downhill.Dataset(train_vals, name='train')

    # Main SGD loop
    it = 0
    best_valid_mrr = -1
    best_valid_ap = -1
    for tm, vm in opt.iterate(train_vals, None,
                              max_updates=hparams.max_iter,
                              # Validation is handled below with validation metrics instead of the loss
                              validate_every=9999999,
                              # Number of tolerated improvements of the validation loss below min_improvement
                              patience=9999999,
                              max_gradient_norm=1,  # Prevent gradient explosion!
                              learning_rate=hparams.learning_rate):
        if it % hparams.valid_scores_every == 0 and scorer is not None:
            if valid_triples is not None:
                logger.info("Validation metrics:")
                res = scorer.compute_scores(self, self.name, hparams, valid_triples)
                cv_res = CV_Results()
                cv_res.add_res(res, self.name, hparams.embedding_size,
                               hparams.lmbda, self.nb_params)

                if scorer.compute_ranking_scores:
                    metrics = cv_res.print_MRR_and_hits()
                    # Early stopping on filtered MRR
                    if best_valid_mrr >= metrics[self.name][2]:
                        logger.info("Validation filtered MRR decreased, stopping here.")
                        break
                    else:
                        best_valid_mrr = metrics[self.name][2]
                else:
                    logger.info("Validation AP: " + str(res.ap))
                    # Early stopping on Average Precision
                    if best_valid_ap >= res.ap:
                        logger.info("Validation AP decreased, stopping here.")
                        break
                    else:
                        best_valid_ap = res.ap

        it += 1
        if it >= hparams.max_iter:
            # Avoid downhill resetting the parameters when max_iter is reached
            break
def test_iteration_size(self):
    def batches_unchanged(previous):
        return all(np.allclose(a, b) for a, b in zip(ds._batches, previous))

    ds = downhill.Dataset([np.random.randn(40, 2)], batch_size=5, iteration_size=3)

    previous = list(ds._batches)
    c = sum(1 for _ in ds)
    assert c == 3, 'got {}'.format(c)
    assert ds._index == 3, 'got {}'.format(ds._index)
    assert batches_unchanged(previous)

    previous = list(ds._batches)
    c = sum(1 for _ in ds)
    assert c == 3
    assert ds._index == 6, 'got {}'.format(ds._index)
    assert batches_unchanged(previous)

    previous = list(ds._batches)
    c = sum(1 for _ in ds)
    assert c == 3
    assert ds._index == 1, 'got {}'.format(ds._index)
    assert not batches_unchanged(previous)
def create_dataset(self, data, **kwargs):
    '''Create a dataset for this experiment.

    Parameters
    ----------
    data : sequence of ndarray or callable
        The values that you provide for data will be encapsulated inside a
        :class:`Dataset <downhill.Dataset>` instance; see that class for
        documentation on the types of things it needs. In particular, you
        can currently pass in either a list/array/etc. of data, or a
        callable that generates data dynamically.

    Returns
    -------
    data : :class:`Dataset <downhill.Dataset>`
        A dataset capable of providing mini-batches of data to a training
        algorithm.
    '''
    default_axis = 0
    if not callable(data) and not callable(data[0]) and len(data[0].shape) == 3:
        default_axis = 1
    name = kwargs.get('name', 'dataset')
    b, i, s = 'batch_size', 'iteration_size', '{}_batches'.format(name)
    return downhill.Dataset(data,
                            name=name,
                            batch_size=kwargs.get(b, 32),
                            iteration_size=kwargs.get(i, kwargs.get(s)),
                            axis=kwargs.get('axis', default_axis))
def dnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rprop'):
    # GPU cached data
    _M = theano.shared(M.T.astype(float64))
    dum = Th.vector('dum')

    # Get layer sizes
    K = []
    for i in range(len(W1)):
        K.append([W1[i].shape[0], W2[i].shape[0]])
    K.append([M.T.shape[1], M.T.shape[1]])

    # We have weights to discover, init = 2/(Nin+Nout)
    H = theano.shared(
        sqrt(2. / (K[0][0] + K[0][1] + M.shape[1])) *
        random.rand(M.T.shape[0], K[0][0] + K[0][1]).astype(float64))
    fI = InputLayer(shape=(M.T.shape[0], K[0][0] + K[0][1]), input_var=H)

    # Split in two pathways, one for each source's autoencoder
    H1 = (len(W1) + 1) * [None]
    H2 = (len(W1) + 1) * [None]
    H1[0] = SliceLayer(fI, indices=slice(0, K[0][0]), axis=1)
    H2[0] = SliceLayer(fI, indices=slice(K[0][0], K[0][0] + K[0][1]), axis=1)

    # Put the subsequent layers
    for i in range(len(W1)):
        H1[i + 1] = DenseLayer(H1[i],
                               num_units=K[i + 1][0],
                               W=W1[i].astype(float64),
                               nonlinearity=lambda x: psoftplus(x, spb),
                               b=None)
        H2[i + 1] = DenseLayer(H2[i],
                               num_units=K[i + 1][1],
                               W=W2[i].astype(float64),
                               nonlinearity=lambda x: psoftplus(x, spb),
                               b=None)

    # Add the two approximations
    R = ElemwiseSumLayer([H1[-1], H2[-1]])

    # Cost function
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) + 0 * Th.mean(dum)
    for i in range(len(H1) - 1):
        cost += sp * Th.mean(abs(get_output(H1[i]))) + sp * Th.mean(abs(get_output(H2[i])))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([d]).astype(float64), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    # Get outputs
    _r = nget(R, dum, array([0]).astype(float64)).T + eps
    _r1 = nget(H1[-1], dum, array([0]).astype(float64)).T
    _r2 = nget(H2[-1], dum, array([0]).astype(float64)).T

    return _r, _r1, _r2, er
def dnn_model(M, K=[20, 20], hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3, bt=0, al='rprop'):
    # Sort out the activation
    from inspect import isfunction
    if isfunction(spb):
        act = spb
    else:
        act = lambda x: psoftplus(x, spb)

    # Copy key variables to GPU
    _M = Th.matrix('_M')

    # Input and forward transform
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # Set up the layers
    L = K + [M.T.shape[1]]
    H = len(L) * [None]
    Hd = len(L) * [None]

    # First layer
    H[0] = DenseLayer(I, num_units=K[0], nonlinearity=act, b=None)

    # All the rest
    for k in range(1, len(L)):
        # Optional dropout
        Hd[k - 1] = DropoutLayer(H[k - 1], d)
        # Next layer
        H[k] = DenseLayer(Hd[k - 1], num_units=L[k], nonlinearity=act, b=None)

    # Cost function
    Ro = get_output(H[-1]) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro)
    for k in range(len(L) - 1):
        cost += wsp * Th.mean(abs(H[k].W)) + hsp * Th.mean(get_output(H[k]))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(H[-1]))
    train = downhill.Dataset(M.T.astype(float64), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Get approximation
    h = [nget(H[k], _M, M.T.astype(float64)).T for k in range(len(L))]
    w = [H[k].W.get_value() for k in range(len(L))]

    return h, w, er
def downhill_models(M, P, FE, z, K=20, hh=.001, ep=5000, dp=0, wsp=.001, plt=False):
    from paris.signal import bss_eval

    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Shared variables to use
    x = Th.matrix('x')
    y = theano.shared(M.astype(theano.config.floatX))
    d = theano.shared(float32(dp))

    # Network weights
    W0 = theano.shared(
        sqrt(2. / (K + M.shape[0])) *
        random.randn(K, M.shape[0]).astype(theano.config.floatX))
    W1 = theano.shared(
        sqrt(2. / (K + M.shape[0])) *
        random.randn(M.shape[0], K).astype(theano.config.floatX))

    # First layer is the transform to a non-negative subspace
    h = psoftplus(W0.dot(x), 3.)

    # Dropout
    if dp > 0:
        h *= (1. / (1. - d) * (rng.uniform(size=h.shape) > d).astype(
            theano.config.floatX)).astype(theano.config.floatX)

    # Second layer reconstructs the input
    r = psoftplus(W1.dot(h), 3.)

    # Approximate input using a KL-like distance
    cost = Th.mean(y * (Th.log(y + eps) - Th.log(r + eps)) - y + r) + wsp * Th.mean(abs(W1))

    # Make an optimizer and define the training input
    opt = downhill.build('rprop', loss=cost, inputs=[x], params=[W0, W1])
    train = downhill.Dataset(M.astype(theano.config.floatX), batch_size=0)

    # Train it
    downhill_train(opt, train, hh, ep, None)

    # Get approximation
    d = 0
    _, _r = theano.function(inputs=[x], outputs=[h, r],
                            updates=[])(M.astype(theano.config.floatX))
    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))

    return W1.get_value(), sxr
def build_model(algo):
    loss_value = []
    W1.set_value(W1_val)
    b1.set_value(b1_val)
    W2.set_value(W2_val)
    b2.set_value(b2_val)

    opt = downhill.build(algo, loss=loss)
    train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]],
                             batch_size=1,
                             iteration_size=1)
    valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]])

    iterations = 0
    for tm, vm in opt.iterate(train, valid, patience=1000):
        iterations += 1
        loss_value.append(vm['loss'])
        if iterations > 1000:
            break
    return loss_value
def cnn_model(M, K=20, T=1, hh=.0001, ep=5000, d=0, hsp=0.0001, wsp=0, spb=3, bt=0, al='rprop'):
    # Facilitate a reasonable convolution core
    theano.config.dnn.conv.algo_fwd = 'fft_tiling'
    theano.config.dnn.conv.algo_bwd_filter = 'none'
    theano.config.dnn.conv.algo_bwd_data = 'none'

    # Reformat input data
    M3 = reshape(M.astype(float32), (1, M.shape[0], M.shape[1]))

    # Copy key variables to GPU
    _M = Th.tensor3('_M')

    # Input and forward transform
    I = InputLayer(shape=M3.shape, input_var=_M)

    # First layer is the transform to a non-negative subspace
    H = Conv1DLayer(I,
                    filter_size=T,
                    num_filters=K,
                    pad='same',
                    nonlinearity=lambda x: psoftplus(x, spb),
                    b=None)

    # Upper layer is the synthesizer
    R = Conv1DLayer(H,
                    filter_size=T,
                    num_filters=M.shape[0],
                    pad='same',
                    nonlinearity=lambda x: psoftplus(x, spb),
                    b=None)

    # Cost function
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) \
        + hsp * Th.mean(get_output(H))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M3, batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Get approximation and hidden state
    _r = squeeze(nget(R, _M, M3))
    _h = squeeze(nget(H, _M, M3))

    return _r, R.W.get_value(), er, _h
def create_dataset(data, **kwargs):
    name = kwargs.get('name', 'dataset')
    s = '{}_batches'.format(name)
    return downhill.Dataset(data,
                            name=name,
                            batch_size=kwargs.get('batch_size', 32),
                            iteration_size=kwargs.get('iteration_size', kwargs.get(s)),
                            axis=kwargs.get('axis', 0),
                            rng=kwargs['rng'])
def test_shared(self):
    x = theano.shared(np.random.randn(40, 2))
    ds = downhill.Dataset([x], batch_size=10, rng=4)
    assert len(ds._slices) == 4
    assert_size(ds, 0, 10)
    assert_size(ds, 1, 10)
    assert_size(ds, 2, 10)
    assert_size(ds, 3, 10)
    f = list(ds)[0][0]
    assert isinstance(f, TT.TensorVariable), type(f)
def test_sparse_csr(self):
    import scipy.sparse as ss
    x = ss.csr_matrix(np.random.randn(40, 2))
    ds = downhill.Dataset([x], batch_size=10, rng=4)
    assert len(ds._slices) == 4
    assert_size(ds, 0, 10)
    assert_size(ds, 1, 10)
    assert_size(ds, 2, 10)
    assert_size(ds, 3, 10)
    f = list(ds)[0][0]
    assert isinstance(f, ss.csr.csr_matrix), type(f)
def test_pandas(self):
    import pandas as pd
    x = pd.DataFrame(np.random.randn(40, 2))
    ds = downhill.Dataset([x], batch_size=10, rng=4)
    assert len(ds._slices) == 4
    assert_size(ds, 0, 10)
    assert_size(ds, 1, 10)
    assert_size(ds, 2, 10)
    assert_size(ds, 3, 10)
    f = list(ds)[0][0]
    assert isinstance(f, pd.DataFrame), type(f)
def lasagne_models(M, P, FE, z, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, plt=True):
    from paris.signal import bss_eval

    # Copy key variables to GPU
    _M = Th.matrix('_M')

    # Input and forward transform
    I = InputLayer(shape=M.T.shape, input_var=_M)

    # First layer is the transform to a non-negative subspace
    H0 = DenseLayer(I, num_units=K, nonlinearity=lambda x: psoftplus(x, 3.), b=None)

    # Optional dropout
    H = DropoutLayer(H0, d)

    # Compute source modulator
    R = DenseLayer(H, num_units=M.T.shape[1],
                   nonlinearity=lambda x: psoftplus(x, 3.), b=None)

    # Cost function
    cost = (_M * (Th.log(_M + eps) - Th.log(get_output(R) + eps)) - _M + get_output(R)).mean() \
        + wsp * Th.mean(abs(R.W))

    # Train it using downhill
    opt = downhill.build('rprop', loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get approximation
    _r = nget(R, _M, M.T.astype(float32)).T
    _h = nget(H, _M, M.T.astype(float32)).T
    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))

    return R, sxr
def test_callable_length(self):
    class Batches:
        called = 0

        def __call__(self):
            self.called += 1
            return 'hello'

        def __len__(self):
            return 10

    batches = Batches()
    ds = downhill.Dataset(batches, iteration_size=10)
    assert list(ds) == ['hello'] * 10
    assert batches.called == 10
def test_minimize(self):
    x = theano.shared(-3 + np.zeros((2, ), 'f'), name='x')
    data = downhill.Dataset(np.zeros((1, 1), 'f'), batch_size=1)
    data._slices = [[]]
    downhill.minimize(
        (100 * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum(),
        data,
        algo='nag',
        learning_rate=0.001,
        momentum=0.9,
        patience=1,
        min_improvement=0.1,
        max_gradient_norm=1,
    )
    assert np.allclose(x.get_value(), [1, 1]), x.get_value()
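# A hedged sketch of the same downhill.minimize pattern outside the test
# harness: minimize a simple quadratic in a shared variable. The dummy dataset
# and the data._slices = [[]] trick mirror test_minimize above; the toy loss,
# variable names, and hyperparameters here are illustrative assumptions.
import numpy as np
import theano
import downhill

w = theano.shared(np.float32(5.0), name='w')
data = downhill.Dataset(np.zeros((1, 1), 'f'), batch_size=1)
data._slices = [[]]  # no symbolic inputs are fed during optimization
downhill.minimize((w - 2) ** 2, data,
                  algo='sgd',
                  learning_rate=0.1,
                  patience=1,
                  min_improvement=0.01,
                  max_updates=1000)
print(w.get_value())  # should approach 2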
def rnn_model(M, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3, bt=0, al='rmsprop', t=5):
    # Copy key variables to GPU
    _M = Th.matrix('_M')

    # Input and forward transform
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # First layer is the transform to a non-negative subspace
    H0 = DenseLayer(I, num_units=K, nonlinearity=lambda x: psoftplus(x, spb), b=None)

    # Optional dropout
    H = DropoutLayer(H0, d)

    # Compute output
    R = RecurrentLayer(H, num_units=M.T.shape[1],
                       nonlinearity=lambda x: psoftplus(x, spb),
                       gradient_steps=t, b=None)

    # Cost function
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) \
        + hsp * Th.mean(get_output(H0))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float32), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Get approximation
    _r = nget(R, _M, M.T.astype(float32)).T
    _h = nget(H, _M, M.T.astype(float32)).T

    return _r, (R.W_in_to_hid.get_value(), R.W_hid_to_hid.get_value()), er, _h
def nn_model(M, K=20, hh=.0001, ep=5000, d=0, wsp=0.0001, hsp=0, spb=3, bt=0, al='rprop'):
    # Sort out the activation
    from inspect import isfunction
    if isfunction(spb):
        act = spb
    else:
        act = lambda x: psoftplus(x, spb)

    # Copy key variables to GPU
    _M = Th.matrix('_M')

    # Input and forward transform
    I = InputLayer(shape=(None, M.shape[0]), input_var=_M)

    # First layer is the transform to a non-negative subspace
    H0 = DenseLayer(I, num_units=K, nonlinearity=act, b=None)

    # Optional dropout
    H = DropoutLayer(H0, d)

    # Compute output
    R = DenseLayer(H, num_units=M.T.shape[1], nonlinearity=act, b=None)

    # Cost function
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) \
        + wsp * Th.mean(abs(R.W[0])) + hsp * Th.mean(get_output(H0))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[_M], params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float64), batch_size=bt)
    er = downhill_train(opt, train, hh, ep, None)

    # Get approximation
    _r = nget(R, _M, M.T.astype(float64)).T
    _h = nget(H, _M, M.T.astype(float64)).T

    return _r, R.W.get_value(), er, _h
y_hat = T.nnet.softmax(z2)
loss_reg = 1. / batch_size * reg_lambda / 2 * (T.sum(T.sqr(W1)) + T.sum(T.sqr(W2)))
loss = T.nnet.categorical_crossentropy(y_hat, y).mean() + loss_reg
prediction = T.argmax(y_hat, axis=1)
predict = theano.function([X], prediction)

# Store the training and validation loss
train_loss = []
validation_loss = []

opt = downhill.build('sgd', loss=loss)

# Set up the training and validation dataset splits; use only one example per
# batch and only one batch per step/epoch.
# Use everything except the last 1000 examples for training.
train = downhill.Dataset([train_X[:-1000], train_y_onehot[:-1000]],
                         batch_size=batch_size,
                         iteration_size=1)
# Use the last 1000 examples for validation.
valid = downhill.Dataset([train_X[-1000:], train_y_onehot[-1000:]])

# SGD
iterations = 0
for tm, vm in opt.iterate(train, valid, patience=10000):
    iterations += 1
    # Record the training and validation loss
    train_loss.append(tm['loss'])
    validation_loss.append(vm['loss'])
    if iterations > 1000:
        break


def build_model(algo):
def lasagne_separate(M, P, FE, W1, W2, z1, z2, hh=.0001, ep=5000, d=0, wsp=.0001, plt=True):
    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.astype(float32))

    # Input is the learned dictionary set
    lW = hstack((W1.T, W2.T)).astype(float32)
    _lW = Th.matrix('_lW')
    fI = InputLayer(shape=lW.shape, input_var=_lW)

    # Split in two paths
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    N_sequence = 10

    # Compute source modulators (these layers were commented out in the original
    # snippet, but R, the cost, and the outputs below all depend on R1 and R2)
    R1 = LSTMLayer(dfW1, N_sequence)
    R2 = LSTMLayer(dfW2, N_sequence)

    # Bring to standard orientation
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    cost = (_M * (Th.log(_M + eps) - Th.log(get_output(R) + eps)) - _M + get_output(R)).mean() \
        + wsp * (Th.mean(abs(R1.W)) + Th.mean(abs(R2.W)))

    # Train it using downhill
    opt = downhill.build('rprop', loss=cost, inputs=[_lW], params=get_all_params(R))
    train = downhill.Dataset(lW, batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, _lW, lW) + eps
    _r1 = nget(R1, _lW, lW)
    _r2 = nget(R2, _lW, lW)

    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
def rnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rmsprop', t=5):
    # Get dictionary shapes
    K = [W1[0].shape[0], W2[0].shape[0]]

    # GPU cached data
    _M = theano.shared(M.T.astype(float32))
    dum = Th.vector('dum')

    # We have weights to discover
    H = theano.shared(
        sqrt(2. / (K[0] + K[1] + M.shape[1])) *
        random.rand(M.T.shape[0], K[0] + K[1]).astype(float32))
    fI = InputLayer(shape=(M.T.shape[0], K[0] + K[1]), input_var=H)

    # Split in two pathways
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, dum[0])
    dfW2 = DropoutLayer(fW2, dum[0])

    # Compute source modulators using previously learned dictionaries
    R1 = RecurrentLayer(dfW1, num_units=M.T.shape[1], b=None,
                        W_in_to_hid=W1[0].astype(float32),
                        W_hid_to_hid=W1[1].astype(float32),
                        nonlinearity=lambda x: psoftplus(x, spb),
                        gradient_steps=5)
    R2 = RecurrentLayer(dfW2, num_units=M.T.shape[1], b=None,
                        W_in_to_hid=W2[0].astype(float32),
                        W_hid_to_hid=W2[1].astype(float32),
                        nonlinearity=lambda x: psoftplus(x, spb),
                        gradient_steps=5)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    Ro = get_output(R) + eps
    cost = (_M * (Th.log(_M + eps) - Th.log(Ro + eps)) - _M + Ro).mean() \
        + sp * Th.mean(abs(H)) + 0 * Th.mean(dum)

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([d]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    # Get outputs
    _r = nget(R, dum, array([0]).astype(float32)).T + eps
    _r1 = nget(R1, dum, array([0]).astype(float32)).T
    _r2 = nget(R2, dum, array([0]).astype(float32)).T

    return _r, _r1, _r2, er
def test_name(self):
    ds = downhill.Dataset([np.random.randn(40, 2)], name='foo')
    assert ds.name == 'foo'
    ds = downhill.Dataset([np.random.randn(40, 2)])
    assert ds.name.startswith('dataset')
    assert ds.name[7:].isdigit()
def cnn_sep(M, W1, W2, hh=.0001, ep=5000, d=0, sp=.0001, spb=3, al='rprop'):
    # Facilitate a reasonable convolution core
    theano.config.dnn.conv.algo_fwd = 'fft_tiling'
    theano.config.dnn.conv.algo_bwd_filter = 'none'
    theano.config.dnn.conv.algo_bwd_data = 'none'

    # Reformat input data
    M3 = reshape(M.astype(float32), (1, M.shape[0], M.shape[1]))

    # Copy key variables to GPU
    _M = theano.shared(M3.astype(float32))

    # Get dictionary shapes
    K = [W1.shape[1], W2.shape[1]]
    T = W1.shape[2]

    # We have weights to discover
    H = theano.shared(
        sqrt(2. / (K[0] + K[1] + M.shape[1])) *
        random.rand(1, K[0] + K[1], M.T.shape[0]).astype(float32))
    fI = InputLayer(shape=(1, K[0] + K[1], M.T.shape[0]), input_var=H)

    # Split in two pathways
    H1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    H2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Compute source modulators using previously learned convolutional dictionaries
    R1 = Conv1DLayer(H1, filter_size=T, W=W1, num_filters=M.shape[0], pad='same',
                     nonlinearity=lambda x: psoftplus(x, spb), b=None)
    R2 = Conv1DLayer(H2, filter_size=T, W=W2, num_filters=M.shape[0], pad='same',
                     nonlinearity=lambda x: psoftplus(x, spb), b=None)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    dum = Th.vector('dum')
    Ro = get_output(R) + eps
    cost = Th.mean(_M * (Th.log(_M + eps) - Th.log(Ro)) - _M + Ro) \
        + 0 * Th.mean(dum) + sp * Th.mean(abs(H))

    # Train it using downhill
    opt = downhill.build(al, loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([0]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)

    # Get outputs
    _r = squeeze(nget(R, dum, array([0]).astype(float32))) + eps
    _r1 = squeeze(nget(R1, dum, array([0]).astype(float32)))
    _r2 = squeeze(nget(R2, dum, array([0]).astype(float32)))

    return _r, _r1, _r2, er
def test_callable(self):
    def batches():
        return 'hello'
    ds = downhill.Dataset(batches, iteration_size=10)
    assert list(ds) == ['hello'] * 10
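# A hedged sketch of the callable-dataset pattern the tests above rely on:
# downhill.Dataset can wrap a function that returns one batch per call, which is
# useful when batches are generated on the fly rather than stored in memory.
# The helper name random_batch and its shapes are illustrative assumptions.
import numpy as np
import downhill

def random_batch():
    # each call produces a fresh mini-batch (here: 8 samples of 2 features)
    return [np.random.randn(8, 2).astype('float32')]

ds = downhill.Dataset(random_batch, iteration_size=5)
print(len(list(ds)))  # 5 batches per pass, as set by iteration_size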
def test_rng(self):
    ds = downhill.Dataset([np.random.randn(40, 2)], rng=4)
    assert ds.rng.randint(10) == 7
    ds = downhill.Dataset([np.random.randn(40, 2)], rng=np.random.RandomState(4))
    assert ds.rng.randint(10) == 7