def __init__(self, readout, transition, dim_dec, attention=None,
             add_contexts=True, pointer_weight=0.5,
             transition_with_att_class=None,
             use_word_annotations=False, **kwargs):
    """Assemble the generator from a readout, a transition and a fork.

    The transition is wrapped with ``transition_with_att_class`` when an
    ``attention`` is supplied and with ``FakeAttentionRecurrent`` otherwise.
    When ``use_word_annotations`` is set, an extra ``Linear`` preprocessor
    is created and registered as a child.
    """
    super(Generator, self).__init__(**kwargs)

    # All sequences of the raw transition except the masks.
    input_names = []
    for seq_name in transition.apply.sequences:
        if 'mask' not in seq_name:
            input_names.append(seq_name)
    self.inputs = input_names
    self.dim_dec = dim_dec
    self.pointer_weight = pointer_weight

    fork = Fork(self.inputs)
    # NOTE(review): kwargs was already handed to the parent constructor
    # above, so this default has no visible effect here -- confirm intent.
    kwargs.setdefault('fork', fork)

    if not attention:
        wrapped = FakeAttentionRecurrent(transition,
                                         name="with_fake_attention")
    else:
        # NOTE(review): transition_with_att_class defaults to None, so
        # callers passing an attention must also pass the wrapper class.
        wrapped = transition_with_att_class(
            transition, attention,
            add_contexts=add_contexts, name="att_trans")

    self.readout = readout
    self.transition = wrapped
    self.fork = fork
    self.children = [self.readout, self.fork, self.transition]

    self.use_word_annotations = use_word_annotations
    if use_word_annotations:
        # NOTE(review): other bricks spell this option ``use_bias``;
        # verify that ``bias`` is accepted by this Linear.
        preprocessor = Linear(name='input_attention_preprocessor',
                              bias=False)
        self.word_annotation_preprocessor = preprocessor
        self.children.append(preprocessor)
class CoreNetwork(BaseRecurrent, Initializable):
    """Recurrent core: a linear projection followed by one LSTM step."""

    def __init__(self, input_dim, dim, **kwargs):
        super(CoreNetwork, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.dim = dim
        # Both children share this brick's initialization schemes.
        shared_inits = dict(weights_init=self.weights_init,
                            biases_init=self.biases_init)
        self.lstm = LSTM(dim=dim, name=self.name + '_lstm', **shared_inits)
        # The projection is 4 * dim wide, matching what the LSTM is fed.
        self.proj = Linear(input_dim=input_dim, output_dim=dim * 4,
                           name=self.name + '_proj', **shared_inits)
        self.children = [self.lstm, self.proj]

    def get_dim(self, name):
        """Return the dimension of 'inputs', 'state' or 'cell'."""
        if name in ('state', 'cell'):
            return self.dim
        if name == 'inputs':
            return self.input_dim
        raise ValueError

    @recurrent(sequences=['inputs'], states=['state', 'cell'],
               contexts=[], outputs=['state', 'cell'])
    def apply(self, inputs, state, cell):
        """Project ``inputs`` and advance the LSTM by a single step."""
        projected = self.proj.apply(inputs)
        next_state, next_cell = self.lstm.apply(projected, state, cell,
                                                iterate=False)
        return next_state, next_cell
class LocationNetwork(Random, Initializable):
    """Emit a location mean and a noisy location sample from ``hidden_g``.

    Parameters
    ----------
    input_dim : dimension of the ``hidden_g`` input.
    loc_emb : dimension of the emitted location.
    std : fixed standard deviation used when ``non_hetro`` is False.
    non_hetro : when True, a second affine map followed by a ReLU
        produces the standard deviation instead of the fixed ``std``.
    """

    def __init__(self, input_dim, loc_emb, std, non_hetro=False, **kwargs):
        super(LocationNetwork, self).__init__(**kwargs)
        # Kept so that get_dim can answer without relying on a child brick.
        self.input_dim = input_dim
        self.loc_emb = loc_emb
        self.std = std
        self.non_hetro = non_hetro
        self.mean_affine = Linear(
            input_dim=input_dim, output_dim=loc_emb,
            weights_init=self.weights_init,
            biases_init=self.biases_init)
        if non_hetro:
            self.std_affine = Linear(
                input_dim=input_dim, output_dim=loc_emb,
                weights_init=self.weights_init,
                biases_init=self.biases_init)
            self.children = [self.mean_affine, self.std_affine]
        else:
            self.children = [self.mean_affine]

    def get_dim(self, name):
        """Return the dimension of 'hidden_g', 'l' or 'l_sample'.

        Raises ValueError for any other name.
        """
        # The previous implementation went through ``self.transform``,
        # an attribute this class never defines, so every call failed.
        if name == 'hidden_g':
            return self.input_dim
        if name in ('l', 'l_sample'):
            return self.loc_emb
        raise ValueError(
            "No dimension information for {} available".format(name))

    @application(inputs=['hidden_g'], outputs=['l', 'l_sample'])
    def apply(self, hidden_g):
        """Return ``(hard_tanh(mean), hard_tanh(sample))``."""
        loc_mean = self.mean_affine.apply(hidden_g)
        loc_u = hard_tanh(loc_mean)
        if self.non_hetro:
            # Predicted, input-dependent standard deviation.
            std = T.nnet.relu(self.std_affine.apply(hidden_g))
        else:
            std = self.std
        loc_sample = self.theano_rng.normal(
            avg=loc_u, std=std, size=loc_mean.shape,
            dtype=theano.config.floatX)
        return loc_u, hard_tanh(loc_sample)
def __init__(self, input_dim, loc_emb, std, non_hetro=False, **kwargs):
    """Create the mean map and, if ``non_hetro``, the std map as well."""
    super(LocationNetwork, self).__init__(**kwargs)
    self.std = std
    self.non_hetro = non_hetro

    # Every affine child has the same shape and initialization.
    affine_args = dict(input_dim=input_dim, output_dim=loc_emb,
                       weights_init=self.weights_init,
                       biases_init=self.biases_init)
    self.mean_affine = Linear(**affine_args)
    bricks = [self.mean_affine]
    if non_hetro:
        self.std_affine = Linear(**affine_args)
        bricks.append(self.std_affine)
    self.children = bricks
def __init__(self, input_dim, n_classes, multi_object=False, **kwargs):
    """Create the linear map and the output non-linearity.

    ``multi_object`` selects ``Logistic`` over ``Softmax`` as the output.
    """
    super(ActionNetwork, self).__init__(**kwargs)
    # The same kwargs are forwarded to the child Linear as well.
    self.transform = Linear(input_dim=input_dim, output_dim=n_classes,
                            **kwargs)
    self.out = Logistic() if multi_object else Softmax()
    self.children = [self.transform, self.out]
def __init__(self, output_names, input_dim, prototype=None, **kwargs):
    """Set up the fork; ``prototype`` defaults to a plain ``Linear``."""
    self.output_names = output_names
    self.input_dim = input_dim
    # Children are prefixed with 'fork' unless the caller says otherwise.
    kwargs.setdefault('child_prefix', 'fork')
    super(Fork, self).__init__(output_names,
                               prototype=prototype or Linear(),
                               **kwargs)
    self.input_dims = None
def __init__(self, input_names, input_dims, output_dim, prototype=None,
             **kwargs):
    """Set up the merge; ``prototype`` defaults to a bias-free ``Linear``."""
    self.output_dim = output_dim
    prototype = prototype or Linear(use_bias=False)
    # Every input is mapped to the same output dimension.
    per_input_output_dims = [output_dim for _ in input_names]
    super(Merge, self).__init__(input_names, input_dims,
                                per_input_output_dims, prototype,
                                **kwargs)
class ActionNetwork(Initializable):
    """Map ``hidden_g`` to class scores through a linear layer.

    Parameters
    ----------
    input_dim : dimension of the ``hidden_g`` input.
    n_classes : dimension of the emitted ``action``.
    multi_object : when True the output non-linearity is ``Logistic``,
        otherwise ``Softmax``.
    """

    def __init__(self, input_dim, n_classes, multi_object=False, **kwargs):
        super(ActionNetwork, self).__init__(**kwargs)
        # Kept so that get_dim does not depend on the naming the child
        # Linear uses for its own dimensions.
        self.input_dim = input_dim
        self.n_classes = n_classes
        # The same kwargs are forwarded to the child Linear as well.
        self.transform = Linear(input_dim=input_dim, output_dim=n_classes,
                                **kwargs)
        if multi_object:
            self.out = Logistic()
        else:
            self.out = Softmax()
        self.children = [self.transform, self.out]

    def get_dim(self, name):
        """Return the dimension of 'hidden_g' or 'action'.

        Raises ValueError for any other name.
        """
        # The previous code asked the Linear child for 'inputs';
        # presumably Linear only answers 'input_'/'output' (TODO confirm),
        # so the stored value is returned instead.
        if name == 'hidden_g':
            return self.input_dim
        if name == 'action':
            return self.n_classes
        raise ValueError(
            "No dimension information for {} available".format(name))

    @application(inputs=['hidden_g'], outputs=['action'])
    def apply(self, hidden_g):
        """Apply the linear map, then the output non-linearity."""
        return self.out.apply(self.transform.apply(hidden_g))
def __init__(self, input_dim, dim, **kwargs):
    """Create the LSTM and the projection that feeds it."""
    super(CoreNetwork, self).__init__(**kwargs)
    self.input_dim = input_dim
    self.dim = dim

    # Both children share this brick's initialization schemes.
    shared_inits = dict(weights_init=self.weights_init,
                        biases_init=self.biases_init)
    prefix = self.name
    self.lstm = LSTM(dim=dim, name=prefix + '_lstm', **shared_inits)
    # The projection is 4 * dim wide, matching what the LSTM is fed.
    self.proj = Linear(input_dim=input_dim, output_dim=dim * 4,
                       name=prefix + '_proj', **shared_inits)
    self.children = [self.lstm, self.proj]
def __init__(self, dim, n_channels, img_height, img_width, N,
             sensor=None, n_retina=3, radius=4, activations=None,
             **kwargs):
    """Pick a glimpse sensor and create the three linear layers.

    ``sensor`` may be None or 'simple' (``GlimpseSensorBeta``) or
    'retina' (``RetinaGlimpse``); anything else raises ValueError.
    ``activations`` is accepted but not used here.
    """
    super(GlimpseNetwork, self).__init__(**kwargs)

    if sensor == 'retina':
        self.sensor = RetinaGlimpse(img_width, img_height, n_channels,
                                    n_retina=n_retina, radius=radius)
    elif sensor is None or sensor == 'simple':
        self.sensor = GlimpseSensorBeta(channels=n_channels,
                                        img_height=img_height,
                                        img_width=img_width, N=N)
    else:
        raise ValueError("sensor mode support [simple]|[retina]." +
                         'Got ' + sensor + '.')

    self.loc_emb = self.sensor.emb_dim

    # All layers share this brick's initialization schemes.
    shared_inits = dict(weights_init=self.weights_init,
                        biases_init=self.biases_init)
    prefix = self.name
    # Embeds the location.
    self.glimpes_0 = Linear(input_dim=self.loc_emb, output_dim=dim,
                            name=prefix + '_glimp_0', **shared_inits)
    # Embeds the glimpse read by the sensor.
    self.glimpes_1 = Linear(input_dim=self.sensor.get_dim('glimpse'),
                            output_dim=dim,
                            name=prefix + '_glimp_1', **shared_inits)
    # Fuses the two embeddings (hence the 2 * dim input).
    self.glimpes_out = Linear(input_dim=dim * 2, output_dim=dim,
                              name=prefix + '_glimp_out', **shared_inits)
    self.children = [self.glimpes_0, self.glimpes_1, self.glimpes_out]
    self.output_dim = dim
def __init__(self, target_names, source_name, target_dims, source_dim,
             prototype=None, **kwargs):
    """Set up the distribute brick on top of its fork-like parent.

    ``prototype`` defaults to a bias-free ``Linear``.
    """
    prototype = prototype or Linear(use_bias=False)

    self.target_names = target_names
    self.source_name = source_name
    self.target_dims = target_dims
    self.source_dim = source_dim

    # Targets become the parent's outputs, the source its input.
    parent_args = dict(output_names=target_names,
                       output_dims=target_dims,
                       input_dim=source_dim,
                       prototype=prototype)
    parent_args.update(kwargs)
    super(Distribute, self).__init__(**parent_args)
def build_theano_functions(self, data_mean, data_std):
    """Build the LSTM + GMM graph and return a debugging function.

    NOTE(review): the function returns early with ``test_expo`` (a
    compiled function yielding ``[expo, mus, sig]``); the training code
    after that return is unreachable and kept only for reference.
    """
    x = T.ftensor3('x')  # original note: batch X time X value
    y = T.ftensor3('y')

    # One linear map per LSTM component (cell, input, forget, output).
    gate_transforms = [
        Linear(self.input_dim, self.lstm_dim,
               weights_init=Uniform(mean=data_mean, std=data_std),
               biases_init=Constant(data_mean),
               name=gate + "_transform")
        for gate in ['c', 'i', 'f', 'o']
    ]
    for brick in gate_transforms:
        brick.initialize()
    gate_outputs = [brick.apply(x) for brick in gate_transforms]
    lstm_input = T.concatenate(gate_outputs, axis=2)

    lstm = LSTM(dim=self.lstm_dim,
                weights_init=IsotropicGaussian(mean=0.5, std=1),
                biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # Original note: this is where Alex Graves' paper starts.
    output_transform = Linear(self.lstm_dim, self.output_dim,
                              weights_init=IsotropicGaussian(mean=0.,
                                                             std=1),
                              biases_init=Constant(1),
                              name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # Split y_hat into the GMM parameters (pi, sigma, mu).  softmax is
    # applied to a 2-D view, then reshaped back to 3-D.
    n_mix = self.gmm_dim
    flat_logits = T.reshape(y_hat[:, :, 0:n_mix],
                            (self.time_dim * self.batch_dim, n_mix))
    pis = T.reshape(T.nnet.softmax(flat_logits),
                    (self.batch_dim, self.time_dim, n_mix))
    sig = T.nnet.relu(y_hat[:, :, n_mix:n_mix * 2]) + 0.1
    mus = y_hat[:, :, n_mix * 2:]

    # Add a trailing axis to the parameters and a mixture axis to y.
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    y = y[:, :, np.newaxis, :]

    expo = T.exp(-0.5 * ((y - mus) ** 2) / sig ** 2)
    test_expo = theano.function([x, y], [expo, mus, sig])
    return test_expo

    # ---- unreachable below this line (see NOTE in the docstring) ----
    coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
    inside_log = (coeff * expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    # Plain gradient descent with the learning rate stored on self.
    updates = [(param, param - self.lr * grad)
               for param, grad in zip(parameters, grads)]

    gradf = theano.function([x, y], [LL], updates=updates)
    f = theano.function([x], [pis, sig, mus])
    return gradf, f
def build_theano_functions(self): x = T.fmatrix('time_sequence') x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim)) y = x[:,1:self.sequence_dim,:] x = x[:,:self.sequence_dim-1,:] # if we try to include the spectrogram features spec_dims = 0 if self.image_size is not None : print "Convolution activated" self.init_conv() spec = T.ftensor4('spectrogram') spec_features, spec_dims = self.conv.build_conv_layers(spec) print "Conv final dims =", spec_dims spec_dims = np.prod(spec_dims) spec_features = spec_features.reshape( (self.batch_dim, self.sequence_dim-1, spec_dims)) x = T.concatenate([x, spec_features], axis=2) layers_input = [x] dims =np.array([self.time_dim + spec_dims]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X sequence X time lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = 
output_transform.apply(T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))), (self.batch_dim, (self.sequence_dim-1), self.gmm_dim)) sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6 mus = y_hat[:,:,self.gmm_dim*2:] pis = pis[:,:,:,np.newaxis] mus = mus[:,:,:,np.newaxis] sig = sig[:,:,:,np.newaxis] y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5*((y-mus)**2)/sig**2 coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean() LL.name = "summed_likelihood" model = Model(LL) self.model = model parameters = model.parameters algorithm = GradientDescent( cost=LL, parameters=model.parameters, step_rule=Adam()) f = theano.function([x],[pis, sig, mus]) return algorithm, f
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor4('y') layers_input = [x] dims = np.array([self.time_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale * Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=0., std=1), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor #pis = T.reshape( # T.nnet.softmax( # T.nnet.sigmoid( # T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))), # (self.batch_dim, self.time_dim, self.gmm_dim)) pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:, :, :self.gmm_dim], (self.sequence_dim * self.batch_dim, 
self.gmm_dim))), (self.batch_dim, self.sequence_dim, self.gmm_dim)) sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6 #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1 #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:]) mus = y_hat[:, :, self.gmm_dim * 2:] pis = pis[:, :, :, np.newaxis] mus = mus[:, :, :, np.newaxis] sig = sig[:, :, :, np.newaxis] #y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5 * ((y - mus)**2) / sig**2 coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum( sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum( T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True)) ).mean() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, pis, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [pis, sig, mus]) return gradf, f
class GlimpseNetwork(Initializable):
    """Glimpse sensor followed by linear layers with ReLU activations.

    Original notes: ``apply`` takes an input of shape
    (batch_size, n_channels * img_width * img_height) and produces
    (batch_size, dim) -- TODO confirm against the sensor in use.
    """

    def __init__(self, dim, n_channels, img_height, img_width, N,
                 sensor=None, n_retina=3, radius=4, activations=None,
                 **kwargs):
        super(GlimpseNetwork, self).__init__(**kwargs)

        if sensor == 'retina':
            self.sensor = RetinaGlimpse(img_width, img_height, n_channels,
                                        n_retina=n_retina, radius=radius)
        elif sensor is None or sensor == 'simple':
            self.sensor = GlimpseSensorBeta(channels=n_channels,
                                            img_height=img_height,
                                            img_width=img_width, N=N)
        else:
            raise ValueError("sensor mode support [simple]|[retina]." +
                             'Got ' + sensor + '.')

        self.loc_emb = self.sensor.emb_dim

        # All layers share this brick's initialization schemes.
        shared_inits = dict(weights_init=self.weights_init,
                            biases_init=self.biases_init)
        prefix = self.name
        # Embeds the location.
        self.glimpes_0 = Linear(input_dim=self.loc_emb, output_dim=dim,
                                name=prefix + '_glimp_0', **shared_inits)
        # Embeds the glimpse read by the sensor.
        self.glimpes_1 = Linear(input_dim=self.sensor.get_dim('glimpse'),
                                output_dim=dim,
                                name=prefix + '_glimp_1', **shared_inits)
        # Fuses the two embeddings (hence the 2 * dim input).
        self.glimpes_out = Linear(input_dim=dim * 2, output_dim=dim,
                                  name=prefix + '_glimp_out',
                                  **shared_inits)
        self.children = [self.glimpes_0, self.glimpes_1, self.glimpes_out]
        self.output_dim = dim

    def get_dim(self, name):
        """Return the dimension of 'img' or 'l_last' from the sensor."""
        if name == 'l_last':
            return self.sensor.emb_dim
        if name == 'img':
            return self.sensor.get_dim('img')
        raise ValueError

    @application(contexts=['img'], sequences=[], state=['l_last'],
                 outputs=['hidden_g'])
    def apply(self, img, l_last):
        """Read a glimpse at ``l_last`` and embed it with the location.

        Params
        ------
        img : original note says (batch_size, img_height, img_width,
            n_channels) -- TODO confirm
        l_last : location, converted by the sensor's ``nn2att``

        Return
        ------
        hidden_g : original note says (batch_size, output_dim)
        """
        attention_args = self.sensor.nn2att(l_last)
        glimpse = self.sensor.read(img, *attention_args)
        loc_hidden = T.nnet.relu(self.glimpes_0.apply(l_last))
        glimpse_hidden = T.nnet.relu(self.glimpes_1.apply(glimpse))
        joined = T.concatenate([loc_hidden, glimpse_hidden], axis=1)
        return T.nnet.relu(self.glimpes_out.apply(joined))
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims = np.array([self.input_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1., std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate( np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05 mus = y_hat[:, :, self.output_dim / 2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5 * ((y - mus)**2) / sig**2 expo = T.exp(inside_expo) coeff = 1. / (T.sqrt(2. 
* np.pi) * sig) inside_log = T.log(coeff * expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log( T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [sig, mus]) return gradf, f
batch_size = 5 input_dim = 6 output_dim = 8 n_classes = 10 test_data = {x: np.random.normal(size=(n_steps, batch_size, input_dim) ).astype(np.float32), y: np.random.randint(n_classes, size=(batch_size, ) ).astype(np.int32)} inits = { 'weights_init': IsotropicGaussian(0.1), 'biases_init': Constant(0.), } core = CoreNetwork(input_dim=input_dim, dim=output_dim, **inits) core.initialize() proj = Linear(input_dim=output_dim*2, output_dim=n_classes, **inits) proj.initialize() out = Softmax() state, cell = core.apply(x) a = T.concatenate([state, cell], axis=2) a = a.reshape((a.shape[0]*a.shape[1], a.shape[2])) a = proj.apply(a) prop = out.apply(a).reshape((n_steps, batch_size, n_classes)) pred = prop[-1] prop = prop.reshape((n_steps * batch_size, n_classes)) print prop.eval({x: test_data[x]}) y_reat = T.repeat(y[None, :], n_steps, axis=0).reshape(
def build_theano_functions(self): x = T.fmatrix('time_sequence') x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim)) y = x[:, 1:self.sequence_dim, :] x = x[:, :self.sequence_dim - 1, :] # if we try to include the spectrogram features spec_dims = 0 if self.image_size is not None: print "Convolution activated" self.init_conv() spec = T.ftensor4('spectrogram') spec_features, spec_dims = self.conv.build_conv_layers(spec) print "Conv final dims =", spec_dims spec_dims = np.prod(spec_dims) spec_features = spec_features.reshape( (self.batch_dim, self.sequence_dim - 1, spec_dims)) x = T.concatenate([x, spec_features], axis=2) layers_input = [x] dims = np.array([self.time_dim + spec_dims]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer + 1] * 4, weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X sequence X time lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale * Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = 
output_transform.apply( T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor pis = T.reshape( T.nnet.softmax( T.reshape( y_hat[:, :, :self.gmm_dim], ((self.sequence_dim - 1) * self.batch_dim, self.gmm_dim))), (self.batch_dim, (self.sequence_dim - 1), self.gmm_dim)) sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6 mus = y_hat[:, :, self.gmm_dim * 2:] pis = pis[:, :, :, np.newaxis] mus = mus[:, :, :, np.newaxis] sig = sig[:, :, :, np.newaxis] y = y[:, :, np.newaxis, :] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5 * ((y - mus)**2) / sig**2 coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum( sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum( T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True)) ).mean() LL.name = "summed_likelihood" model = Model(LL) self.model = model parameters = model.parameters algorithm = GradientDescent(cost=LL, parameters=model.parameters, step_rule=Adam()) f = theano.function([x], [pis, sig, mus]) return algorithm, f
def build_theano_functions(self) : x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor4('y') layers_input = [x] dims =np.array([self.time_dim]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # this is where Alex Graves' paper starts print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=0., std=1), use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) # transforms to find each gmm params (mu, pi, sig) # small hack to softmax a 3D tensor #pis = T.reshape( # T.nnet.softmax( # T.nnet.sigmoid( # T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))), # (self.batch_dim, self.time_dim, self.gmm_dim)) pis = T.reshape( T.nnet.softmax( T.reshape(y_hat[:,:,:self.gmm_dim], (self.sequence_dim*self.batch_dim, self.gmm_dim))), 
(self.batch_dim, self.sequence_dim, self.gmm_dim)) sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6 #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1 #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:]) mus = y_hat[:,:,self.gmm_dim*2:] pis = pis[:,:,:,np.newaxis] mus = mus[:,:,:,np.newaxis] sig = sig[:,:,:,np.newaxis] #y = y[:,:,np.newaxis,:] y = T.patternbroadcast(y, (False, False, True, False)) mus = T.patternbroadcast(mus, (False, False, False, True)) sig = T.patternbroadcast(sig, (False, False, False, True)) # sum likelihood with targets # see blog for this crazy Pr() = sum log sum prod # axes :: (batch, sequence, mixture, time) expo_term = -0.5*((y-mus)**2)/sig**2 coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS)) #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig)) sequences = coeff + expo_term log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True) log_sequences_max = T.max(log_sequences, axis=2, keepdims=True) LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)) : #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append(tuple([parameters[i], parameters[i] - lr*grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug : gradf = theano.function([x, y, lr],[LL, pis, mus, sig],updates=updates) else : #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr],[LL],updates=updates) f = theano.function([x],[pis, sig, mus]) return gradf, f
def build_theano_functions(self, data_mean, data_std, return_test_expo=True):
    """Build the single-layer LSTM mixture-density graph and compile it.

    Four ``Linear`` bricks (named ``c``, ``i``, ``f``, ``o``) transform the
    input; their outputs are concatenated on the last axis and fed to an
    ``LSTM`` brick, whose states are projected to ``self.output_dim``
    values.  Along the last axis those values are split into
    ``self.gmm_dim`` mixture weights (softmax), ``self.gmm_dim`` standard
    deviations (``relu(.) + 0.1``) and the remaining entries as means.

    The original code returned a debugging function unconditionally, which
    left the whole cost / gradient section unreachable.  That behaviour is
    kept as the default for backward compatibility; pass
    ``return_test_expo=False`` to build and get the training functions.

    Args:
        data_mean: mean handed to the ``Uniform`` weight initializer and
            used as the constant bias of the input transforms.
        data_std: std handed to the ``Uniform`` weight initializer.
        return_test_expo: when true (default), return only the debugging
            function, exactly as the original code did.

    Returns:
        When ``return_test_expo`` is true: ``test_expo``, a function of
        ``(x, y)`` returning ``[expo, mus, sig]``.
        Otherwise: ``(gradf, f)`` where ``gradf(x, y)`` applies one
        gradient-descent step with ``self.lr`` and returns ``[LL]``, and
        ``f(x)`` returns ``[pis, sig, mus]``.  In this case the ``Model``
        wrapping the cost is also stored in ``self.model``.
    """
    x = T.ftensor3('x')  # author's note: batch X time X value
    y = T.ftensor3('y')

    # One linear transform of x per entry of the gate list; all are
    # created, then initialized, then applied.
    linear_transforms = [
        Linear(
            self.input_dim, self.lstm_dim,
            weights_init=Uniform(mean=data_mean, std=data_std),
            biases_init=Constant(data_mean),
            name=gate + "_transform")
        for gate in ['c', 'i', 'f', 'o']]
    for transform in linear_transforms:
        transform.initialize()
    lstm_input = T.concatenate(
        [transform.apply(x) for transform in linear_transforms], axis=2)

    lstm = LSTM(dim=self.lstm_dim,
                weights_init=IsotropicGaussian(mean=0.5, std=1),
                biases_init=Constant(1))
    lstm.initialize()
    h, _dummy = lstm.apply(lstm_input)

    # Output layer producing the raw mixture parameters.
    output_transform = Linear(
        self.lstm_dim, self.output_dim,
        weights_init=IsotropicGaussian(mean=0., std=1),
        biases_init=Constant(1),
        name="output_transform")
    output_transform.initialize()
    y_hat = output_transform.apply(h)

    # Softmax only handles matrices: flatten the two leading axes, apply
    # it, then restore the 3-D shape.
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:, :, 0:self.gmm_dim],
                      (self.time_dim * self.batch_dim, self.gmm_dim))),
        (self.batch_dim, self.time_dim, self.gmm_dim))
    sig = T.nnet.relu(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 0.1
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # Trailing axis on the parameters, extra third axis on the targets,
    # so that they broadcast against each other.
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    y = y[:, :, np.newaxis, :]

    expo = T.exp(-0.5 * ((y - mus)**2) / sig**2)

    if return_test_expo:
        # Legacy debugging path: expose the exponential term only.
        test_expo = theano.function([x, y], [expo, mus, sig])
        return test_expo

    # Likelihood: sum inside the log over axis 2, sum outside the log over
    # everything that is left.
    coeff = pis * (1. / (T.sqrt(2. * np.pi) * sig))
    inside_log = (coeff * expo).sum(axis=2)
    LL = -(T.log(inside_log)).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    grads = T.grad(LL, parameters)
    # Vanilla SGD with the learning rate stored on the instance.
    updates = [(param, param - self.lr * grad)
               for param, grad in zip(parameters, grads)]

    gradf = theano.function([x, y], [LL], updates=updates)
    f = theano.function([x], [pis, sig, mus])

    return gradf, f
def build_theano_functions(self):
    """Build the stacked-LSTM Gaussian-output graph and compile it.

    Each layer is a ``Linear`` brick (projecting to four times the layer
    size) followed by an ``LSTM`` brick.  The hidden states of all layers
    are concatenated (or used directly when there is a single layer) and
    projected to ``self.output_dim`` values.  The first half of the last
    axis gives the standard deviations (``relu(.) + 0.05``), the second
    half the means.

    The cost ``LL`` is the negated max-shifted log-sum-exp of the Gaussian
    log-densities over axis 2, summed over the remaining axes.  The
    log-density is computed directly in log space
    (``log(coeff) + inside_expo``) instead of ``log(coeff * exp(.))``:
    the latter underflows to ``-inf`` for large residuals, which turns
    into ``nan`` once the maximum is subtracted.

    Side effects: prints the layer dimensions and stores the ``Model``
    wrapping the cost in ``self.model``.

    Returns:
        tuple: ``(gradf, f)``.  ``gradf(x, y, lr)`` applies one plain
        gradient-descent update and returns ``[LL]`` (``[LL, mus, sig]``
        when ``self.debug`` is truthy).  ``f(x)`` returns ``[sig, mus]``.
    """
    x = T.ftensor3('x')  # author's note: batch X time X value
    y = T.ftensor3('y')

    # Input width first, then one entry per LSTM layer.
    layers_input = [x]
    dims = np.array([self.input_dim])
    for dim in self.lstm_layers_dim:
        dims = np.append(dims, dim)
    print("Dimensions = %s" % (dims,))

    for layer, (n_in, n_out) in enumerate(zip(dims[:-1], dims[1:])):
        # The LSTM brick consumes a pre-computed projection that is four
        # times as wide as its state.
        linear = Linear(n_in, n_out * 4,
                        weights_init=IsotropicGaussian(mean=1., std=1),
                        biases_init=Constant(0),
                        name="linear" + str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        lstm = LSTM(
            dim=n_out,
            weights_init=IsotropicGaussian(mean=0., std=0.5),
            biases_init=Constant(1),
            name="lstm" + str(layer))
        lstm.initialize()
        # Overwrite the state-to-state weights with an orthogonal matrix
        # after initialization.
        lstm.W_state.set_value(
            Orthogonal().generate(np.random,
                                  lstm.W_state.get_value().shape))
        h, _dummy = lstm.apply(lstm_input)

        layers_input.append(h)

    # The idea (per the original author) is to have one Gaussian
    # parametrize every frequency bin.
    print("Last linear transform dim : %s" % (dims[1:].sum(),))
    output_transform = Linear(dims[1:].sum(), self.output_dim,
                              weights_init=IsotropicGaussian(mean=0., std=1),
                              biases_init=Constant(0),
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1:
        print("hallo there, only one layer speaking")
        y_hat = output_transform.apply(layers_input[-1])
    else:
        y_hat = output_transform.apply(
            T.concatenate(layers_input[1:], axis=2))

    # Floor division keeps the slice bound an integer regardless of the
    # division semantics in effect.
    half = self.output_dim // 2
    sig = T.nnet.relu(y_hat[:, :, :half]) + 0.05
    mus = y_hat[:, :, half:]

    # Gaussian log-density, kept in log space for numerical stability.
    inside_expo = -0.5 * ((y - mus)**2) / sig**2
    coeff = 1. / (T.sqrt(2. * np.pi) * sig)
    inside_log = T.log(coeff) + inside_expo

    # Max-shifted log-sum-exp over axis 2, then a sum over what is left.
    inside_log_max = T.max(inside_log, axis=2, keepdims=True)
    LL = -(inside_log_max
           + T.log(T.sum(T.exp(inside_log - inside_log_max),
                         axis=2, keepdims=True))).sum()

    model = Model(LL)
    self.model = model
    parameters = model.parameters

    grads = T.grad(LL, parameters)
    lr = T.scalar('lr')
    # Vanilla SGD: one (parameter, new value) pair per parameter.
    updates = [(param, param - lr * grad)
               for param, grad in zip(parameters, grads)]

    if self.debug:
        gradf = theano.function([x, y, lr], [LL, mus, sig],
                                updates=updates)
    else:
        gradf = theano.function([x, y, lr], [LL], updates=updates)
    f = theano.function([x], [sig, mus])

    return gradf, f
def build_theano_functions(self):
    """Build the stacked-LSTM Gaussian-output graph for next-step prediction.

    A single matrix input named ``frequency_sequence`` is reshaped to
    ``(batch_dim, time_dim + 1, input_dim)``.  The first ``time_dim``
    steps are the network input and the steps ``1 .. time_dim`` are the
    targets, i.e. the targets are the inputs shifted by one step.

    Each layer is a ``Linear`` brick (projecting to four times the layer
    size) followed by an ``LSTM`` brick.  The hidden states of all layers
    are concatenated (or used directly when there is a single layer) and
    projected to ``self.output_dim`` values.  The first half of the last
    axis gives the standard deviations (``relu(.) + 0.05``), the second
    half the means.

    The cost ``LL`` (named ``summed_likelihood``) is the negated
    max-shifted log-sum-exp of the Gaussian log-densities over axis 2,
    summed over the remaining axes.  The log-density is computed directly
    in log space (``log(coeff) + inside_expo``) instead of
    ``log(coeff * exp(.))``: the latter underflows to ``-inf`` for large
    residuals, which turns into ``nan`` once the maximum is subtracted.

    Side effects: prints the layer dimensions and stores the ``Model``
    wrapping the cost in ``self.model``.

    Returns:
        tuple: ``(algorithm, f)``.  ``algorithm`` is a ``GradientDescent``
        over the model parameters using the ``AdaGrad`` step rule;
        ``f`` is compiled with the sliced input tensor as its only input
        and returns ``[sig, mus]``.
    """
    # The theano input is a matrix; the author's note gives its shape as
    # time+1 X features.
    x = T.fmatrix('frequency_sequence')
    x = x.reshape((self.batch_dim, self.time_dim + 1, self.input_dim))

    # Targets are the sequence shifted one step ahead of the inputs.
    y = x[:, 1:self.time_dim + 1, :]
    x = x[:, :self.time_dim, :]

    # Input width first, then one entry per LSTM layer.
    layers_input = [x]
    dims = np.array([self.input_dim])
    for dim in self.lstm_layers_dim:
        dims = np.append(dims, dim)
    print("Dimensions = %s" % (dims,))

    for layer, (n_in, n_out) in enumerate(zip(dims[:-1], dims[1:])):
        # The LSTM brick consumes a pre-computed projection that is four
        # times as wide as its state.
        linear = Linear(n_in, n_out * 4,
                        weights_init=Orthogonal(self.orth_scale),
                        biases_init=Constant(0),
                        name="linear" + str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        lstm = LSTM(
            dim=n_out,
            weights_init=IsotropicGaussian(mean=0., std=0.5),
            biases_init=Constant(1),
            name="lstm" + str(layer))
        lstm.initialize()
        # Overwrite the state-to-state weights with a scaled orthogonal
        # matrix after initialization.
        lstm.W_state.set_value(
            self.orth_scale * Orthogonal().generate(
                np.random, lstm.W_state.get_value().shape))
        h, _dummy = lstm.apply(lstm_input)

        layers_input.append(h)

    # The idea (per the original author) is to have one Gaussian
    # parametrize every frequency bin.
    print("Last linear transform dim : %s" % (dims[1:].sum(),))
    output_transform = Linear(dims[1:].sum(), self.output_dim,
                              weights_init=Orthogonal(self.orth_scale),
                              biases_init=Constant(0),
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1:
        print("hallo there, only one layer speaking")
        y_hat = output_transform.apply(layers_input[-1])
    else:
        y_hat = output_transform.apply(
            T.concatenate(layers_input[1:], axis=2))

    # Floor division keeps the slice bound an integer regardless of the
    # division semantics in effect.
    half = self.output_dim // 2
    sig = T.nnet.relu(y_hat[:, :, :half]) + 0.05
    mus = y_hat[:, :, half:]

    # Gaussian log-density, kept in log space for numerical stability.
    inside_expo = -0.5 * ((y - mus)**2) / sig**2
    coeff = 1. / (T.sqrt(2. * np.pi) * sig)
    inside_log = T.log(coeff) + inside_expo

    # Max-shifted log-sum-exp over axis 2, then a sum over what is left.
    inside_log_max = T.max(inside_log, axis=2, keepdims=True)
    LL = -(inside_log_max
           + T.log(T.sum(T.exp(inside_log - inside_log_max),
                         axis=2, keepdims=True))).sum()
    LL.name = "summed_likelihood"

    model = Model(LL)
    self.model = model

    algorithm = GradientDescent(
        cost=LL,
        parameters=model.parameters,
        step_rule=AdaGrad())

    # NOTE(review): `x` here is the reshaped-and-sliced 3-D tensor, not the
    # raw 'frequency_sequence' matrix -- confirm callers feed `f` that way.
    f = theano.function([x], [sig, mus])

    return algorithm, f