def negative_log_likelihood_classwise_masking(self, y, mask_class_labeled, mask_class_not_present):
    """
    todo: test.

    :y: true classes (as integer value): (batchsize, x, y)
    :mask_class_labeled: matrix: (batchsize, num_classes); allowed values: 0 or 1;
        setting everything to 1 leads to the ordinary nll; all zeroes is an invalid state.
        a zero for one class indicates that this class may be present but is not labeled as such.
    :mask_class_not_present: (batchsize, num_classes): similar to mask_class_labeled,
        but now a 1 indicates that a class is CERTAINLY NOT PRESENT in the batch.

    values of -1 in y count as "absolutely not labeled / ignore predictions";
    this has PRIORITY over anything else (including mask_class_not_present).
    """
    y = y.dimshuffle(0, 'x', 1, 2)  # (batchsize, 1, x, y)
    mask_class_labeled = mask_class_labeled.dimshuffle(0, 1, 'x', 'x')  # (batchsize, num_classes, 1, 1)
    mask_class_not_present = mask_class_not_present.dimshuffle(0, 1, 'x', 'x')  # (batchsize, num_classes, 1, 1)
    # apply to overall loss after everything is calculated; marks positions
    global_loss_mask = (y != -1)
    pred = self.class_probabilities_realshape  # (batchsize, num_classes, x, y)
    mod_y = T.where(y < 0, 0, y)
    # dirty hack: compute "standard" nll when most predictive weight is put on classes which are in fact labeled
    votes_not_for_unlabeled = T.where(T.sum(pred * mask_class_labeled, axis=1) >= 0.5, 1, 0).dimshuffle(0, 'x', 1, 2)
    # could also add '* mask_class_labeled' inside, but this should not change anything,
    # provided there is no logical conflict between y and mask_class_labeled!
    # standard loss part -> increase p(correct_prediction); thus disabled if the "correct" class is not known
    nll = -T.mean((T.log(pred) * votes_not_for_unlabeled * global_loss_mask)[:, mod_y])
    # penalize predictions: sign is a plus! (yes: '+')
    # remove <global_loss_mask> if <mask_class_not_present> should override 'unlabeled' areas.
    nll += T.mean(T.log(pred) * mask_class_not_present * global_loss_mask)
    return nll
def lda_logp(rt, gaze, values, error_ll, v_index, tau_index, gamma_index, s_index, t0_index, is_multiplicative, zerotol): # compute drifts ## Select the right drift function drift = ifelse( is_multiplicative, glam.components.tt_drift_multiplicative( v[0, tt.cast(v_index, dtype='int32')][:, None], tau[0, tt.cast(tau_index, dtype='int32')][:, None], gamma[0, tt.cast(gamma_index, dtype='int32')][:, None], values, gaze, zerotol), glam.components.tt_drift_additive( v[0, tt.cast(v_index, dtype='int32')][:, None], tau[0, tt.cast(tau_index, dtype='int32')][:, None], gamma[0, tt.cast(gamma_index, dtype='int32')][:, None], values, gaze, zerotol)) # drift = driftfun(v[0, tt.cast(v_index, dtype='int32')][:, None], # tau[0, tt.cast(tau_index, dtype='int32')][:, None], # gamma[0, tt.cast(gamma_index, dtype='int32')][:, None], # values, # gaze, # zerotol) glam_ll = glam.components.tt_wienerrace_pdf( rt[:, None], drift, s[0, tt.cast(s_index, dtype='int32')][:, None], b, t0[0, tt.cast(t0_index, dtype='int32')][:, None], zerotol) # mix likelihoods mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll) mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll) mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll) return tt.log(mixed_ll + zerotol)
def irprop_minus_trainer(x, y, w, parameters, loss, random_stream,
                         positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP- is a batch trainer, for details see
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 .
    This is the default trainer, very stable for classification.

    :param positive_step: factor by which the step is increased when continuing in the same direction
    :param negative_step: factor by which the step is decreased when the direction changes to the opposite
    :param min_step: minimal change of weight during iteration
    :param max_step: maximal change of weight during iteration
    """
    shareds = []
    updates = []
    loss_value = loss(x, y, w)
    for name, param in parameters.items():
        old_derivative = theano.shared(param.get_value() * 0.)
        delta = theano.shared(param.get_value() * 0. + 1e-3)
        shareds.extend([old_derivative, delta])
        new_derivative = T.grad(loss_value, param)

        new_delta = T.where(new_derivative * old_derivative > 0,
                            delta * positive_step, delta * negative_step)
        new_delta = T.clip(new_delta, min_step, max_step)

        updates.append([param, param - new_delta * T.sgn(new_derivative)])
        updates.append([delta, new_delta])

        new_old_derivative = T.where(new_derivative * old_derivative < 0, 0, new_derivative)
        updates.append([old_derivative, new_old_derivative])
    return shareds, updates
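# Illustrative numpy sketch (not part of the original trainer) of the IRPROP- rule that the
# T.where expressions above encode: per-weight step sizes grow by `positive_step` while the
# gradient keeps its sign, shrink by `negative_step` when the sign flips, and the stored
# gradient is zeroed after a flip so the next comparison is neutral.
import numpy as np

def irprop_minus_step(param, grad, old_grad, delta,
                      positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    new_delta = np.where(grad * old_grad > 0, delta * positive_step, delta * negative_step)
    new_delta = np.clip(new_delta, min_step, max_step)
    new_param = param - new_delta * np.sign(grad)
    new_old_grad = np.where(grad * old_grad < 0, 0.0, grad)
    return new_param, new_old_grad, new_delta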
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream, positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6): """IRPROP+ is batch trainer, for details see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3428 :param positive_step: factor, by which the step is increased when continuing going in the direction :param negative_step: factor, by which the step is increased when changing direction to opposite :param min_step: minimal change of weight during iteration :param max_step: maximal change of weight during iteration """ loss_value = loss(x, y, w) prev_loss_value = theano.shared(1e10) shareds = [prev_loss_value] updates = [] for name, param in parameters.items(): old_derivative = theano.shared(param.get_value() * 0.) delta = theano.shared(param.get_value() * 0. + 1e-3) new_derivative = T.grad(loss_value, param) shift_if_bad_step = T.where(new_derivative * old_derivative < 0, delta * T.sgn(old_derivative), 0) shift = ifelse(loss_value > prev_loss_value, shift_if_bad_step, 0. * param) # unfortunately we can't do it this way: param += shift new_delta = T.where(new_derivative * old_derivative > 0, delta * positive_step, delta * negative_step) new_delta = T.clip(new_delta, min_step, max_step) updates.append([param, param + shift - new_delta * T.sgn(new_derivative)]) updates.append([delta, new_delta]) new_old_derivative = T.where(new_derivative * old_derivative < 0, 0, new_derivative) updates.append([old_derivative, new_old_derivative]) shareds.extend([old_derivative, delta]) updates.append([prev_loss_value, loss_value]) return shareds, updates
def __init__(self, input, filter_shape=None, image_shape=None, W=None, b=None, poolsize=(3, 1)):
    assert image_shape[1] == filter_shape[1]
    self.W = W
    self.b = b
    tmp = numpy.ones((filter_shape[0], ), dtype=theano.config.floatX)
    tmp = -tmp * 10000
    self.test = theano.shared(value=tmp, borrow=True)
    conv_out = conv.conv2d(input=input, filters=self.W,
                           filter_shape=filter_shape, image_shape=image_shape)
    conv_out2 = T.where(T.neq(conv_out, 0), conv_out, self.test.dimshuffle('x', 0, 'x', 'x'))
    pooled_out = downsample.max_pool_2d(conv_out2, ds=poolsize, ignore_border=True)
    pooled_out2 = T.where(T.neq(pooled_out, -10000), pooled_out, -self.b.dimshuffle('x', 0, 'x', 'x'))
    self.output = ReLU(pooled_out2 + self.b.dimshuffle('x', 0, 'x', 'x'))
    #self.output = T.nnet.sigmoid(pooled_out2 + self.b.dimshuffle('x', 0, 'x', 'x'))
    self.params = [self.W, self.b]
def irprop_plus_trainer(x, y, w, parameters, loss, random_stream, positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6): """IRPROP+ trainer, see http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332""" loss_value = loss(x, y, w) prev_loss_value = theano.shared(1e10) shareds = [] updates = [] for name, param in parameters.iteritems(): old_derivative = theano.shared(param.get_value() * 0.) delta = theano.shared(param.get_value() * 0. + 1e-3) new_derivative = T.grad(loss_value, param) shift_if_bad_step = T.where(new_derivative * old_derivative < 0, delta * T.sgn(old_derivative), 0) # THIS doesn't work! shift = ifelse(loss_value > prev_loss_value, shift_if_bad_step, 0. * param) # unfortunately we can't do it this way: param += shift new_delta = T.where(new_derivative * old_derivative > 0, delta * positive_step, delta * negative_step) new_delta = T.clip(new_delta, min_step, max_step) updates.append([param, param + shift - new_delta * T.sgn(new_derivative)]) updates.append([delta, new_delta]) new_old_derivative = T.where(new_derivative * old_derivative < 0, 0, new_derivative) updates.append([old_derivative, new_old_derivative]) shareds.extend([old_derivative, delta, prev_loss_value]) updates.append([prev_loss_value, loss_value]) return shareds, updates
def irprop_star_trainer(x, y, w, parameters, loss, random_stream,
                        positive_step=1.2, negative_step=0.5, max_step=1., min_step=1e-6):
    """IRPROP* trainer (own experimental modification of IRPROP-, not recommended for usage)"""
    shareds = []
    updates = []
    loss_value = loss(x, y, w)
    for name, param in parameters.items():
        param_shape = param.get_value().shape
        n = numpy.prod(param_shape).astype(int)
        new_derivative_ = T.grad(loss_value, param).flatten()
        lnewder, rnewder = new_derivative_.reshape([n, 1]), new_derivative_.reshape([1, n])
        new_derivative_plus = lnewder + rnewder
        new_derivative_minus = lnewder - rnewder
        new_param = param
        for new_derivative in [new_derivative_plus, new_derivative_minus]:
            delta = theano.shared(numpy.zeros([n, n], dtype=floatX) + 1e-3)
            old_derivative = theano.shared(numpy.zeros([n, n], dtype=floatX))

            new_delta = T.where(new_derivative * old_derivative > 0,
                                delta * positive_step, delta * negative_step)
            new_delta = T.clip(new_delta, min_step, max_step)
            updates.append([delta, new_delta])

            new_old_derivative = T.where(new_derivative * old_derivative < 0, 0, new_derivative)
            updates.append([old_derivative, new_old_derivative])

            new_param = new_param - (new_delta * T.sgn(new_derivative)).sum(axis=1).reshape(param.shape)
            shareds.extend([old_derivative, delta])
        updates.append([param, new_param])
    return shareds, updates
def lda_logp(rt, gaze, values, error_lls, s_condition_index, s_subject_index, v_condition_index, v_subject_index, tau_condition_index, tau_subject_index, gamma_condition_index, gamma_subject_index, t0_condition_index, t0_subject_index, zerotol): # compute drifts drift = glam.components.expdrift( v[tt.cast(v_subject_index, dtype='int32'), tt.cast(v_condition_index, dtype='int32')][:, None], tau[tt.cast(tau_subject_index, dtype='int32'), tt.cast(tau_condition_index, dtype='int32')][:, None], gamma[tt.cast(gamma_subject_index, dtype='int32'), tt.cast(gamma_condition_index, dtype='int32')][:, None], values, gaze, zerotol) glam_ll = glam.components.tt_wienerrace_pdf( rt[:, None], drift, s[tt.cast(s_subject_index, dtype='int32'), tt.cast(s_condition_index, dtype='int32')][:, None], b, t0[tt.cast(t0_subject_index, dtype='int32'), tt.cast(t0_condition_index, dtype='int32')][:, None], zerotol) # mix likelihoods mixed_ll = ((1 - p_error) * glam_ll + p_error * error_lls[subject_idx]) mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll) mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll) return tt.sum(tt.log(mixed_ll + zerotol))
def call(self, inputs):
    real = get_realpart(inputs)
    imag = get_imagpart(inputs)
    cond = T.and_(real >= 0, imag >= 0)
    x = T.where(cond, real, self.zeros)
    y = T.where(cond, imag, self.zeros)
    return K.concatenate((x, y), axis=-1)
def deviance_negbin(y, μ, α, saturated="NegativeBinomial"):
    if saturated == "NegativeBinomial":
        logp_sat = tt.where(y == 0,
                            np.zeros_like(y, dtype=np.float32),
                            pm.NegativeBinomial.dist(mu=y, alpha=α).logp(y))
    elif saturated == "Poisson":
        logp_sat = tt.where(y == 0,
                            np.zeros_like(y, dtype=np.float32),
                            pm.Poisson.dist(mu=y).logp(y))
    else:
        raise NotImplementedError()
    logp_mod = pm.NegativeBinomial.dist(mu=μ, alpha=α).logp(y)
    return (2 * (logp_sat - logp_mod)).eval()
def _create_iter_funcs(self, layers, objective, update, output_type): y_batch = output_type('y_batch') output_layer = layers[-1] objective_kw = self._get_params_for('objective') l3Layers = [] for l3_name in self.l3_layers: l3Layers.append( layers[ l3_name ] ) loss_train = objective( layers, target=y_batch, l3_layers=l3Layers, **objective_kw) loss_eval = objective( layers, target=y_batch, deterministic=True, **objective_kw) predict_proba = get_output(output_layer, None, deterministic=True) if not self.regression: predict = predict_proba.argmax(axis=1) accuracy = T.mean(T.eq(predict, y_batch)) elif self.objective_loss_function is binary_crossentropy: predict = T.where( predict_proba >= 0.5, 1, 0 ) accuracy = T.mean( T.eq(predict, y_batch) ) else: predict = T.where( predict_proba > 0., 1, 0 ) label = T.where( y_batch > 0., 1, 0 ) accuracy = T.mean( T.eq( predict, label ) ) all_params = self.get_all_params(trainable=True) update_params = self._get_params_for('update') updates = update(loss_train, all_params, layer_weights=self.layer_weights, **update_params ) input_layers = [layer for layer in layers.values() if isinstance(layer, InputLayer)] X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name) for input_layer in input_layers] inputs = X_inputs + [theano.Param(y_batch, name="y")] train_iter = theano.function( inputs=inputs, outputs=[loss_train], updates=updates, allow_input_downcast=True, ) eval_iter = theano.function( inputs=inputs, outputs=[loss_eval, accuracy], allow_input_downcast=True, ) predict_iter = theano.function( inputs=X_inputs, outputs=predict_proba, allow_input_downcast=True, ) return train_iter, eval_iter, predict_iter
def _flaremodel(time, tpeak, fwhm, ampl):
    # reuses some code from AltaiPony and Apaloosa
    time = tt.as_tensor_variable(time)
    flare_lc = tt.zeros_like(time)
    flare_lc = tt.where((time <= tpeak) * ((time - tpeak) / fwhm > -1.),
                        _before_flare(time, tpeak, fwhm, ampl),
                        flare_lc)
    flare_lc = tt.where((time > tpeak) * ((time - tpeak) / fwhm < 20.),
                        _after_flare(time, tpeak, fwhm, ampl),
                        flare_lc)
    return flare_lc
def lerp(old, new, min_tau=0.0, en=None):
    """
    Return new interpolated value and a relative difference
    """
    diff = T.mean(T.sqr(new) - T.sqr(old), axis=1, keepdims=True)
    rel_diff = diff / (T.mean(T.sqr(old), axis=1, keepdims=True) + 1e-5)
    t = rel_diff * 20.
    t = T.where(t < 5, 5, t)
    t = T.where(t > 100, 100, t)
    t = t + min_tau
    if en is not None:
        lmbd = T.diagonal(en).dimshuffle(0, 'x') * (1. / t)
    else:
        lmbd = 1. / t
    return ((1 - lmbd) * old + lmbd * new, t, rel_diff)
def compile_prop_f(self, signals, has_input, min_tau=0.0): tau_in = T.scalar('min_tau', dtype=FLOATX) inputs = [tau_in] x = self.signal(signals) # Get estimate of the state from layer above estimate = self.estimate(signals) # Feedforward originates from previous layer's state or given input if not has_input: feedforward = self.feedforward(signals) has_nans = T.as_tensor_variable(0) nans = 0.0 else: input_t = T.matrix('input', dtype=FLOATX) inputs += [input_t] nans = T.isnan(input_t) has_nans = T.any(nans) feedforward = T.where(nans, 0.0, input_t) self.info('Compiling propagation: [%6s] -> %4s <- [%6s]' % (",".join([p.name for p in self.prev] if self.prev else 'u/y'), self.name, ",".join([p.name for p in self.next] if self.next else ''))) # Apply nonlinearity to feedforward path only if self.nonlin: feedforward = self.nonlin(feedforward) if self.merge_op: assert not self.persistent, 'cannot combine with merge_op' new_value = self.merge_op(feedforward, estimate) elif self.persistent: new_value = feedforward else: new_value = feedforward - estimate # If predicting missing values, force them to zero in residual so # that they don't influence learning new_value = ifelse(has_nans, T.where(nans, 0.0, new_value), new_value) (new_X, t, d) = lerp(x.var, new_value, tau_in) d = T.max(d) updates = [(x.var, ifelse(self.enabled, new_X, x.var))] return theano.function(inputs=inputs, outputs=d, updates=updates)
def bspline_bfs(x, knots, P):
    """
    temporal basis function
    x: t-delta distance to last knot (horizon 5)
    """
    knots = knots.astype(np.float32)
    idx = ((x >= knots[0]) & (x < knots[-1]))  # .nonzero()
    xx = x[idx]
    N = {}
    for p in range(P + 1):
        for i in range(len(knots) - 1 - p):
            if p == 0:
                N[(i, p)] = tt.where((knots[i] <= xx) * (xx < knots[i + 1]), 1.0, 0.0)
            else:
                N[(i, p)] = \
                    (xx - knots[i]) / (knots[i + p] - knots[i]) * N[(i, p - 1)] + \
                    (knots[i + p + 1] - xx) / (knots[i + p + 1] - knots[i + 1]) * \
                    N[(i + 1, p - 1)]
    highest_level = []
    for i in range(len(knots) - 1 - P):
        res = tt.zeros_like(x)
        highest_level.append(tt.set_subtensor(res[idx], N[(i, P)]))
    return highest_level
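# For reference, the recursion built by the nested loops above is the Cox-de Boor formula for
# B-spline basis functions over the knot vector t = knots:
#
#   N_{i,0}(x) = 1  if t_i <= x < t_{i+1},  else 0
#   N_{i,p}(x) = (x - t_i) / (t_{i+p} - t_i) * N_{i,p-1}(x)
#              + (t_{i+p+1} - x) / (t_{i+p+1} - t_{i+1}) * N_{i+1,p-1}(x)
#
# The returned tensors are the degree-P basis functions N_{i,P}, scattered back onto the
# positions of x that fall inside [knots[0], knots[-1]).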
def get_output_for(self, inputs, **kwargs):
    '''
    Take the exp() of all inputs, and divide by the total.
    '''
    exps = T.where(T.eq(inputs[0], 0), np.float32(0.0), np.float32(1.0)) * T.exp(inputs[1])
    return exps / (exps.sum(axis=1).dimshuffle((0, 'x')) + 1e-6)
def window_batch_timewise(t, b, w, full_index):
    for i in range(w):
        full_index = T.set_subtensor(full_index[i], T.roll(full_index[i], i))
        if i > 0:
            full_index = T.inc_subtensor(
                full_index[i], T.where(full_index[i] > 0, i * t * b - i, 0))
    return full_index
def dropout(self, rate=0.5, seed=None):
    obj = self.copy()
    srng = RandomStreams(seed)
    obj.out = T.where(srng.uniform(size=obj.out.shape) > rate, obj.out, 0)
    return obj
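# Note: the mask above keeps surviving activations at their original scale, so expected
# activations shrink by a factor of (1 - rate). A common alternative (assumption: not what the
# original code does) is "inverted" dropout, which rescales kept units at train time so that no
# test-time correction is needed:
#
#   keep = srng.uniform(size=obj.out.shape) > rate
#   obj.out = T.where(keep, obj.out / (1. - rate), 0)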
def compile_adapt_f(self, signals): x = self.signal(signals) x_prev = [p.signal(signals) for p in self.prev] assert np.all([x.k == xp.k for xp in x_prev]) assert self.m == [xp.n for xp in x_prev] assert x.n == self.n k = np.float32(x.k) # Modulate x if x.modulation is not None: x_ = x.var * T.as_tensor_variable(x.modulation) else: x_ = x.var updates = [] upd = lambda en, old, new: [(old, ifelse(en, new, old))] E_XX_new, _, d = lerp(self.E_XX, T.dot(x_, x_.T) / k, self.min_tau) updates += upd(self.enabled, self.E_XX, E_XX_new) b = 1. d = T.diagonal(E_XX_new) stiff = T.scalar('stiffnes', dtype=FLOATX) Q_new = theano_diag( b / T.where(d < stiff * self.stiffx, stiff * self.stiffx, d)) updates += upd(self.enabled, self.Q, Q_new) for i, x_p in enumerate(x_prev): E_XU_new, _, d_ = lerp(self.E_XU[i], T.dot(x_, x_p.var.T) / k, self.min_tau) updates += upd(self.enabled, self.E_XU[i], E_XU_new) d = T.maximum(d, d_) updates += upd(self.enabled, self.phi[i], T.dot(Q_new, E_XU_new).T) self.info('Compile layer update between: ' + self.name + ' and ' + ', '.join([p.name for p in self.prev])) return theano.function(inputs=[stiff], outputs=d, updates=updates)
def init_layer_updates(self, layer):
    if not layer.parameters:
        return []

    prediction_func = self.variables.train_prediction_func
    network_output = self.variables.network_output
    network_input = self.variables.network_input
    step = self.variables.step

    normalized_input = network_input / network_input.norm(L=2)
    summated_output = network_input.dot(layer.weight) + layer.bias
    linear_error = prediction_func - network_output

    update = T.where(
        T.abs_(summated_output) >= self.dead_zone_radius,
        linear_error,
        network_output
    )

    weight_delta = normalized_input.T.dot(update)
    bias_delta = linear_error.sum(axis=0)

    return [
        (layer.weight, layer.weight - step * weight_delta),
        (layer.bias, layer.bias - step * bias_delta),
    ]
def weighted_binary_cross_entropy_1(pred, target, mean_notes_activation):
    # Weights correspond to the mean number of positive occurrences of the class in the training dataset
    # From :
    # Weighted Multi-label Binary Cross-entropy Criterion
    # https://github.com/Nanne/WeightedMultiLabelBinaryCrossEntropyCriterion
    # https://arxiv.org/pdf/1511.02251.pdf
    # From theano
    #
    # RESULTS :
    # Accuracy = 26%
    # Listening : quite good, a bit too many notes, but harmonically consistent
    # Weights : static biases on output still bias toward negative values, but in a more structured way,
    #           i.e. some values around the most likely notes are high (even positives)
    # W is highly structured, but past influence is weaker and less contrasted than piano influence
    match = target * T.log(pred) / T.where(mean_notes_activation == 0, 1e-10, mean_notes_activation)
    not_match = (1.0 - target) * T.log(1.0 - pred) / T.where(mean_notes_activation == 1, 1e-10, (1 - mean_notes_activation))
    return -(match + not_match)
def get_output_for(self, inputs, **kwargs):
    '''
    First layer is a batch of matrices of embedding indices:
    Second layer are the corresponding embeddings:
    '''
    return \
        T.where(T.eq(inputs[0], 0), np.float32(0.0), np.float32(1.0)).dimshuffle((0, 1, 2, 'x')) * inputs[1]
def weighted_binary_cross_entropy_3(pred, target, mean_notes_activation):
    # Mix of 1 and 2
    # From theano
    #
    # RESULTS
    # Accuracy = 31%
    # Listening : not good, not harmonic, strange ranges...
    # Weights : static biases strongly biased toward negative values
    # W shows that past is neglected
    BATCH_SIZE = pred.shape[0]
    DIM = pred.shape[1]
    N_on_per_batch = T.transpose(T.tile(target.sum(axis=1), (DIM, 1))) + 1
    N_off_per_batch = T.transpose(T.tile((1 - target).sum(axis=1), (DIM, 1))) + 1
    mean_notes_on = T.tile(T.where(mean_notes_activation == 0, 1e-10, mean_notes_activation), (BATCH_SIZE, 1))
    mean_notes_off = T.tile(T.where(mean_notes_activation == 1, 1e-10, (1 - mean_notes_activation)), (BATCH_SIZE, 1))
    # +1 to avoid zero weighting
    return -(N_on_per_batch * target * T.log(pred) / mean_notes_on
             + N_off_per_batch * (1.0 - target) * T.log(1.0 - pred) / mean_notes_off)
def MMD_class_penalty(self, target, Xlabel):
    # List of 10 entries; holds, in order, the number of samples in each class.
    Num_c = T.sum(target, 0)
    D_num = Xlabel.shape[1]
    # C x Domain_num matrix; each cell counts how many samples of class c belong to domains 1, 2, 3, ...
    Number_label = T.sum(target.T[:, None, :] * Xlabel.T[None, :, :], 2)
    K_base = self.kern.RBF(self.cal, self.cal)
    # 10 x N x N: per-class Gram matrices, ignoring domains entirely.
    K_class, updates = theano.scan(fn=lambda a: ((K_base * a).T * a), sequences=[target.T])
    # Take the sum over each Gram matrix.
    K_allsum = T.sum(T.sum(K_class, -1), -1)
    # Each sum must be divided by the squared class count; if a class count happened to be 0
    # the division would blow up, so a switch guards against it.
    K_sum_tot, updates = theano.scan(
        fn=lambda a, b: T.switch(T.gt(b, 0), a / b**2, 0),
        sequences=[K_allsum, Num_c])
    # 10 x 3 x N x N (Gram matrices per class and per domain). Since these are not taken over all
    # domains, the Xlabel filter is applied from both sides; for each class we walk over its
    # domains in turn, hence the doubly nested scan.
    K_class_domain_cross, updates = theano.scan(fn=lambda c: theano.scan(
        fn=lambda a: ((c * a).T * a), sequences=[Xlabel.T]),
        sequences=[K_class])
    # Now organised per class and per domain (C x D); take the sum over each Gram matrix.
    K_allsum = T.sum(T.sum(K_class_domain_cross, -1), -1)
    # Division: if nobody belongs to a given class/domain combination, the corresponding entry of
    # the C x D_num Gram-matrix sum is already 0, and so is the matching denominator entry.
    # Substitute 1 for zero denominators; the numerator is 0 anyway, so the result is unaffected.
    Number_label2 = T.where(T.eq(Number_label, 0), 1, Number_label)
    K_class_sum = T.sum(K_allsum / (Number_label2**2))
    # Cross term between one domain of a class and all domains of that class, so the filter is
    # only applied from one side.
    K_class_domain_center_cross, updates = theano.scan(
        fn=lambda c: theano.scan(fn=lambda a: (c * a), sequences=[Xlabel.T]),
        sequences=[K_class])
    # Repeat the same processing as for the per-domain terms above.
    K_sum_cross = T.sum(T.sum(K_class_domain_center_cross, -1), -1)
    Number_label2 = T.where(T.eq(Number_label, 0), 1, Number_label)
    K_domain_cross_sum = T.sum(K_sum_cross / (Number_label2 * Num_c[:, None]))
    #z_switch = T.switch(T.lt(a, b), T.mean(x), T.mean(y))
    MMD_class = K_class_sum + T.sum(K_sum_tot) * D_num - 2 * K_domain_cross_sum
    return MMD_class
def lda_logp(rt, gaze, values, error_ll, v_index, tau_index, gamma_index, s_index, t0_index, zerotol): # compute drifts R = make_R(v[0, tt.cast(v_index, dtype='int32')][:, None], tau[0, tt.cast(tau_index, dtype='int32')][:, None], gamma[0, tt.cast(gamma_index, dtype='int32')][:, None], values, gaze, zerotol) glam_ll = tt_wienerrace_pdf( rt[:, None], R, s[0, tt.cast(s_index, dtype='int32')][:, None], b, t0[0, tt.cast(t0_index, dtype='int32')][:, None], zerotol) # mix likelihoods mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll) mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll) mixed_ll = tt.where(tt.isinf(mixed_ll), 0., mixed_ll) return tt.log(mixed_ll + zerotol)
def robust_expit(x):
    # Numerically stable logistic sigmoid: use 1 / (1 + exp(-x)) for positive x and
    # exp(x) / (1 + exp(x)) otherwise, so exp() is never taken of a large positive argument.
    def expit_p(z):
        return 1 / (1 + tt.exp(-z))

    def expit_n(z):
        exp_z = tt.exp(z)
        return exp_z / (1 + exp_z)

    return tt.where(x > 0, expit_p(x), expit_n(x))
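# Numpy sketch of the same two-branch trick (illustrative only, not part of the original code;
# expects a numpy array). For x > 0, 1 / (1 + exp(-x)) cannot overflow; for x <= 0,
# exp(x) / (1 + exp(x)) cannot overflow, so both branches stay finite.
import numpy as np

def robust_expit_np(x):
    out = np.empty_like(x, dtype=float)
    pos = x > 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    out[~pos] = np.exp(x[~pos]) / (1.0 + np.exp(x[~pos]))
    return out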
def focal_loss_fixed(y_true, y_pred):
    if K.backend() == "tensorflow":
        import tensorflow as tf
        pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        return -K.mean(alpha * K.pow(1. - pt, gamma) * K.log(pt))
    if K.backend() == "theano":
        import theano.tensor as T
        pt = T.where(T.eq(y_true, 1), y_pred, 1 - y_pred)
        return -K.mean(alpha * K.pow(1. - pt, gamma) * K.log(pt))
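# The quantity computed above is the focal loss of Lin et al. (2017),
#
#   FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t),   p_t = y_pred where y_true == 1, else 1 - y_pred,
#
# averaged over all entries; `alpha` and `gamma` are assumed to be captured from the enclosing scope.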
def censor_updates(self, updates):
    """
    Transition matrix should be non-negative
    """
    if self.W in updates:
        updated_W = updates[self.W]
        desired_W = tensor.where(updated_W < 0, self.W, updated_W)
        updates[self.W] = desired_W
    self.mlp.censor_updates(updates)
def cartesianToSpherical_plus_mu(a): """ Convert Cartesian to spherical coordinates. The input must be theano tensors. Note that the angle coordinates follow the astronomical convention of using elevation (declination, latitude) rather than its complement (pi/2-elevation), which is commonly used in the mathematical treatment of spherical coordinates. Parameters ---------- x - Cartesian vector component along the X-axis y - Cartesian vector component along the Y-axis z - Cartesian vector component along the Z-axis vx - Cartesian vector component of velocity along the Phi axis vy - Cartesian vector component of velocity along the Theta axis Returns ------- The spherical coordinates: longitude phi, latitude theta, parallax, proper motion phi, proper motion theta. NOTE THAT THE LONGITUDE ANGLE IS BETWEEN 0 AND +2PI. FOR r=0 AN EXCEPTION IS RAISED. """ x = a[:,0] y = a[:,1] z = a[:,2] vx = a[:,3] vy = a[:,4] rCylSq=x*x+y*y r=tt.sqrt(rCylSq+z*z) phi = tt.arctan2(y,x) phi = tt.where(phi<0.0, phi+2*np.pi, phi) theta = tt.arctan2(z,tt.sqrt(rCylSq)) #------- Velocity ------------------------------------ mu_phi = 1000.0*vx/(4.74*r) # Proper motion in mas/yr mu_theta = 1000.0*vy/(4.74*r) # Proper motion in mas/yr #-------- Units---------- phi = tt.rad2deg(phi) # Degrees theta = tt.rad2deg(theta) # Degrees plx = _auMasParsec/r # mas #------- Join ------ res = tt.stack([phi, theta ,plx, mu_phi, mu_theta],axis=1) return res
def lda_logp(rt, gaze, values, error_ll, zerotol):
    # compute drifts
    drift = glam.components.expdrift(v, tau, gamma, values, gaze, zerotol)
    glam_ll = glam.components.tt_wienerrace_pdf(
        rt[:, None], drift, s, b, t0, zerotol)

    # mix likelihoods
    mixed_ll = ((1 - p_error) * glam_ll + p_error * error_ll)
    mixed_ll = tt.where(tt.isnan(mixed_ll), 0., mixed_ll)
    return tt.sum(tt.log(mixed_ll + zerotol))
def tt_wienerpos_fpt_cdf(t, drift, noise, boundary, numerical_stability=100):
    """
    Cumulative distribution function of first passage times of
    a Wiener process with positive drift towards a constant boundary.

    Theano tensor implementation.
    Cf https://en.wikipedia.org/wiki/Inverse_Gaussian_distribution#Relationship_with_Brownian_motion
    """
    mu = boundary / drift
    lam = (boundary / noise)**2
    bounded_ratio = tt.where(lam / mu >= numerical_stability,
                             numerical_stability,
                             lam / mu)
    return (tt_normal_cdf(tt.sqrt(lam / t) * (t / mu - 1)) +
            tt.exp(2 * bounded_ratio) * tt_normal_cdf(-(tt.sqrt(lam / t) * (t / mu + 1))))
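# The expression above is the inverse-Gaussian first-passage-time CDF with mu = boundary / drift
# and lambda = (boundary / noise)^2:
#
#   F(t) = Phi( sqrt(lambda / t) * (t / mu - 1) )
#        + exp(2 * lambda / mu) * Phi( -sqrt(lambda / t) * (t / mu + 1) )
#
# where Phi is the standard normal CDF (tt_normal_cdf). The ratio lambda / mu is capped at
# `numerical_stability` so that exp(2 * lambda / mu) cannot overflow.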
def fn(x_t, h_tm1, c_tm1, hid_ref, mask_ref, V): x_ct = TT.dot(x_t, self.find('xh')) + self.find( 'b') # batch_size * h_size xi, xf, xc, xo = split(x_ct + TT.dot(h_tm1, self.find('hh'))) i_t = TT.nnet.sigmoid(xi + c_tm1 * self.find('ci')) f_t = TT.nnet.sigmoid(xf + c_tm1 * self.find('cf')) c_t = f_t * c_tm1 + i_t * TT.tanh(xc) o_t = TT.nnet.sigmoid(xo + c_t * self.find('co')) h_t = o_t * TT.tanh(c_t) #hid_p = TT.dot(h_t, V) # batch_size * size. hid_p = TT.dot(h_tm1, V) # batch_size * size. hid_p_dim = hid_p.dimshuffle(('x', 0, 1)) x_ts = TT.extra_ops.repeat(hid_p_dim, hid_ref.shape[0], axis=0) # mask_len * batch_size * size emb = x_ts * hid_ref # mask_len * batch_size * size. beta = TT.sum(emb, axis=-1) # mask_len * batch_size. beta_b = TT.where(mask_ref > 0, beta, beta.min()) beat_b = beta_b - beta_b.max(axis=0, keepdims=True) #beat_b = beta_b.clip(-50, 0) z = TT.exp(beta_b * mask_ref) * mask_ref z_sum = TT.sum(z, axis=0, keepdims=True) #z = theano.printing.Print('this is a very important value')(z) #z_sum = theano.printing.Print('this is a very important value')(z_sum) #alpha = (z * mask_ref ) / ( z * mask_ref ).sum(axis=0, keepdims=True) # max_len * batch_size. #alpha = z / TT.sum(z, axis = 0, keepdims = True) alpha = z / z_sum #if stage == 'train': # alpha_sample = self.h_sampling_mask * self.rng.multinomial(pvals = alpha.T, dtype = 'float32') \ # + (1. - self.h_sampling_mask) * alpha.T # alpha_sample = alpha_sample.T # logging.info('LSTMAtt: stage is %s, using the random.', stage) #elif stage == 'test': # argmax for prediction. # alpha_sample = TT.cast(TT.eq(TT.arange(alpha.shape[0])[:,None], \ # #alpha_sample = TT.cast(TT.eq(TT.arange(alpha.shape[0])[None,:], \ # TT.argmax(alpha,axis=0,keepdims=True)), theano.config.floatX) # logging.info('LSTMAtt: stage is %s, using the argmax.', stage) hid_ref_dim = hid_ref.dimshuffle( (2, 0, 1)) # emb_size * mask_len * batch_size #att = alpha * hid_ref_dim # now is size * max_len * batch_size att = hid_ref_dim * alpha # now is size * max_len * batch_size att = att.sum(axis=1) # size * batch_size return [h_t, c_t, alpha, att.T]
def build_background_detector(BATCH_SIZE=None, input_dim=(1000, 1000), filter_size=11, threshold=0.1):
    background_detector = OrderedDict()
    background_detector['input'] = InputLayer(
        (BATCH_SIZE, 1, input_dim[0], input_dim[1]))
    background_detector['background_detector'] = ConvLayer(
        background_detector['input'],
        num_filters=1,
        filter_size=filter_size,
        nonlinearity=lambda x: T.where(T.le(x, threshold), 0, 1),
        pad='same',
        W=1.0 / filter_size**2 * np.ones((1, 1, filter_size, filter_size)),
        b=None)
    return background_detector
def main(): repeats = 1 # per configuration relu = lambda x: T.where(x < 0., 0., x) runs = [ # tanh requires higher tau {'config': {'layers': [70], 'tau': (20, 4, 0.95), 'iters': 200, 'nonlin': relu}}, {'config': {'layers': [60], 'tau': (20, 4, 0.99), 'iters': 200, 'nonlin': T.tanh}}, {'config': {'layers': [70], 'tau': (20, 4, 0.99), 'iters': 200, 'nonlin': T.tanh}}, ] try: for run in runs: print 'Running configuration', run['config'] results = [] best = 0.0 # Do several runs with different seeds for i in range(repeats): d = TrainData(35022, 0.6, i) pred, best_iter = eca_missing_value_prediction(d, run['config']) # Store results acc = d.accuracy(pred) results = [acc] best = max(best, best_iter) print '%5.2f %%' % (acc * 100.), str(run['config']), 'iteration', i run['results'] = (results, best) print '-----------------' except KeyboardInterrupt: pass print 'Summary' print '-------' for run in runs: if 'results' not in run: continue res, best = run['results'] print "%5.2f +- %.2f (best: %5.2f): %s" % (100. * np.mean(res), 100. * np.sqrt(np.var(res)), 100. * best, str(run['config']),)
def likelihood(self, z, y):
    η = z.flatten(min(2, z.ndim)) + self.bias
    Δ = self.binsize

    # 1st part of the likelihood
    L1 = tt.dot(y, η)
    if z.ndim > 1:
        ndim = z.ndim - 1
        shp_z = z.shape[-ndim:]
        L1 = L1.reshape(shp_z, ndim=ndim)

    # 2nd part of the likelihood
    λ = self.invlink(z + self.bias)
    L2 = Δ * tt.sum(λ, axis=0)

    # constant factors
    c1 = tt.sum(y) * tt.log(Δ)
    c2 = -tt.sum(tt.where(y > 1, tt.gammaln(y + 1), 0.0))
    const = c1 - c2

    L = L1 - L2 + const
    return as_tensor_variable(L, name='logL')
def create_seg_wise_encoder_output(self, att, aligner=None): assert aligner,"please provide an inverted aligner!" t = self.base[0].output.shape[0] b = self.base[0].output.shape[1] att_with_first_index = T.concatenate([T.zeros((1,att.shape[1]))-numpy.float32(1),att],axis=0) #(N+1)B max_diff = T.cast(T.extra_ops.diff(att_with_first_index,axis=0).flatten().sort()[-1],'int32') reduced_index = aligner.reduced_index.repeat(max_diff).reshape((aligner.reduced_index.shape[0], aligner.reduced_index.shape[1],max_diff)) #NB(max_diff) att_wo_last_ind = att_with_first_index[:-1] #NB att_wo_last_ind +=numpy.int32(1) att_rep = att_wo_last_ind.repeat(max_diff).reshape((att_wo_last_ind.shape[0],att_wo_last_ind.shape[1],max_diff))#NB(max_diff) att_rep = T.switch(reduced_index>0, att_rep + T.arange(max_diff),T.zeros((1,),'float32')-numpy.float32(1)) att_rep = att_rep.dimshuffle(0,2,1) #N(max_diff)B reduced_index = reduced_index.dimshuffle(0,2,1) #N(max_diff)B att_rep = T.switch(reduced_index > 0,att_rep + (T.arange(b) * t),T.zeros((1,),'float32')-numpy.float32(1)) att_rep = att_rep.clip(0,(t*b-1)) diff_arr = att_with_first_index[1:]-att_with_first_index[:-1] diff_arr = diff_arr.clip(0,max_diff) - numpy.float32(1)#NB mask = diff_arr.dimshuffle(0,'x',1).repeat(max_diff,axis=1) - T.arange(max_diff).dimshuffle('x',0,'x') ind = T.cast(T.where(T.lt(mask,numpy.float32(0)),T.zeros((1,),'float32'),numpy.float32(1)),'int8') self.rec_transform_enc = att_rep self.rec_transform_index = ind
def get_output_for(self, inputs, **kwargs):
    '''
    First layer is a batch of embedding indices:
        [[11, 21, 43, 0, 0],
         [234, 543, 0, 0, 0],
         ...]
    Second layer are the embeddings:
        [[[.02, .01, ...], [.004, .005, ...], ..., [.0, .0, .0, ...], [.0, .0, .0, ...]],
         [[...], ...]]
    '''
    return \
        T.where(T.eq(inputs[0], 0), np.float32(0.0), np.float32(1.0)).dimshuffle((0, 1, 'x')) * inputs[1]
def forward_pass(self, orch_past, piano, batch_size): ################################################################ ################################################################ ################################################################ # Normalization by the number of notes # orch_past_norm = self.number_note_normalization_fun(orch_past) # piano_norm = self.number_note_normalization_fun(piano) # TEST : batch norm on the input # orch_past_norm = batch_norm(orch_past, (self.temporal_order, self.n_o)) # piano_norm = batch_norm(piano, (self.n_p,)) # orch_past_norm = orch_past piano_norm = piano ################################################################ ################################################################ ################################################################ # Time needs to be the first dimension orch_past_loop = orch_past_norm.dimshuffle((1, 0, 2)) # Initialization input_layer = [None]*(self.n_layer+1) input_layer[0] = orch_past_loop n_lm1 = self.n_o # Loop for layer, n_h in enumerate(self.n_hs): s_0 = T.zeros((batch_size, n_h), dtype=theano.config.floatX) # Infer hidden states s_seq, updates = theano.scan(fn=self.iteration, sequences=[input_layer[layer]], outputs_info=[s_0], non_sequences=[self.W_z[layer], self.U_z[layer], self.b_z[layer], self.W_r[layer], self.U_r[layer], self.b_r[layer], self.W_h[layer], self.U_h[layer], self.b_h[layer], n_lm1]) # Inputs for the next layer are the hidden units of the current layer input_layer[layer+1] = s_seq # Update dimension n_lm1 = n_h # Last hidden units last_hidden = input_layer[self.n_layer] # Orchestra representation is the last state of the topmost rnn orchestra_repr = last_hidden[-1] ################################################################ ################################################################ ################################################################ # Batch Normalization or no ?? # orchestra_repr_norm = batch_norm(orchestra_repr, (n_lm1,)) orchestra_repr_norm = orchestra_repr ################################################################ ################################################################ ################################################################ ################################################################ ################################################################ # Piano through a mlp ? 
piano_repr = T.nnet.sigmoid(T.dot(piano_norm, self.W_piano) + self.b_piano) ################################################################ ################################################################ ################################################################ ################################################################ # Sum or concatenate # concat_input = T.concatenate([orchestra_repr_norm, piano_repr], axis=1) concat_input = orchestra_repr_norm + self.sum_coeff * piano_repr ################################################################ ################################################################ # Last layer orch_pred_mean = T.nnet.sigmoid(T.dot(concat_input, self.W) + self.b) ################################################################ ################################################################ ################################################################ # Before sampling, we THRESHOLD orch_pred_mean_threshold = T.where(T.le(orch_pred_mean, self.threshold), 0, orch_pred_mean) ################################################################ ################################################################ ################################################################ # Sampling orch_pred = self.rng.binomial(size=orch_pred_mean_threshold.shape, n=1, p=orch_pred_mean_threshold, dtype=theano.config.floatX) return orch_pred_mean, orch_pred_mean_threshold, orch_pred, updates
def binary_entropy(predictions, targets):
    # Clip predictions away from 0 and 1 so the logs stay finite.
    predictions = T.where(T.lt(predictions, 0.01), 0.01, predictions)
    predictions = T.where(T.gt(predictions, 0.99), 0.99, predictions)
    loss = - targets * (T.log(predictions) - T.log(targets))
    loss -= (1. - targets) * (T.log(1. - predictions) - T.log(1. - targets))
    return loss
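# Minimal usage sketch (illustrative; assumes the binary_entropy above is in scope). Targets are
# soft probabilities, so the result is the pointwise KL-style cross-entropy between target and
# (clipped) prediction:
import numpy as np
import theano
import theano.tensor as T

preds, targs = T.matrices('preds', 'targs')
loss_fn = theano.function([preds, targs], binary_entropy(preds, targs), allow_input_downcast=True)
loss_fn(np.array([[0.2, 0.95]]), np.array([[0.1, 0.9]]))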
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): if not self.run_already: from theano import tensor as t import theano #Should really be a matrix for multiple outputs y = t.matrix(name='y') f = t.matrix(name='f') g = t.matrix(name='g') c = t.matrix(name='c') #ef = t.where(f > 18., f, t.log1p(t.exp(f))) #eg = t.where(g > 18., g, t.log1p(t.exp(g))) #ef = t.nnet.softplus(f) #eg = t.nnet.softplus(g) ef = t.exp(f) eg = t.exp(g) #In log(1+b) if b > 300, use log(b) as 1 isn't relevant anymore #inner_1 = (y/ef)**eg # Naively #inner = t.exp(eg*(t.log(y) - t.log(ef))) # do it in log space then exp, then do log1p #clip_log1p_inner_1 = t.where(inner_1 > 300, eg*(t.log(y) - t.log(ef)), t.log1p(inner_1)) #clip_log1p_inner = t.log1p(inner) inner = eg*(t.log(y) - t.log(ef)) # We are going to do log(1+a) which is log(1+exp(log a)) which is softplus(log a) where log a is stable! clip_log1p_inner = t.nnet.softplus(inner) #Full log likelihood before expectations #logpy_t = (1-c)*(+t.log(eg) - eg*t.log(ef) + (eg - 1)*t.log(y) - 2*clip_log1p_inner_1) + c*(-clip_log1p_inner_1) #logpy_t_1 = t.where(c, -clip_log1p_inner_1, t.log(eg) - eg*t.log(ef) + (eg - 1)*t.log(y) - 2*clip_log1p_inner_1) logpy_t = t.where(c, -clip_log1p_inner, t.log(eg) - eg*t.log(ef) + (eg - 1)*t.log(y) - 2*clip_log1p_inner) logpy_sum_t = t.sum(logpy_t) dF_df_t = theano.grad(logpy_sum_t, f) d2F_df2_t = 0.5*theano.grad(t.sum(dF_df_t), f) # This right? dF_dg_t = theano.grad(logpy_sum_t, g) d2F_dg2_t = 0.5*theano.grad(t.sum(dF_dg_t), g) # This right? self.logpy_func = theano.function([f,g,y,c],logpy_t) self.dF_df_func = theano.function([f,g,y,c],dF_df_t)#, mode='DebugMode') self.d2F_df2_func = theano.function([f,g,y,c],d2F_df2_t) self.dF_dg_func = theano.function([f,g,y,c],dF_dg_t) self.d2F_dg2_func = theano.function([f,g,y,c],d2F_dg2_t) self.run_already = True #funcs = [self.logpy_func, self.dF_df_func, self.d2F_df2_func, self.dF_dg_func, self.d2F_dg2_func] funcs = [self.logpy_func, self.dF_df_func, self.d2F_df2_func, self.dF_dg_func, self.d2F_dg2_func] D = Y.shape[1] mf, mg = m[:, :D], m[:, D:] vf, vg = v[:, :D], v[:, D:] c = Y_metadata['censored'] F = 0 # Could do analytical components here T = self.T #Need to get these now to duplicate the censored inputs for quadrature gh_x, gh_w = self._gh_points(T) Y_metadata_new= Y_metadata.copy() c = np.repeat(Y_metadata_new['censored'], gh_x.shape[0]**2, axis=0) ##Some little code to check the result numerically using quadrature #from scipy import integrate #i = 6 # datapoint index #def quad_func(fi, gi, yi, mgi, vgi, mfi, vfi,ci): ##x = safe_exp(-fi*safe_exp(gi))*yi**safe_exp(gi) #x = safe_exp(-fi*safe_exp(gi) + safe_exp(gi)*np.log(yi)) #log1px = np.log1p(x) ##return ((*-gammaln(np.exp(fi)) - gammaln(np.exp(gi)) + gammaln(np.exp(fi) + np.exp(gi))) #p(y|f,g) #return (((1-ci)*(-2*log1px) + ci*(-log1px)) #p(y|f,g) #* np.exp(-0.5*np.log(2*np.pi*vgi) - 0.5*((gi - mgi)**2)/vgi) #q(g) #* np.exp(-0.5*np.log(2*np.pi*vfi) - 0.5*((fi - mfi)**2)/vfi) #q(f) #) #quad_func_l = partial(quad_func, yi=Y[i], mgi=mg[i], vgi=vg[i], mfi=mf[i], vfi=vf[i], ci=Y_metadata['censored'][i]) #def integrl(gi): #return integrate.quad(quad_func_l, -30, 5, args=(gi))[0] #print "Numeric scipy F quad" #print integrate.quad(lambda fi: integrl(fi), -30, 5) #(F_quad, dF_dmf, dF_dvf, dF_dmg, dF_dvg) = self.quad2d(funcs=funcs, Y=Y, mf=mf, vf=vf, mg=mg, vg=vg, #gh_points=gh_points, exp_f=False, exp_g=False, c=c) (F_quad, dF_dmf, dF_dvf, dF_dmg, dF_dvg) = self.quad2d(funcs=funcs, Y=Y, mf=mf, vf=vf, mg=mg, vg=vg, 
gh_points=gh_points, exp_f=False, exp_g=False, c=c) #print "2d quad F quad" #print F_quad[i] F += F_quad #gprec = safe_exp(mg - 0.5*vg) dF_dmf += 0 #(1-c)*(-gprec) dF_dmg += 0 #(1-c)*(1 + gprec*(np.log(Y) - mf)) dF_dvf += 0 # ? dF_dvg += 0 # ? dF_dm = np.hstack((dF_dmf, dF_dmg)) dF_dv = np.hstack((dF_dvf, dF_dvg)) if np.any(np.isnan(F_quad)): print("We have a nan in F_quad") if np.any(np.isnan(dF_dmf)): print("We have a nan in dF_dmf") if np.any(np.isnan(dF_dmg)): print("We have a nan in dF_dmg") return F, dF_dm, dF_dv, None
def create_lasagne_network(num_features): # Create data for testing network dimensions x_sym = T.fmatrix('x_sym') y_sym = T.fmatrix('y_sym') # create test data X = np.random.rand((128* num_features)).astype('float32').reshape((-1, num_features)) y = np.random.rand((128)).astype('float32').reshape((-1, 1)) """ Create lasagne layers """ l_in = lasagne.layers.InputLayer((None, num_features),name='input') print "l_in shape: %s" % str((lasagne.layers.get_output(l_in, inputs={l_in: x_sym}).eval({x_sym: X}).shape)) l_hidden = lasagne.layers.DenseLayer(l_in, num_units=500, name='l_hidden',nonlinearity=None) print "l_output shape: %s" % str((lasagne.layers.get_output(l_hidden, inputs={l_in: x_sym}).eval({x_sym: X}).shape)) l_out = lasagne.layers.DenseLayer(l_hidden, num_units=1, name='l_out',nonlinearity=None) print "l_output shape: %s" % str((lasagne.layers.get_output(l_out, inputs={l_in: x_sym}).eval({x_sym: X}).shape)) output_train = lasagne.layers.get_output(l_out, inputs={l_in: x_sym},deterministic=False) print "output_train shape: %s" % str(output_train.eval({x_sym: X}).shape) out = output_train.flatten() total_cost = T.where(y_sym.flatten()>0, (out-y_sym.flatten())/(y_sym.flatten()),0)**2 print "total_cost: %s" % str((total_cost.eval({x_sym: X, y_sym: y})).shape) all_trainable_parameters = lasagne.layers.get_all_params([l_out], trainable=True) mean_cost = T.sqrt(T.mean(total_cost)) all_grads = T.grad(mean_cost, all_trainable_parameters) learning_rate = theano.shared(np.cast['float32'](0.01)) updates = lasagne.updates.adam(all_grads, all_trainable_parameters, learning_rate=learning_rate) """ Create theano functions to be used in the training loop and for making predictions """ train_func = theano.function([x_sym, y_sym], [mean_cost], updates=updates) test_func = theano.function([x_sym, y_sym], [mean_cost]) predict_func = theano.function([x_sym], [out]) # when the input X is a dict, the following definitions will allow LasagneNet to call train_func without # knowing the order of the inputs, using the syntax train_function(**X) def train_function(X, y): return train_func(X, y.reshape((-1,1))) def test_function(X, y): return test_func(X, y.reshape((-1,1))) def predict_function(X): return predict_func(X) return l_out, train_function, test_function, predict_function, learning_rate
def maxabs(t1, t2):
    # Element-wise: keep whichever of t1 / t2 has the larger absolute value (sign preserved).
    pos = T.where(t1 > t2, t1, t2)
    neg = T.where(-t1 > -t2, t1, t2)
    ret = T.where(pos >= -neg, pos, neg)
    return ret
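# Quick check (illustrative; assumes the maxabs above is in scope): per element, the value with
# the larger magnitude wins and keeps its sign.
import theano
import theano.tensor as T

t1, t2 = T.vectors('t1', 't2')
f = theano.function([t1, t2], maxabs(t1, t2), allow_input_downcast=True)
f([-3., 1.], [2., -0.5])   # -> array([-3.,  1.])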
def grad_(index, scores):
    ifnull = [T.zeros_like(p) for p in self.params.values()]
    g = IFEL(T.eq(self.idxs[index], self.NULL), ifnull,
             theano.grad(scores[index], self.params.values()))
    return [T.where(T.isnan(g_), T.zeros_like(g_), g_) for g_ in g]
def stable(x, stabilize=True):
    if stabilize:
        x = T.where(T.isnan(x), 1000., x)
        x = T.where(T.isinf(x), 1000., x)
    return x
def huber(x, eps):
    return T.where(abs(x) < eps, x**2 / (2 * eps), abs(x) - eps / 2.)
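# The two branches above implement the Huber loss with threshold eps:
#
#   huber(x) = x^2 / (2 * eps)     if |x| < eps
#            = |x| - eps / 2       otherwise
#
# Both branches meet at |x| = eps with value eps / 2, so the loss is continuous, quadratic near
# zero, and only linear in the tails.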