def make_accum_f(mode, var, op=None):
    import theano
    import theano.tensor as T
    dtype = var.dtype
    broadcastable = var.broadcastable
    bcast = broadcastable_string(broadcastable)
    ndim = var.ndim
    if mode == "avg_shared":
        import numpy as np
        arr = np.zeros([1] * ndim, dtype=dtype)
        s = theano.shared(arr, 's', broadcastable=broadcastable)
        y = T.scalar('avg_fac', dtype=dtype)
        name = make_name(mode, dtype, bcast, op)
        return theano.function([y], updates={s: s * y}, name=name)
    t_type = T.TensorType(dtype=dtype, broadcastable=broadcastable)
    x = t_type('accum').transfer(None)
    if mode == "reduce":
        y = t_type('slice').transfer(None)
        T_op = getattr(T, op)
        x_pad = T.shape_padaxis(x, axis=0)
        y_pad = T.shape_padaxis(y, axis=0)
        z = T_op(T.concatenate([x_pad, y_pad], axis=0), axis=0)
    elif mode == "gather":
        y = t_type('slice').transfer(None)
        z = T.concatenate([x, y])
    elif mode == "avg_output":
        y = T.scalar('avg_fac', dtype=dtype)
        z = x * y
    else:
        raise ValueError("Unrecognized mode: %s" % mode)
    name = make_name(mode, dtype, bcast, op)
    return theano.function([x, y], z.transfer(None), name=name)
def _step(self, st_s, t, onoise, inoise):
    on_t = onoise[:, :, t]
    in_t = inoise[:, :, t:t + 1]
    # get action
    at_s = self.predict(st_s)
    # obtain new steering variables
    A_t1 = self.aAction(st_s, at_s)
    # time-shift steerings 1 into the future:
    # (A(t-15), .., A(t)) -> (A(t-14), .., A(t+1))
    st_s3 = st_s[:, 1:].reshape(
        (st_s.shape[0], self.params_task['history'], 4))
    st1_s3 = T.set_subtensor(
        st_s3[:, :, :3],
        T.concatenate((st_s3[:, 1:, :3], T.shape_padaxis(A_t1, 1)), axis=1))
    xt1_s = T.concatenate(
        (st_s[:, :1], st1_s3.reshape((st_s.shape[0], st_s.shape[1] - 1))),
        axis=1)
    # obtain \delta R(t+1) by BNN
    xt1_s = xt1_s.reshape(
        (self.params['samples'],
         xt1_s.shape[0] // self.params['samples'],  # integer division keeps the shape integral
         xt1_s.shape[1]))
    drt1_s, vdrt1_s = self.model.predict(xt1_s, mode='symbolic',
                                         provide_noise=True, noise=in_t)
    drt1_s = drt1_s.reshape(
        (drt1_s.shape[0] * drt1_s.shape[1], drt1_s.shape[2]))
    vdrt1_s = vdrt1_s.reshape(
        (vdrt1_s.shape[0] * vdrt1_s.shape[1], vdrt1_s.shape[2]))
    # sample from output noise
    drt1_s = on_t * T.sqrt(vdrt1_s) + drt1_s
    # obtain R(t+1) by adding \delta R(t+1)
    rt1_s = st_s[:, -1:] + drt1_s[:, 0:1]
    # undo log-logit transformation to obtain unnormalized reward
    rew1 = 1. / (1. + T.exp(-rt1_s))  # undo logit
    rew1 = rew1 * (self.model.params['bounds'][3] -
                   self.model.params['bounds'][1]) + \
        self.model.params['bounds'][1]
    rew1 = T.exp(rew1) - 1
    # update time-embedding: R(t-15)..R(t) -> R(t-14)..R(t+1)
    st1_s3 = T.set_subtensor(
        st1_s3[:, :, 3:],
        T.concatenate((st1_s3[:, 1:, 3:], T.shape_padaxis(rt1_s, 1)), axis=1))
    st1_s = T.concatenate(
        (st_s[:, :1], st1_s3.reshape((st_s.shape[0], st_s.shape[1] - 1))),
        axis=1)
    return [st1_s, t + 1, rew1[:, 0]]
def apply(self, y, y_hat, x):
    predicted = y_hat.argmax(axis=1)
    # both expanded_y and expanded_y_hat are shape (cases, labels)
    expanded_y = tensor.extra_ops.to_one_hot(y, y_hat.shape[1])
    expanded_y_hat = tensor.extra_ops.to_one_hot(predicted, y_hat.shape[1])
    # pad vectors and elementwise multiply for (cases, labels, labels)
    expanded_confusion = (tensor.shape_padaxis(expanded_y, 2) *
                          tensor.shape_padaxis(expanded_y_hat, 1))
    # contract over cases: result is (labels, labels, x_dim)
    result = tensor.tensordot(expanded_confusion, x, axes=([0], [0]))
    return result
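# A minimal numpy sketch (added for illustration; not part of the original code)
# of the padded outer product used in apply() above: for each case, the product
# of the padded one-hot vectors is 1 exactly at (true label, predicted label),
# so contracting over cases accumulates a confusion matrix.
import numpy as np

y_true = np.array([0, 1])                                  # true labels, 2 cases
y_pred = np.array([1, 1])                                  # predicted labels
eye = np.eye(3)
conf = eye[y_true][:, :, None] * eye[y_pred][:, None, :]   # (cases, labels, labels)
assert conf[0, 0, 1] == 1 and conf[1, 1, 1] == 1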
def log_likelihood_values(self, x, y, location=0.0, scale=1.0):
    o = self.output(x)
    noise_variance = T.tile(
        T.shape_padaxis(T.exp(self.log_v_noise[0, :]) * scale**2, 0),
        [o.shape[0], o.shape[1], 1])
    location = T.tile(T.shape_padaxis(location, 0),
                      [o.shape[0], o.shape[1], 1])
    scale = T.tile(T.shape_padaxis(scale, 0), [o.shape[0], o.shape[1], 1])
    return -0.5 * T.log(2 * math.pi * noise_variance) - \
        0.5 * (o * scale + location -
               T.tile(T.shape_padaxis(y, 0),
                      [o.shape[0], 1, 1]))**2 / noise_variance
def __init__(self, rng, input, batch_size, latent_size, label_size,
             out_size, activation, W_z, W_y, b):
    # init parent class
    super(Marginalized_Decoder, self).__init__(rng=rng, input=input,
                                               latent_size=latent_size,
                                               out_size=out_size,
                                               activation=activation,
                                               W_z=W_z, b=b)
    # setup the params
    self.W_y = W_y

    # compute marginalized outputs
    labels_tensor = T.extra_ops.repeat(
        T.shape_padaxis(T.eye(n=label_size, m=label_size), axis=0),
        repeats=batch_size, axis=0)
    self.output = self.activation(
        T.extra_ops.repeat(
            T.shape_padaxis(T.dot(self.input, self.W_z), axis=1),
            repeats=label_size, axis=1) +
        T.dot(labels_tensor, self.W_y) + self.b)
def __init__(self, eta, cutpoints, *args, **kwargs):
    eta = tt.as_tensor_variable(floatX(eta))
    cutpoints = tt.concatenate(
        [tt.as_tensor_variable([0.0]), tt.as_tensor_variable(cutpoints)])
    cutpoints = tt.shape_padaxis(cutpoints, 0)
    eta = tt.shape_padaxis(eta, 1)
    p = softmax(cumsum(eta - cutpoints, axis=1))
    super().__init__(p=p, *args, **kwargs)
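# A hedged numpy sketch (not from the original source) of the broadcasting above:
# eta padded to (N, 1) minus cutpoints padded to (1, K) gives an (N, K) matrix;
# the row-wise softmax of its cumulative sums yields the category probabilities.
import numpy as np

eta = np.array([0.3, -1.2])[:, None]             # (N, 1)
cutpoints = np.array([0.0, 0.5, 1.5])[None, :]   # (1, K)
scores = np.cumsum(eta - cutpoints, axis=1)      # (N, K)
p = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
assert np.allclose(p.sum(axis=1), 1.0)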
def make_reduce_f(var, mode):
    dtype = var.dtype
    bcast = var.broadcastable
    t_type = T.TensorType(dtype=dtype, broadcastable=bcast)
    x = t_type('accum').transfer(None)
    y = t_type('slice').transfer(None)
    if mode == "gather":
        z = T.concatenate([x, y])
    else:
        T_op = getattr(T, mode)
        x_pad = T.shape_padaxis(x, axis=0)
        y_pad = T.shape_padaxis(y, axis=0)
        z = T_op(T.concatenate([x_pad, y_pad], axis=0), axis=0)
    name = mode + "_" + str(dtype) + broadcastable_string(bcast)
    return theano.function([x, y], z.transfer(None), name=name)
def get_conv_xy(layer, deterministic=True):
    w_np = layer.W.get_value()
    input_layer = layer.input_layer
    if layer.pad == 'same':
        input_layer = L.PadLayer(layer.input_layer,
                                 width=np.array(w_np.shape[2:]) // 2,
                                 batch_ndim=2)
    input_shape = L.get_output_shape(input_layer)
    max_x = input_shape[2] - w_np.shape[2]
    max_y = input_shape[3] - w_np.shape[3]
    # pick one random patch position per call
    srng = RandomStreams()
    patch_x = srng.random_integers(low=0, high=max_x)
    patch_y = srng.random_integers(low=0, high=max_y)
    x = L.get_output(input_layer, deterministic=deterministic)
    x = x[:, :, patch_x:patch_x + w_np.shape[2],
          patch_y:patch_y + w_np.shape[3]]
    x = T.flatten(x, 2)  # N,D
    w = layer.W
    if layer.flip_filters:
        w = w[:, :, ::-1, ::-1]
    w = T.flatten(w, outdim=2).T  # D,O
    y = T.dot(x, w)  # N,O
    if layer.b is not None:
        y += T.shape_padaxis(layer.b, axis=0)
    return x, y
def make_reduce_f(mode, dtype, ndim):
    t_type = T.TensorType(dtype=dtype, broadcastable=[False] * ndim)
    x = t_type('accum').transfer(None)
    y = t_type('slice').transfer(None)
    if mode == "gather":
        z = T.concatenate([x, y])
    else:
        T_op = getattr(T, mode)
        x_pad = T.shape_padaxis(x, axis=0)
        y_pad = T.shape_padaxis(y, axis=0)
        z = T_op(T.concatenate([x_pad, y_pad], axis=0), axis=0)
    name = mode + "_" + str(dtype)
    return theano.function([x, y], z.transfer(None), name=name,
                           allow_input_downcast=True)
def calc_poissonVal_negative_log_likelihood(data, recon, axis_to_sum=1):
    if axis_to_sum != 1:
        # addresses the case where we marginalize
        data = T.extra_ops.repeat(T.shape_padaxis(data, axis=1),
                                  repeats=recon.shape[1], axis=1)
    return T.sum(T.exp(recon) - data * recon, axis=axis_to_sum)
def week_modulation(
    new_cases_inferred,
    week_modulation_type="abs_sine",
    pr_mean_weekend_factor=0.7,
    pr_sigma_weekend_factor=0.2,
    week_end_days=(6, 7),
    model=None,
    save_in_trace=True,
):
    """
    Parameters
    ----------
    new_cases_inferred : tensor
        Time series of inferred new cases to modulate.
    week_modulation_type : str
        Either "abs_sine" (smooth weekly dip) or "step" (hard weekend mask).
    pr_mean_weekend_factor : float
        Prior mean of the weekend factor.
    pr_sigma_weekend_factor : float
        Prior sigma of the weekend factor.
    week_end_days : tuple
        ISO weekdays (1=Monday .. 7=Sunday) treated as weekend.
    model : optional
        Taken from the model context if None.
    save_in_trace : bool
        Whether to save the modulated cases as a Deterministic.

    Returns
    -------
    new_cases_inferred_eff : tensor
        The modulated time series.
    """
    model = modelcontext(model)
    shape_modulation = list(model.sim_shape)
    shape_modulation[0] -= model.sim_diff_data
    len_L2 = () if model.sim_ndim == 1 else model.sim_shape[1]

    week_end_factor, _ = hierarchical_normal(
        "weekend_factor",
        "sigma_weekend_factor",
        pr_mean=pr_mean_weekend_factor,
        pr_sigma=pr_sigma_weekend_factor,
        len_L2=len_L2,
    )
    if week_modulation_type == "step":
        modulation = np.zeros(shape_modulation[0])
        for i in range(shape_modulation[0]):
            date_curr = model.data_begin + datetime.timedelta(days=i)
            if date_curr.isoweekday() in week_end_days:
                modulation[i] = 1
    elif week_modulation_type == "abs_sine":
        offset_rad = pm.VonMises("offset_modulation_rad", mu=0, kappa=0.01)
        offset = pm.Deterministic("offset_modulation",
                                  offset_rad / (2 * np.pi) * 7)
        t = np.arange(shape_modulation[0]) - \
            model.data_begin.weekday()  # Sunday @ zero
        modulation = 1 - tt.abs_(tt.sin(t / 7 * np.pi + offset_rad / 2))

    if model.sim_ndim == 2:
        modulation = tt.shape_padaxis(modulation, axis=-1)

    multiplication_vec = np.ones(shape_modulation) - \
        (1 - week_end_factor) * modulation
    new_cases_inferred_eff = new_cases_inferred * multiplication_vec
    if save_in_trace:
        pm.Deterministic("new_cases", new_cases_inferred_eff)
    return new_cases_inferred_eff
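# A hedged numpy sketch (not in the original) of the "abs_sine" modulation above:
# 1 - |sin(pi * t / 7 + offset/2)| dips once per week, and the weekend factor
# controls how deep the multiplication vector dips.
import numpy as np

t = np.arange(14)
offset_rad = 0.0
modulation = 1 - np.abs(np.sin(t / 7 * np.pi + offset_rad / 2))
weekend_factor = 0.7
multiplication_vec = 1 - (1 - weekend_factor) * modulation
print(multiplication_vec.round(2))  # weekly dips toward weekend_factor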
def step(l, x_prev_sampled, x_prev_argmax, z, all_embeddings):
    x_prev_sampled_embedded = self.embedder(x_prev_sampled,
                                            all_embeddings)  # N * max(L) * E
    probs_sampled = self.get_probs(x_prev_sampled_embedded, z,
                                   all_embeddings, mode='all')  # N * max(L) * D
    x_sampled_one_hot = self.output_dist.get_samples(
        [T.shape_padaxis(probs_sampled[:, l], 1)])  # N * 1 * D
    x_sampled_l = T.argmax(x_sampled_one_hot, axis=-1).flatten()  # N
    x_current_sampled = T.set_subtensor(x_prev_sampled[:, l],
                                        x_sampled_l)  # N * max(L)

    x_prev_argmax_embedded = self.embedder(x_prev_argmax,
                                           all_embeddings)  # N * max(L) * E
    probs_argmax = self.get_probs(x_prev_argmax_embedded, z, all_embeddings,
                                  mode='all')  # N * max(L) * D
    x_argmax_l = T.argmax(probs_argmax[:, l], axis=-1)  # N
    x_current_argmax = T.set_subtensor(x_prev_argmax[:, l],
                                       x_argmax_l)  # N * max(L)

    return T.cast(x_current_sampled, 'int32'), \
        T.cast(x_current_argmax, 'int32')
def calc_realVal_negative_log_likelihood(data, recon, axis_to_sum=1):
    if axis_to_sum != 1:
        # addresses the case where we marginalize
        data = T.extra_ops.repeat(T.shape_padaxis(data, axis=1),
                                  repeats=recon.shape[1], axis=1)
    return .5 * T.sum((data - recon)**2, axis=axis_to_sum)
def output(self, x):
    x = T.tile(T.shape_padaxis(x, 0), [self.n_samples, 1, 1])
    for layer in self.layers:
        x = layer.output(x)
    return x
def get_dense_xy(layer, deterministic=True):
    x = L.get_output(L.FlattenLayer(layer.input_layer),
                     deterministic=deterministic)  # N, D
    w = layer.W  # D, O
    y = T.dot(x, w)  # (N,O)
    if layer.b is not None:
        y += T.shape_padaxis(layer.b, axis=0)
    return x, y
def gated_mean(x, p=0.5, axis=2):
    import theano.tensor as T
    # threshold interpolates between the mean and the max along `axis`
    # (the reductions below assume `axis` is the last axis of x)
    thres = T.shape_padaxis(
        (p * T.mean(x, axis=axis) + (1 - p) * T.max(x, axis=axis)), axis=-1)
    mask = T.ge(x, thres)
    g_values = mask * x
    g_means = T.sum(g_values, axis=-1) / T.sum(mask, axis=-1)
    return g_means
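# A minimal usage sketch (added; not part of the original code): gated_mean keeps
# only the values at or above a threshold interpolated between the mean and the
# max along `axis`, then averages them.
import numpy as np
import theano
import theano.tensor as T

x_sym = T.tensor3('x')
f = theano.function([x_sym], gated_mean(x_sym, p=0.5, axis=2))
out = f(np.random.rand(2, 3, 5).astype(theano.config.floatX))
print(out.shape)  # (2, 3): one gated mean per (row, column) slice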
def process(self, gstate, input_vector, dropout_masks=Ellipsis):
    """
    Process an input vector and update the edge strengths accordingly: for each
    (source, destination) node pair, the update stack proposes per-edge-type
    set/clear gates from the pair's states and the input vector.

    Params:
        gstate: A GraphState giving the current state
        input_vector: A tensor of the form (n_batch, input_width)
    """
    if dropout_masks is Ellipsis:
        dropout_masks = None
        append_masks = False
    else:
        append_masks = True

    # gstate.edge_states is of shape (n_batch, n_nodes, n_nodes, id+state)
    # combined input should be broadcasted to (n_batch, n_nodes, n_nodes, X)
    input_vector_part = T.shape_padaxis(T.shape_padaxis(input_vector, 1), 2)
    source_state_part = T.shape_padaxis(
        T.concatenate([gstate.node_ids, gstate.node_states], 2), 2)
    dest_state_part = T.shape_padaxis(
        T.concatenate([gstate.node_ids, gstate.node_states], 2), 1)
    full_input = broadcast_concat(
        [input_vector_part, source_state_part, dest_state_part], 3)

    # we flatten to process updates
    flat_input = full_input.reshape([-1, self._process_input_size])
    flat_result, dropout_masks = self._update_stack.process(flat_input,
                                                            dropout_masks)
    result = flat_result.reshape([gstate.n_batch, gstate.n_nodes,
                                  gstate.n_nodes,
                                  self._graph_spec.num_edge_types, 2])
    should_set = result[:, :, :, :, 0]
    should_clear = result[:, :, :, :, 1]
    new_strengths = gstate.edge_strengths * (1 - should_clear) + \
        (1 - gstate.edge_strengths) * should_set
    new_gstate = gstate.with_updates(edge_strengths=new_strengths)
    if append_masks:
        return new_gstate, dropout_masks
    else:
        return new_gstate
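# A hedged numpy sketch (not from the original) of the pairwise broadcast used
# in process() above: padding at axes 1 and 2 lets the per-batch input and the
# per-node states broadcast against every (source, destination) node pair.
import numpy as np

n_batch, n_nodes, width, state = 2, 5, 7, 4
inp = np.random.rand(n_batch, width)
node = np.random.rand(n_batch, n_nodes, state)
pair_inp = np.broadcast_to(inp[:, None, None, :],
                           (n_batch, n_nodes, n_nodes, width))
src = np.broadcast_to(node[:, :, None, :], (n_batch, n_nodes, n_nodes, state))
dst = np.broadcast_to(node[:, None, :, :], (n_batch, n_nodes, n_nodes, state))
full = np.concatenate([pair_inp, src, dst], axis=3)
print(full.shape)  # (2, 5, 5, 15)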
def output(self, x):
    x = T.tile(T.shape_padaxis(x, 0), [self.n_samples, 1, 1])
    x = T.concatenate((x, 0 * self.randomness_z[:, 0:x.shape[1], :]), 2)
    for layer in self.layers:
        x = layer.output(x)
    return x
def calc_binaryVal_negative_log_likelihood(data, probabilities, axis_to_sum=1):
    if axis_to_sum != 1:
        # addresses the case where we marginalize
        data = T.extra_ops.repeat(T.shape_padaxis(data, axis=1),
                                  repeats=probabilities.shape[1], axis=1)
    return -T.sum(data * T.log(probabilities) +
                  (1 - data) * T.log(1 - probabilities), axis=axis_to_sum)
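# A hedged numpy sketch (not in the original) of the marginalization branch
# shared by these likelihood helpers: data of shape (N, D) is padded to
# (N, 1, D) and repeated to match an (N, K, D) reconstruction, giving one
# NLL per component K.
import numpy as np

data = (np.random.rand(4, 6) > 0.5).astype(float)    # (N, D) binary data
probs = np.random.uniform(0.1, 0.9, size=(4, 3, 6))  # (N, K, D) reconstructions
data_rep = np.repeat(data[:, None, :], repeats=3, axis=1)
nll = -np.sum(data_rep * np.log(probs) +
              (1 - data_rep) * np.log(1 - probs), axis=2)
print(nll.shape)  # (N, K)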
def pt_forward_all(self, x, posit_x, mask):
    h0 = T.zeros((x.shape[1], self.n_out * 2), dtype=theano.config.floatX)
    padded = T.shape_padaxis(T.zeros_like(x[0]), axis=1).dimshuffle((1, 0, 2))
    x_shifted = T.concatenate([padded, x[:-1]], axis=0)
    padded_mask = T.shape_padaxis(T.zeros_like(mask[0]),
                                  axis=1).dimshuffle((1, 0))
    mask = T.concatenate([padded_mask, mask[:-1]],
                         axis=0).dimshuffle((0, 1, 'x'))
    o, _ = theano.scan(fn=self._forward,
                       sequences=[x, x_shifted, posit_x, mask],
                       outputs_info=[h0, None])
    new_probs = o[1].reshape((x.shape[0], x.shape[1]))
    return new_probs
def test14():
    x = T.iscalar('x')
    y = T.iscalar('y')
    z = T.arange(x)
    z = T.shape_padaxis(z, axis=1)
    z2 = T.zeros((x, y))
    z2 = z + z2
    fn = theano.function(inputs=[x, y], outputs=[z2],
                         allow_input_downcast=True)
    res = fn(3, 4)
    print(res, res[0].shape)
def prepare_toy_data(n_train, n_valid, batch_size):
    n_train_batches = n_train // batch_size if batch_size < n_train else 1
    n_valid_batches = n_valid // batch_size if batch_size < n_valid else 1

    rng = np.random.RandomState(1234)  # always return the same
    n_train_per_int = n_train // 2

    # interpolation on [-0.5, 0.0], extrapolation on [0.5, 1.0]
    # X_train = np.concatenate((
    #     rng.uniform(low=-1.0, high=-0.5, size=n_train_per_int),
    #     rng.uniform(low=0.0, high=0.5, size=n_train - n_train_per_int)
    # )).astype(floatX)
    # X_valid = rng.uniform(low=-1.0, high=0.5, size=n_valid).astype(floatX)
    X_train = np.asarray(rng.uniform(low=-1.0, high=0.5, size=n_train),
                         dtype=floatX)
    X_valid = np.asarray(rng.uniform(low=-1.0, high=1.0, size=n_valid),
                         dtype=floatX)
    y_train = np.asarray(
        # 0.4*np.sin(3 * 2*np.pi*X_train) + 0.05*rng.normal(size=n_train),
        0.4 * np.cos(2 * np.pi * X_train) ** 2
        * np.sin(2 * np.pi * X_train + 0.1)
        + 0.01 * rng.normal(size=n_train),
        dtype=floatX
    )
    y_valid = np.asarray(
        # 0.4*np.sin(3 * 2*np.pi*X_valid) + 0.05*rng.normal(size=n_valid),
        0.4 * np.cos(2 * np.pi * X_valid) ** 2
        * np.sin(2 * np.pi * X_valid + 0.1)
        + 0.01 * rng.normal(size=n_valid),
        dtype=floatX
    )

    X_train = T.shape_padaxis(theano.shared(X_train, name='X_train'), axis=1)
    y_train = theano.shared(y_train, name='y_train')
    X_valid = T.shape_padaxis(theano.shared(X_valid, name='X_valid'), axis=1)
    y_valid = theano.shared(y_valid, name='y_valid')
    # used in evaluation with multiple samples
    y_valid = np.array(y_valid.eval())

    return X_train, y_train, X_valid, y_valid, n_train_batches, n_valid_batches
def _get_split(self, layer, deterministic=True, conv_all_patches=True,
               **kwargs):
    # Get the patches and the outputs without the non-linearities.
    if type(layer) is L.DenseLayer:
        x, y = putils.get_dense_xy(layer, deterministic)
    elif type(layer) is L.Conv2DLayer:
        if conv_all_patches is True:
            x, y = putils.get_conv_xy_all(layer, deterministic)
        else:
            x, y = putils.get_conv_xy(layer, deterministic)
    else:
        raise ValueError("Unknown layer as input")

    # Create an output dictionary
    outputs = dict()

    for name, fun in subtypes:
        outputs[name] = dict()
        mrk_y = 1.0 * T.cast(fun(y), dtype=theano.config.floatX)  # (N,O)
        y_current = y * mrk_y  # This has a binary mask
        cnt_y = T.shape_padaxis(T.sum(mrk_y, axis=0), axis=0)  # (1,O)
        norm = T.maximum(cnt_y, 1.)

        # Count how many datapoints are considered
        outputs[name]['cnt'] = cnt_y

        # The mean of the current batch
        outputs[name]['m_y'] = T.shape_padaxis(
            y_current.sum(axis=0), axis=0) / norm  # (1,O) mean output for batch
        outputs[name]['m_x'] = T.dot(
            x.T, mrk_y) / norm  # (D,O) mean input for batch

        # The second moment of the current batch
        outputs[name]['yty'] = T.shape_padaxis(
            T.sum(y_current**2., axis=0), axis=0) / norm  # (1,O)
        outputs[name]['xty'] = T.dot(x.T, y_current) / norm  # D,O

    return dict_to_list(outputs)
def process(self, input_vector):
    """
    Convert an input vector into a categorical distribution across
    num_categories categories

    Params:
        input_vector: Vector of shape (n_batch, input_width)

    Returns: Categorical distribution of shape (n_batch, 1, num_categories),
        such that it sums to 1 across all categories for each instance in
        the batch
    """
    transformed = self._transform_stack.process(input_vector)
    return T.shape_padaxis(transformed, 1)
def predict(self, X_test, mode='numerical', provide_noise=False, noise=None):
    """
    Prediction wrapper-method

    Requires X_test to be [n_samples, n, d], so use
    np.tile(X_test, [samples, 1, 1]) before prediction.

    For policy search we use theano.scan. In that case we need to be able
    to feed in the input noise externally (provide_noise, noise).

    mode='symbolic' if we want to use this model as part of a larger graph
    (as in the policy search), mode='numerical' for standard predictions,
    using compiled functions
    """
    X_test_n = (X_test - self.mean_X) / self.std_X
    if mode == 'symbolic':
        if provide_noise:
            # X_test_n.shape[0] refers to the number of samples,
            # i.e. draws from the weight distribution
            m = self.bb_alpha.network.output_gn(X_test_n, noise,
                                                X_test_n.shape[0])
        else:
            m = self.bb_alpha.network.output(X_test_n, False,
                                             X_test_n.shape[0],
                                             use_indices=False)
        log_v_noise = self.bb_alpha.network.log_v_noise
        noise_variance = T.tile(T.shape_padaxis(T.exp(log_v_noise[0, :]), 0),
                                [m.shape[0], m.shape[1], 1])
    else:
        if X_test_n.ndim == 2:
            X_test_n = np.tile(X_test_n, [self.params['samples'], 1, 1])
        m = self.bb_alpha.fwpass(X_test_n, X_test_n.shape[0])
        log_v_noise = self.bb_alpha.network.log_v_noise.get_value()[0, :]
        noise_variance = np.tile(np.exp(log_v_noise),
                                 [m.shape[0], m.shape[1], 1])
    mt = m
    vt = noise_variance
    # TODO double check we don't need this?
    mt = mt * self.std_Y + self.mean_Y
    vt *= self.std_Y**2
    return mt, vt
def pretrain(self):
    bm = self.bm = T.imatrix('bm')
    padded = T.shape_padaxis(T.zeros_like(bm[0]), axis=1).dimshuffle((1, 0))
    bm_shift = T.concatenate([padded, bm[:-1]], axis=0)
    new_bm = T.cast(T.or_(bm, bm_shift), theano.config.floatX)
    new_probs = self.output_layer.forward_all(self.h_final, new_bm)
    cross_ent = T.nnet.binary_crossentropy(new_probs, new_bm) * self.masks
    self.obj = obj = T.mean(T.sum(cross_ent, axis=0))
    self.cost_g = obj * args.coeff_cost_scale + self.l2_cost
def process(self, input_vector):
    """
    Convert an input vector into a probabilistic set, i.e. a list of
    probabilities of item i being in the output set.

    Params:
        input_vector: Vector of shape (n_batch, input_width)

    Returns: Set distribution of shape (n_batch, 1, num_categories),
        where each value is independent from the others.
    """
    transformed = self._transform_stack.process(input_vector)
    return T.shape_padaxis(transformed, 1)
def _forward_all_sample(self, x, posit_x, h0):
    padded = T.shape_padaxis(T.zeros_like(x[0]), axis=1).dimshuffle((1, 0, 2))
    x_shifted = T.concatenate([padded, x[:-1]], axis=0)
    mask = T.zeros(shape=(x.shape[1],)).dimshuffle((0, 'x'))
    [s, _], updates = theano.scan(fn=self._forward_sample,
                                  sequences=[x, x_shifted, posit_x],
                                  outputs_info=[mask, h0])
    samples = theano.gradient.disconnected_grad(s).reshape(
        (x.shape[0], x.shape[1]))
    padded_mask = T.shape_padaxis(T.zeros_like(samples[0]),
                                  axis=1).dimshuffle((1, 0))
    mask_from_samples = T.concatenate([padded_mask, samples[:-1]],
                                      axis=0).dimshuffle((0, 1, 'x'))
    [_, probs], _ = theano.scan(
        fn=self._forward,
        sequences=[x, x_shifted, posit_x, mask_from_samples],
        outputs_info=[h0, None])
    return probs.reshape((x.shape[0], x.shape[1])), updates, samples
def _forward(self):
    if theano.config.device.startswith('gpu'):
        from theano.tensor.nnet.abstract_conv import bilinear_upsampling
    else:
        raise AssertionError('Bilinear interpolation requires GPU and cuDNN.')
    inpt = T.reshape(self.inpt, (self.inpt_depth, self.n_inpt,
                                 self.inpt_height, self.inpt_width))
    pre_res = bilinear_upsampling(input=inpt, ratio=self.up_factor)
    shuffle_res = pre_res.dimshuffle((2, 3, 0, 1))
    res = self._bilinear_upsampling_1D(inpt=shuffle_res, ratio=self.up_factor)
    self.output = res.dimshuffle((2, 3, 0, 1))
    self.output = T.shape_padaxis(self.output, axis=0)
    self.output = T.unbroadcast(self.output, 0)
def changing_weight2(self, v_sample):
    """
    Compute the transition probability of flipping spins for v_sample,
    i.e. Ts's = conj(psi(s',M)) / conj(psi(s,M)). For the transverse field
    Ising model the flipping term in the Hamiltonian is hf = h/2 (sp_i + sm_i),
    so flipping each site contributes the same energy h/2, but Ts's differs;
    one can therefore sum up Ts's over all s'.

    :param v_sample: one sample of the visible layer
    :param Hamiltonian: Hamiltonian of the physical system we concern,
        we mainly use Hamiltonian.h
    :pbc: periodic boundary condition, 1: periodic, 0: open
    """
    # As self.W_real has the size nvisible*nhidden and v_sample is a vector
    # of nvisible, one needs to pad axes to make them broadcastable.
    exponent = -2 * v_sample * self.vbias + \
        T.sum(T.log(T.cosh(self.hbias -
                           T.shape_padaxis(self.W_real, axis=0) *
                           T.shape_padaxis(v_sample, axis=-1))), axis=2) - \
        T.sum(T.log(T.cosh(self.hbias +
                           T.shape_padaxis(self.W_real, axis=0) *
                           T.shape_padaxis(v_sample, axis=-1))), axis=2)
    return T.sum(T.exp(exponent), axis=1)
def decode_to_probs(self, activations, relative_position, low_bound, high_bound):
    assert (low_bound % 12 == 0) and \
        (high_bound - low_bound == self.num_octaves * 12), \
        "Circle of thirds must evenly divide into octaves"
    squashed = T.reshape(activations, (-1, self.RAW_ENCODING_WIDTH))

    rsp = T.nnet.softmax(squashed[:, :3])
    c1 = T.nnet.softmax(squashed[:, 3:7])
    c2 = T.nnet.softmax(squashed[:, 7:10])
    octave_choice = T.nnet.softmax(squashed[:, 10:])
    octave_notes = T.tile(c1, (1, 3)) * T.tile(c2, (1, 4))
    full_notes = T.reshape(T.shape_padright(octave_choice) *
                           T.shape_padaxis(octave_notes, 1),
                           (-1, 12 * self.num_octaves))
    full_probs = T.concatenate([rsp[:, :2],
                                T.shape_padright(rsp[:, 2]) * full_notes], 1)

    newshape = T.concatenate([activations.shape[:-1],
                              [2 + high_bound - low_bound]], 0)
    fixed = T.reshape(full_probs, newshape, ndim=activations.ndim)
    return fixed
def process(self, gstate, dropout_masks=Ellipsis):
    """
    Process a graph state.
      1. Data is transferred from each node to each other node along both
         forward and backward edges. This data is processed with a Wx+b
         style update, and an optional transformation is applied
      2. Nodes sum the transferred data, weighted by the existence of the
         other node and the edge.
      3. Nodes perform a GRU update with this input

    Params:
        gstate: A GraphState giving the current state
    """
    if dropout_masks is Ellipsis:
        dropout_masks = None
        append_masks = False
    else:
        append_masks = True

    node_obs = T.concatenate([gstate.node_ids, gstate.node_states], 2)
    flat_node_obs = node_obs.reshape([-1, self._process_input_size])
    transformed, dropout_masks = self._transfer_stack.process(flat_node_obs,
                                                              dropout_masks)
    transformed = transformed.reshape([gstate.n_batch, gstate.n_nodes,
                                       2 * self._graph_spec.num_edge_types,
                                       self._transfer_size])
    scaled_transformed = transformed * \
        T.shape_padright(T.shape_padright(gstate.node_strengths))
    # scaled_transformed is of shape (n_batch, n_nodes, 2*num_edge_types,
    # transfer_size).
    # We want to multiply through by edge strengths, which are of shape
    # (n_batch, n_nodes, n_nodes, num_edge_types), both fwd and backward
    edge_strength_scale = T.concatenate(
        [gstate.edge_strengths, gstate.edge_strengths.swapaxes(1, 2)], 3)
    # edge_strength_scale is of (n_batch, n_nodes, n_nodes, 2*num_edge_types)
    intermed = T.shape_padaxis(scaled_transformed, 2) * \
        T.shape_padright(edge_strength_scale)
    # intermed is of shape (n_batch, n_nodes "source", n_nodes "dest",
    # 2*num_edge_types, transfer_size).
    # now reduce along the "source" and "edge_types" dimensions to get dest
    # activations of shape (n_batch, n_nodes, transfer_size)
    reduced_result = T.sum(T.sum(intermed, 3), 1)

    # now add information from current node id
    full_input = T.concatenate([gstate.node_ids, reduced_result], 2)

    # we flatten to apply GRU
    flat_input = full_input.reshape([-1, self._graph_spec.num_node_ids +
                                     self._transfer_size])
    flat_state = gstate.node_states.reshape(
        [-1, self._graph_spec.node_state_size])
    new_flat_state, dropout_masks = self._propagation_gru.step(flat_input,
                                                               flat_state,
                                                               dropout_masks)
    new_node_states = new_flat_state.reshape(gstate.node_states.shape)
    new_gstate = gstate.with_updates(node_states=new_node_states)
    if append_masks:
        return new_gstate, dropout_masks
    else:
        return new_gstate
def test13():
    x = T.fmatrix('x')
    x2 = T.zeros((4, 3, 5))
    y = T.shape_padaxis(x, axis=1)
    z = y
    z2 = y + x2
    fn = theano.function(inputs=[x], outputs=[z, z2],
                         allow_input_downcast=True)
    a = np.array([float(i) for i in range(20)]).reshape(4, 5)
    print(a)
    res, res2 = fn(a)
    print(res, res.shape)
    print(res2, res2.shape)
def __init__(self, rng, input, batch_size, in_size, latent_size,
             W_a=None, W_b=None, epsilon=0.01):
    self.srng = theano.tensor.shared_randomstreams.RandomStreams(
        rng.randint(999999))
    self.input = input

    # setup variational params
    if W_a is None:
        W_values = np.asarray(
            0.01 * rng.standard_normal(size=(in_size, latent_size - 1)),
            dtype=theano.config.floatX)
        W_a = theano.shared(value=W_values, name='W_a')
    if W_b is None:
        W_values = np.asarray(
            0.01 * rng.standard_normal(size=(in_size, latent_size - 1)),
            dtype=theano.config.floatX)
        W_b = theano.shared(value=W_values, name='W_b')
    self.W_a = W_a
    self.W_b = W_b

    # compute Kumaraswamy samples
    uniform_samples = T.cast(
        self.srng.uniform(size=(batch_size, latent_size - 1),
                          low=0.01, high=0.99),
        theano.config.floatX)
    self.a = Softplus(T.dot(self.input, self.W_a))
    self.b = Softplus(T.dot(self.input, self.W_b))
    v_samples = (1 - (uniform_samples**(1 / self.b)))**(1 / self.a)

    # setup variables for recursion
    stick_segment = theano.shared(
        value=np.zeros((batch_size,), dtype=theano.config.floatX),
        name='stick_segment')
    remaining_stick = theano.shared(
        value=np.ones((batch_size,), dtype=theano.config.floatX),
        name='remaining_stick')

    def compute_latent_vars(i, stick_segment, remaining_stick, v_samples):
        # compute stick segment
        stick_segment = v_samples[:, i] * remaining_stick
        remaining_stick *= (1 - v_samples[:, i])
        return (stick_segment, remaining_stick)

    (stick_segments, remaining_sticks), updates = theano.scan(
        fn=compute_latent_vars,
        outputs_info=[stick_segment, remaining_stick],
        sequences=T.arange(latent_size - 1),
        non_sequences=[v_samples], strict=True)

    self.avg_used_dims = T.mean(T.sum(remaining_sticks > epsilon, axis=0))
    self.latent_vars = T.transpose(
        T.concatenate([stick_segments,
                       T.shape_padaxis(remaining_sticks[-1, :], axis=1).T],
                      axis=0))

    self.params = [self.W_a, self.W_b]
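# A hedged numpy sketch (not in the original) of the stick-breaking recursion
# in the scan above: each fraction v takes a share of the remaining stick and
# the leftover mass becomes the final segment, so all segments sum to one.
import numpy as np

v = np.random.rand(6)              # stick-breaking fractions in (0, 1)
remaining = 1.0
segments = []
for vi in v:
    segments.append(vi * remaining)
    remaining *= (1 - vi)
segments.append(remaining)         # final segment: whatever stick is left
assert np.isclose(sum(segments), 1.0)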
def compute_output(self):
    # We compute the output mean
    self.Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    self.KzzInv = T.nlinalg.MatrixInversePSD()(self.Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    self.covCavityInv = self.KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / casting(self.n_points)
    self.covCavity = T.nlinalg.MatrixInversePSD()(self.covCavityInv)
    self.meanCavity = T.dot(
        self.covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    self.KzzInvcovCavity = T.dot(self.KzzInv, self.covCavity)
    self.KzzInvmeanCavity = T.dot(self.KzzInv, self.meanCavity)
    self.covPosteriorInv = self.KzzInv + LLt
    self.covPosterior = T.nlinalg.MatrixInversePSD()(self.covPosteriorInv)
    self.meanPosterior = T.dot(self.covPosterior, self.mParamPost)
    self.Kxz = compute_kernel(self.lls, self.lsf, self.input_means, self.z)
    self.B = T.dot(self.KzzInvcovCavity, self.KzzInv) - self.KzzInv
    v_out = T.exp(self.lsf) + T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                                    T.ones_like(self.z[:, 0:1]))

    if self.ignore_variances:
        self.output_means = T.dot(self.Kxz, self.KzzInvmeanCavity)
        self.output_vars = abs(v_out) + casting(0) * T.sum(self.input_vars)
    else:
        self.EKxz = compute_psi1(self.lls, self.lsf, self.input_means,
                                 self.input_vars, self.z)
        self.output_means = T.dot(self.EKxz, self.KzzInvmeanCavity)

        # In other layers we have to compute the expected variance
        self.B2 = T.outer(T.dot(self.KzzInv, self.meanCavity),
                          T.dot(self.KzzInv, self.meanCavity))

        exact_output_vars = True
        if exact_output_vars:
            # We compute the exact output variance
            self.psi2 = compute_psi2(self.lls, self.lsf, self.z,
                                     self.input_means, self.input_vars)
            ll = T.transpose(self.EKxz[:, None, :] * self.EKxz[:, :, None],
                             [1, 2, 0])
            kk = T.transpose(self.Kxz[:, None, :] * self.Kxz[:, :, None],
                             [1, 2, 0])
            v1 = T.transpose(T.sum(T.sum(T.shape_padaxis(self.B2, 2) *
                                         (self.psi2 - ll), 0),
                                   0, keepdims=True))
            v2 = T.transpose(T.sum(T.sum(T.shape_padaxis(self.B, 2) *
                                         (self.psi2 - kk), 0),
                                   0, keepdims=True))
        else:
            # We compute the approximate output variance using the
            # unscented Kalman filter
            v1 = 0
            v2 = 0
            n = self.input_d
            for j in range(1, n + 1):
                mask = T.zeros_like(self.input_vars)
                mask = T.set_subtensor(mask[:, j - 1], 1)
                inc = mask * T.sqrt(casting(n) * self.input_vars)
                self.kplus = T.sqrt(casting(1.0) / casting(2 * n)) * \
                    compute_kernel(self.lls, self.lsf,
                                   self.input_means + inc, self.z)
                self.kminus = T.sqrt(casting(1.0) / casting(2 * n)) * \
                    compute_kernel(self.lls, self.lsf,
                                   self.input_means - inc, self.z)
                v1 += T.dot(self.kplus * T.dot(self.kplus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v1 += T.dot(self.kminus * T.dot(self.kminus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kplus * T.dot(self.kplus, self.B),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kminus * T.dot(self.kminus, self.B),
                            T.ones_like(self.z[:, 0:1]))
            v1 -= T.dot(self.EKxz * T.dot(self.EKxz, self.B2),
                        T.ones_like(self.z[:, 0:1]))
            v2 -= T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                        T.ones_like(self.z[:, 0:1]))

        self.output_vars = abs(v_out) + abs(v2) + abs(v1)

    self.output_vars = self.output_vars + T.exp(self.lvar_noise)
    return
def expand_to_batch(x, batch_size, dim=-2):
    """Expand one dimension of `x` to `batch_size`."""
    return T.shape_padaxis(x, dim).repeat(batch_size, axis=dim)
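# A minimal usage sketch (added; not part of the original code):
import numpy as np
import theano
import theano.tensor as T

x_sym = T.matrix('x')                              # e.g. (L, E)
f = theano.function([x_sym], expand_to_batch(x_sym, 4))
out = f(np.ones((5, 3), dtype=theano.config.floatX))
print(out.shape)  # (5, 4, 3): new batch axis of size 4 inserted at dim=-2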
def calc_categoricalVal_negative_log_likelihood(data, probabilities,
                                                axis_to_sum=1):
    if axis_to_sum != 1:
        # addresses the case where we marginalize
        data = T.extra_ops.repeat(T.shape_padaxis(data, axis=1),
                                  repeats=probabilities.shape[1], axis=1)
    return -T.sum(data * T.log(probabilities), axis=axis_to_sum)