def output_probabilistic(self, m_x, v_x):
    m_linear = T.dot(m_x, self.m_W[ 0, :, : ]) + T.tile(self.m_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ])
    v_linear = T.dot(m_x**2, self.v_W[ 0, :, : ]) + T.dot(v_x, self.m_W[ 0, :, : ]**2) + T.dot(v_x, self.v_W[ 0, :, : ]) + \
        T.tile(self.v_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ])
    if not self.output_layer:
        # We compute the mean and variance after the ReLU activation
        alpha = m_linear / T.sqrt(v_linear)
        gamma = Network_layer.gamma(-alpha)
        gamma_robust = -alpha - 1.0 / alpha + 2.0 / alpha**3
        gamma_final = T.switch(T.lt(-alpha, T.fill(alpha, 30)), gamma, gamma_robust)
        v_aux = m_linear + T.sqrt(v_linear) * gamma_final
        m_a = Network_layer.n_cdf(alpha) * v_aux
        v_a = m_a * v_aux * Network_layer.n_cdf(-alpha) + Network_layer.n_cdf(alpha) * v_linear * (1 - gamma_final * (gamma_final + alpha))
        return (m_a, v_a)
    else:
        return (m_linear, v_linear)
def make_gaussian_filter(self):
    W_shape = self.get_W_shape()
    k = self.filter_size[0]
    k_low = int(np.floor(-(k-1)/2))
    k_high = k_low+k

    W_std = T.exp(self.W_logstd)
    std_array = T.tile(
        W_std.dimshuffle('x', 0, 'x'),
        (self.num_input_channels, 1, k)
    )

    x = np.arange(k_low, k_high).reshape((1, 1, -1))
    x = T.tile(
        x, (self.num_input_channels, self.num_input_channels, 1)
    ).astype(floatX)

    p1 = (1./(np.sqrt(2.*np.pi))).astype(floatX)
    p2 = np.asarray(2., dtype=floatX)
    gf = (p1/std_array)*T.exp(-x**2/(p2*(std_array**2)))
    # gf = gf.astype(theano.config.floatX)

    mask = np.zeros(W_shape)
    rg = np.arange(self.num_input_channels)
    mask[rg, rg, :] = 1
    mask = mask.astype(floatX)
    gf = gf*mask
    return gf
def setup_generate(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    n_batch, n_time = chord_roots.shape

    specs = [lstmstack.prepare_sample_scan(
        start_pos=T.alloc(np.array(encoding.STARTING_POSITION, np.int32), (n_batch)),
        start_out=T.tile(encoding.initial_encoded_form(), (n_batch, 1)),
        timestep=T.tile(T.arange(n_time), (n_batch, 1)),
        cur_chord_type=chord_types,
        cur_chord_root=chord_roots,
        deterministic_dropout=True
    ) for lstmstack, encoding in zip(self.lstmstacks, self.encodings)]

    updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(
        specs, self.lstmstacks, self.encodings, self.srng,
        n_batch, n_time, self.bounds, self.normalize_artic_only)

    self.generate_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))

    self.generate_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=[all_chosen, all_probs] + indiv_probs,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
def fwd(self, x, V, A, L):
    """
    x : signal
    V : eigenvectors
    A : area
    L : eigenvalues
    """
    V = V[:, :self.K]
    L = L[:self.K]
    L = L.dimshuffle('x', 'x', 0)
    rho = T.sqrt(T.sum(A))

    # Q x 1 x K, a window for each input function
    ghat = self.activation_interp(
        T.batched_dot(T.tile(L, [self.nin, 1, 1]), self.Winterp))
    # Q x K x N
    V_ = T.tile(V.dimshuffle('x', 1, 0), [self.nin, 1, 1])
    # Q x K x N
    tmp = (ghat * V).dimshuffle(0, 2, 1)
    transl = rho * T.batched_dot(V_.dimshuffle(0, 2, 1), tmp)
    transl = A.dimshuffle('x', 0, 'x') * transl
    # Q x K x N
    tmp = (V.dimshuffle(0, 'x', 1) * x.dimshuffle(0, 1, 'x')).dimshuffle(1, 2, 0)
    # Q x K x N
    desc = rho * T.batched_dot(tmp, transl)
    desc = T.abs_(desc)

    desc = desc.dimshuffle(2, 0, 'x', 1)  # BC01 format : N x Q x 1 x K
    return self.activation(theano.tensor.nnet.conv.conv2d(desc, self.W).flatten(2) + self.b)
def apply(self, v):
    [h_vals, _], _ = theano.scan(fn=self.step,
                                 sequences=v,
                                 outputs_info=[T.tile(self.h0, (v.shape[1], 1)),
                                               T.tile(self.c0, (v.shape[1], 1))])
    return h_vals
def lcn_3d_input(data, kernel_shape, n_maps):
    """
    :param data: [examples, depth, filters, height, width]
    :param kernel_shape: tuple of 3 ints (depth, height, width of the smoothing kernel)
    :param n_maps: int
    :return: new_x: [examples, depth, filters, height, width]
    """

    # create symbolic variable for the input data
    ftensor5 = T.TensorType('float32', [False] * 5)
    x = ftensor5()

    # # determine the number of maps
    # n_maps = data.shape[2]

    # create 3d filter that spans across all channels / feature maps
    # todo: kernel is not really in 3d; need 3d implementation instead of 2d repeated across third dimension
    # todo: alternative is to keep 2d kernel and extend short range given data size in z-plane; change first kernel_sh.
    filter_shape = (1, kernel_shape[0], n_maps, kernel_shape[1], kernel_shape[2])
    filters = np.resize(gaussian_filter(kernel_shape[1]), filter_shape)
    filters = filters / np.sum(filters)
    filters = sharedX(filters)

    # convolve filter with input signal
    convolution_out = conv3d(
        signals=x,
        filters=filters,
        signals_shape=data.shape,
        filters_shape=filter_shape,
        border_mode='valid'
    )

    # for each pixel, remove mean of 9x9 neighborhood
    mid_0 = int(np.floor(kernel_shape[0] / 2.))
    mid_1 = int(np.floor(kernel_shape[1] / 2.))
    mid_2 = int(np.floor(kernel_shape[2] / 2.))
    mean = T.tile(convolution_out, (1, 1, n_maps, 1, 1))
    padded_mean = T.zeros_like(x)
    padded_mean = T.set_subtensor(padded_mean[:, mid_0:-mid_0, :, mid_1:-mid_1, mid_2:-mid_2], mean)
    centered_data = data - padded_mean

    # scale down norm of 9x9 patch if norm is bigger than 1
    sum_sqr_xx = conv3d(signals=T.sqr(data), filters=filters)
    denominator = T.tile(T.sqrt(sum_sqr_xx), (1, 1, n_maps, 1, 1))
    padded_denominator = T.ones_like(x)
    padded_denominator = T.set_subtensor(
        padded_denominator[:, mid_0:-mid_0, :, mid_1:-mid_1, mid_2:-mid_2], denominator
    )
    per_img_mean = padded_denominator.mean(axis=[1, 2, 3, 4])
    divisor = T.largest(
        per_img_mean.dimshuffle(0, 'x', 'x', 'x', 'x'),
        padded_denominator
    )
    new_x = centered_data / T.maximum(1., divisor)

    # compile theano function
    f = theano.function([x], new_x)

    return f(data)
def est_log_part_fun(self):
    # init first visible data
    v_mean = T.nnet.softmax(self.base_vbias)[0]
    v_mean_rep = T.tile(v_mean, (self.numruns,)).reshape((self.numruns, self.model.num_vis))
    D = T.tile(T.sum(self.base_vbias, axis=0).dimshuffle('x'), (self.numruns,))

    v_samples, updates = theano.scan(fn=self.multinom_sampler, non_sequences=[v_mean_rep, D], n_steps=10)
    v = v_samples[-1]

    # init logw with beta = 0
    logw = - self.log_p_k(v, 0., D)

    [logw_list, vs, Ds], updates = theano.scan(self.ais_step,
                                               sequences=self.betas[1:],
                                               outputs_info=[logw, v, None])
    logw = logw_list[-1]
    v = vs[-1]
    D = Ds[-1]

    logw += self.log_p_k(v, 1, D)
    r = logsum(logw) - T.log(self.numruns)
    log_z_base = T.sum(T.log(1+T.exp(self.base_vbias))) + (self.model.num_hid)*T.log(2)
    log_z_est = r + log_z_base

    perform_fun = theano.function([], log_z_est, updates=updates)
    return perform_fun()
def recurrence(x_t, h_tm1, c_tm1):
    i = T.nnet.sigmoid(T.dot(x_t, self.wi) + T.dot(h_tm1, self.wih) + self.bi)                 # input gate
    c_proposed = T.tanh(T.dot(x_t, self.wc) + T.dot(h_tm1, self.wch) + self.bc)                # proposed memory cell content
    f = T.nnet.sigmoid(T.dot(x_t, self.wf) + T.dot(h_tm1, self.wfh) + self.bf)                 # forget gate
    c_t = (T.tile(i, self.memory_size) * c_proposed) + (T.tile(f, self.memory_size) * c_tm1)   # new memory cell content
    o = T.nnet.sigmoid(T.dot(x_t, self.wo) + T.dot(h_tm1, self.woh) + self.bo)                 # output gate
    h_t = T.tile(o, self.memory_size) * T.tanh(c_t)
    return [h_t, c_t]
def weighted_binary_cross_entropy_4(pred, target, class_normalization):
    # Mix of 0 and 2
    # From theano
    DIM = pred.shape[1]
    BATCH_SIZE = pred.shape[0]
    N_on_per_batch = (T.transpose(T.tile(target.sum(axis=1), (DIM, 1))) + 1)
    N_off_per_batch = (T.transpose(T.tile((1-target).sum(axis=1), (DIM, 1))) + 1)
    class_norm_tile = T.tile(class_normalization, (BATCH_SIZE, 1))
    return -(class_norm_tile * target * T.log(pred) / N_on_per_batch +
             (1.0 - target) * T.log(1.0 - pred) / N_off_per_batch)
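# Usage sketch (editor's illustration, not part of the original source): compile
# the weighted loss for a (batch, dim) prediction/target pair and a per-class
# weight vector. Assumes `import theano`, `import theano.tensor as T` and
# `import numpy as np`, as in the snippet above.
pred_sym = T.matrix('pred')        # predicted probabilities in (0, 1), shape (batch, dim)
target_sym = T.matrix('target')    # binary targets, shape (batch, dim)
class_w = T.vector('class_w')      # per-class normalization weights, shape (dim,)
loss = weighted_binary_cross_entropy_4(pred_sym, target_sym, class_w).mean()
loss_fn = theano.function([pred_sym, target_sym, class_w], loss, allow_input_downcast=True)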
def IRNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'):
    np.random.seed(1234)
    rng = np.random.RandomState(1234)

    x, y = initialize_data_nodes(loss_function, input_type, out_every_t)
    inputs = [x, y]

    h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    V = initialize_matrix(n_input, n_hidden, 'V', rng)
    W = theano.shared(np.identity(n_hidden, dtype=theano.config.floatX))
    out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng)
    hidden_bias = theano.shared(np.zeros((n_hidden,), dtype=theano.config.floatX))
    out_bias = theano.shared(np.zeros((n_output,), dtype=theano.config.floatX))
    parameters = [h_0, V, W, out_mat, hidden_bias, out_bias]

    def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, V, W, hidden_bias, out_mat, out_bias):
        if loss_function == 'CE':
            data_lin_output = V[x_t]
        else:
            data_lin_output = T.dot(x_t, V)

        h_t = T.nnet.relu(T.dot(h_prev, W) + data_lin_output + hidden_bias.dimshuffle('x', 0))

        if out_every_t:
            lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0)
            cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t)
        else:
            cost_t = theano.shared(NP_FLOAT(0.0))
            acc_t = theano.shared(NP_FLOAT(0.0))

        return h_t, cost_t, acc_t

    non_sequences = [V, W, hidden_bias, out_mat, out_bias]

    h_0_batch = T.tile(h_0, [x.shape[1], 1])

    if out_every_t:
        sequences = [x, y]
    else:
        sequences = [x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1])]

    outputs_info = [h_0_batch, theano.shared(NP_FLOAT(0.0)), theano.shared(NP_FLOAT(0.0))]

    [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence,
                                                                  sequences=sequences,
                                                                  non_sequences=non_sequences,
                                                                  outputs_info=outputs_info)

    if not out_every_t:
        lin_output = T.dot(hidden_states[-1, :, :], out_mat) + out_bias.dimshuffle('x', 0)
        costs = compute_cost_t(lin_output, loss_function, y)
    else:
        cost = cost_steps.mean()
        accuracy = acc_steps.mean()
        costs = [cost, accuracy]

    return inputs, parameters, costs
def mmd_full(x_t, y_t, alpha=0.5):
    """ Implementation of the full kernel MMD statistic (gaussian kernel)"""
    N = x_t.shape[1]
    M = y_t.shape[1]

    term1 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, N) - T.tile(x_t, N))))
    term2 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, M) - T.tile(y_t, N))))
    term3 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(y_t, M) - T.tile(y_t, M))))
    return term1 - 2 * term2 + term3
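# Usage sketch (editor's illustration, not part of the original source): compile
# the MMD statistic for two 1 x N / 1 x M rows of samples. Assumes
# `import theano`, `import theano.tensor as T`, `import numpy as np`, and a
# Theano version whose T.tile accepts a symbolic scalar `reps`, which the
# function above already relies on.
x_sym = T.matrix('x_t')
y_sym = T.matrix('y_t')
mmd_fn = theano.function([x_sym, y_sym], mmd_full(x_sym, y_sym, alpha=0.5),
                         allow_input_downcast=True)
print(mmd_fn(np.random.randn(1, 100), np.random.randn(1, 80)))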
def setup_generate(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    n_batch, n_time = chord_roots.shape

    spec = self.lstmstack.prepare_sample_scan(
        start_pos=T.alloc(np.array(self.encoding.STARTING_POSITION, np.int32), (n_batch)),
        start_out=T.tile(self.encoding.initial_encoded_form(), (n_batch, 1)),
        timestep=T.tile(T.arange(n_time), (n_batch, 1)),
        cur_chord_type=chord_types,
        cur_chord_root=chord_roots,
        deterministic_dropout=True
    )

    def _scan_fn(*inputs):
        # inputs is [ spec_sequences..., last_absolute_position, spec_taps..., spec_non_sequences... ]
        inputs = list(inputs)
        last_absolute_chosen = inputs.pop(len(spec.sequences))
        scan_rout = self.lstmstack.sample_scan_routine(spec, *inputs)
        last_rel_pos, last_out, cur_kwargs = scan_rout.send(None)
        new_pos = self.encoding.get_new_relative_position(last_absolute_chosen, last_rel_pos, last_out,
                                                          self.bounds.lowbound, self.bounds.highbound, **cur_kwargs)
        addtl_kwargs = {
            "last_output": last_out
        }
        out_activations = scan_rout.send((new_pos, addtl_kwargs))
        out_probs = self.encoding.decode_to_probs(out_activations, new_pos, self.bounds.lowbound, self.bounds.highbound)
        sampled_note = Encoding.sample_absolute_probs(self.srng, out_probs)
        encoded_output = self.encoding.note_to_encoding(sampled_note, new_pos, self.bounds.lowbound, self.bounds.highbound)
        scan_outputs = scan_rout.send(encoded_output)
        scan_rout.close()
        return [sampled_note, out_probs] + scan_outputs

    outputs_info = [{"initial": T.zeros((n_batch,), 'int32'), "taps": [-1]}, None] + spec.outputs_info
    result, updates = theano.scan(fn=_scan_fn, sequences=spec.sequences, non_sequences=spec.non_sequences, outputs_info=outputs_info)
    all_chosen = result[0].dimshuffle((1, 0))
    all_probs = result[1].dimshuffle((1, 0, 2))

    self.generate_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))

    self.generate_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=[all_chosen, all_probs],
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
def get_input_vectors(shape, phases, scaling, offset):
    x = T.repeat(offset[0] + T.arange(shape[0]) / scaling, shape[1] * phases).reshape(
        (shape[0], shape[1], phases)) * T.pow(2, T.arange(phases))
    y = T.repeat(T.tile(offset[1] + T.arange(shape[1]) / scaling, shape[0]).reshape(
        (shape[0], shape[1], 1)), phases, axis=2) * T.pow(2, T.arange(phases))
    z = T.tile(offset[2] + 10 * T.arange(phases), shape[0] * shape[1]).reshape((shape[0], shape[1], phases, 1))
    x = x.reshape((shape[0], shape[1], phases, 1))
    y = y.reshape((shape[0], shape[1], phases, 1))
    return T.concatenate([x, y, z], axis=3).reshape((shape[0] * shape[1] * phases, 3)).astype('float32')
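# Usage sketch (editor's illustration, not part of the original source): build
# the (height * width * phases, 3) coordinate matrix for a fixed offset. With a
# plain tuple offset the graph has no symbolic inputs, so it can be compiled and
# evaluated directly; assumes `import theano` and `import theano.tensor as T`.
vectors = get_input_vectors((32, 32), phases=4, scaling=16.0, offset=(0.0, 0.0, 0.0))
grid = theano.function([], vectors)()  # numpy array of shape (32 * 32 * 4, 3)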
def initial_states(self, batch_size, *args, **kwargs):
    states_dict = self.fst.expand({self.fst.fst.start: 0.0})
    states = tensor.as_tensor_variable(
        self.transition.pad(states_dict.keys(), NOT_STATE))
    states = tensor.tile(states[None, :], (batch_size, 1))
    weights = tensor.as_tensor_variable(
        self.transition.pad(states_dict.values(), 0))
    weights = tensor.tile(weights[None, :], (batch_size, 1))
    add = self.probability_computer(states, weights)
    return states, weights, add
def _loopoverallball(self, ballid, batchid):
    ox = self.middle[batchid][ballid*2].reshape((1, 1))
    print "ox:", ox.ndim
    x = T.tile(ox, (self.height, self.width))
    oy = self.middle[batchid][ballid*2+1].reshape((1, 1))
    y = T.tile(oy, (self.height, self.width))
    w = T.tile(T.arange(0, self.width), (self.height,)).reshape((self.height, self.width))
    h = T.tile(T.arange(0, self.height).reshape((self.height, 1)), (1, self.width))
    cof = (T.pow(x-w, 2) + T.pow(y-h, 2)) * (-1.0/self.sigma)
    print T.exp(cof).ndim
    return T.exp(cof)
def __init__(self, rng, input, num_filters, input_shape):
    self.K = num_filters
    self.N = input_shape[2] * input_shape[3]
    self.D = input_shape[1]
    self.B = input_shape[0]
    self.input = input

    filter_shape = (self.K, self.D, 1, 1)
    fan_in = numpy.prod(filter_shape[1:])
    fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]))
    W_bound = numpy.sqrt(6. / (fan_in + fan_out))
    self.W = theano.shared(
        numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
            dtype=theano.config.floatX
        ),
        borrow=True
    )

    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, borrow=True)

    c_bound = numpy.sqrt(1. / (self.K * self.D))
    self.c = theano.shared(
        numpy.asarray(
            rng.uniform(low=-c_bound, high=c_bound, size=(self.K, self.D)),
            dtype=theano.config.floatX
        ),
        borrow=True
    )

    conved = conv2d(input, self.W, input_shape=input_shape, filter_shape=filter_shape)
    conved = conved + self.b.dimshuffle('x', 0, 'x', 'x')
    conved = conved.reshape((self.B, self.K, self.N))
    a = self.softmax3d(conved)

    x = input.reshape((self.B, self.D, self.N))
    v = theano.shared(numpy.zeros((self.B, self.K, self.D), dtype=theano.config.floatX))
    for k in range(self.K):
        ar = T.tile(a[:, k], (1, self.D)).reshape((self.B, self.D, self.N))
        cr = T.tile(self.c[k].reshape((1, self.D, 1)), (self.B, 1, self.N))
        vr = (ar*(x+cr)).sum(2)
        g = T.sqrt((vr**2).sum(1))  # add eps?
        v = T.set_subtensor(v[:, k, :], vr/T.tile(g.reshape((self.B, 1)), (1, self.D)))
    # v = v/T.sqrt((v**2).sum())  # whole normalize

    self.output = v
    self.params = [self.W, self.b, self.c]
def apply(self, v):
    if self.n_batch == 1:
        h_init = [T.tile(self.h0, (v.shape[1], 1)),
                  T.tile(self.c0, (v.shape[1], 1))]
    else:
        h_init = [self.h0, self.c0]
    [h_vals, _], _ = theano.scan(fn=self.step,
                                 sequences=v,
                                 outputs_info=h_init)
    return h_vals
def get_output_for(self, input, get_details=False, **kwargs):

    input = input.dimshuffle(1, 0, 2)

    def step(x_t, M_tm1, h_tm1, state_tm1, ww_tm1, wr_tm1, *params):
        # Update the memory (using w_tm1 of the writing heads & M_tm1)
        M_t = self.write_heads.write(h_tm1, ww_tm1, M_tm1)

        # Get the read vector (using w_tm1 of the reading heads & M_t)
        r_t = self.read_heads.read(wr_tm1, M_t)

        # Apply the controller (using x_t, r_t & the requirements for the controller)
        h_t, state_t = self.controller.step(x_t, r_t, h_tm1, state_tm1)

        # Update the weights (using h_t, M_t & w_tm1)
        ww_t = self.write_heads.get_weights(h_t, ww_tm1, M_t)
        wr_t = self.read_heads.get_weights(h_t, wr_tm1, M_t)

        return [M_t, h_t, state_t, ww_t, wr_t]

    memory_init = T.tile(self.memory.memory_init, (input.shape[1], 1, 1))
    memory_init = T.unbroadcast(memory_init, 0)

    write_weights_init = T.tile(self.write_heads.weights_init, (input.shape[1], 1, 1))
    write_weights_init = T.unbroadcast(write_weights_init, 0)

    read_weights_init = T.tile(self.read_heads.weights_init, (input.shape[1], 1, 1))
    read_weights_init = T.unbroadcast(read_weights_init, 0)

    non_seqs = self.controller.get_params() + self.memory.get_params() + \
        self.write_heads.get_params() + self.read_heads.get_params()

    hids, _ = theano.scan(
        fn=step,
        sequences=input,
        outputs_info=[memory_init] + self.controller.outputs_info(input.shape[1]) +
                     [write_weights_init, read_weights_init],
        non_sequences=non_seqs,
        strict=True)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    if get_details:
        hid_out = [
            hids[0].dimshuffle(1, 0, 2, 3),
            hids[1].dimshuffle(1, 0, 2),
            hids[2].dimshuffle(1, 0, 2),
            hids[3].dimshuffle(1, 0, 2, 3),
            hids[4].dimshuffle(1, 0, 2, 3)]
    else:
        if self.only_return_final:
            hid_out = hids[1][-1]
        else:
            hid_out = hids[1].dimshuffle(1, 0, 2)

    return hid_out
def log_f_hat(self):
    v_W = 1.0 / (1.0 / self.N * (1.0 / self.v_W - 1.0 / self.v_prior))
    m_W = 1.0 / self.N * self.m_W / self.v_W * v_W
    v_b = 1.0 / (1.0 / self.N * (1.0 / self.v_b - 1.0 / self.v_prior))
    m_b = 1.0 / self.N * self.m_b / self.v_b * v_b

    log_f_hat_W = T.sum(-0.5 * T.tile(1.0 / v_W, [ self.n_samples, 1, 1 ]) * self.W**2 + \
        T.tile(m_W / v_W, [ self.n_samples, 1, 1 ]) * self.W, axis = [ 1, 2 ], keepdims = True)[ :, :, 0 ]
    log_f_hat_b = T.sum(-0.5 * T.tile(1.0 / v_b, [ self.n_samples, 1, 1 ]) * self.b**2 + \
        T.tile(m_b / v_b, [ self.n_samples, 1, 1 ]) * self.b, axis = [ 1, 2 ], keepdims = True)[ :, :, 0 ]

    return log_f_hat_W + log_f_hat_b
def getKMeansLoss(self, latent_space_expression, soft_assignments, t_cluster_centers, num_clusters,
                  latent_space_dim, num_samples, soft_loss=False):
    # Kmeans loss = weighted sum of latent space representation of inputs from the cluster centers
    z = latent_space_expression.reshape((num_samples, 1, latent_space_dim))
    z = T.tile(z, (1, num_clusters, 1))
    u = t_cluster_centers.reshape((1, num_clusters, latent_space_dim))
    u = T.tile(u, (num_samples, 1, 1))
    distances = (z - u).norm(2, axis=2).reshape((num_samples, num_clusters))
    if soft_loss:
        weighted_distances = distances * soft_assignments
        loss = weighted_distances.sum(axis=1).mean()
    else:
        loss = distances.min(axis=1).mean()
    return loss
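# Shape sketch (editor's illustration, not part of the original source): the same
# hard k-means loss computed in NumPy, to make the tiling/broadcasting explicit:
# mean over samples of the distance to the nearest cluster centre.
import numpy as np
z_np = np.random.randn(4, 2)   # 4 samples in a 2-D latent space
u_np = np.random.randn(3, 2)   # 3 cluster centres
distances_np = np.linalg.norm(z_np[:, None, :] - u_np[None, :, :], axis=2)  # (4, 3)
hard_loss = distances_np.min(axis=1).mean()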
def update_sample_weights(self):

    # We update the mean and variances of q

    self.v_W = self.v_prior * self.logistic(self.log_var_param_W)
    self.m_W = self.mean_param_W

    self.v_b = self.v_prior * self.logistic(self.log_var_param_b)
    self.m_b = self.mean_param_b

    # We update the random samples for the network weights

    self.W = self.randomness_W * T.tile(T.sqrt(self.v_W), [ self.n_samples, 1, 1 ]) + T.tile(self.m_W, [ self.n_samples, 1, 1 ])
    self.b = self.randomness_b * T.tile(T.sqrt(self.v_b), [ self.n_samples, 1, 1 ]) + T.tile(self.m_b, [ self.n_samples, 1, 1 ])
def compute_log_averaged_ei(self, x, X, randomness, incumbent):

    # We compute the old predictive mean at x

    Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + T.eye(self.z.shape[ 0 ]) * self.jitter * T.exp(self.lsf)
    KzzInv = T.nlinalg.MatrixInversePSD()(Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    covCavityInv = KzzInv + LLt * casting(self.n_points - self.set_for_training) / casting(self.n_points)
    covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv)
    meanCavity = T.dot(covCavity, casting(self.n_points - self.set_for_training) / casting(self.n_points) * self.mParamPost)
    KzzInvmeanCavity = T.dot(KzzInv, meanCavity)
    Kxz = compute_kernel(self.lls, self.lsf, x, self.z)
    m_old_x = T.dot(Kxz, KzzInvmeanCavity)

    # We compute the old predictive mean at X

    KXz = compute_kernel(self.lls, self.lsf, X, self.z)
    m_old_X = T.dot(KXz, KzzInvmeanCavity)

    # We compute the required cross covariance matrices

    KXX = compute_kernel(self.lls, self.lsf, X, X) - T.dot(T.dot(KXz, KzzInv), KXz.T) + T.eye(X.shape[ 0 ]) * self.jitter * T.exp(self.lsf)
    KXXInv = T.nlinalg.MatrixInversePSD()(KXX)

    KxX = compute_kernel(self.lls, self.lsf, x, X)
    xX = T.concatenate([ x, X ], 0)
    KxXz = compute_kernel(self.lls, self.lsf, xX, self.z)
    KxX = KxX - T.dot(T.dot(KxXz[ 0 : x.shape[ 0 ], : ], KzzInv), KxXz[ x.shape[ 0 ] : xX.shape[ 0 ], : ].T)

    # We compute the new posterior mean

    samples_internal = T.dot(MatrixChol()(KXX), randomness)

    new_predictive_mean = T.tile(m_old_x, [ 1, randomness.shape[ 1 ] ]) + T.dot(KxX, T.dot(KXXInv, samples_internal))

    # We compute the new posterior variance

    z_expanded = T.concatenate([ self.z, X ], 0)
    Kxz_expanded = compute_kernel(self.lls, self.lsf, x, z_expanded)
    Kzz_expanded = compute_kernel(self.lls, self.lsf, z_expanded, z_expanded) + T.eye(z_expanded.shape[ 0 ]) * self.jitter * T.exp(self.lsf)
    Kzz_expandedInv = T.nlinalg.MatrixInversePSD()(Kzz_expanded)
    v_out = T.exp(self.lsf) - T.dot(Kxz_expanded * T.dot(Kxz_expanded, Kzz_expandedInv), T.ones_like(z_expanded[ : , 0 : 1 ]))
    new_predictive_var = T.tile(v_out, [ 1, randomness.shape[ 1 ] ])

    s = (incumbent - new_predictive_mean) / T.sqrt(new_predictive_var)

    log_ei = T.log((incumbent - new_predictive_mean) * ratio(s) + T.sqrt(new_predictive_var)) + log_n_pdf(s)

    return T.mean(LogSumExp(log_ei, 1), 1)
def _transform_thin_plate_spline(
        dest_offsets, input, right_mat, L_inv, source_points, out_height,
        out_width, precompute_grid, downsample_factor):

    num_batch, num_channels, height, width = input.shape
    num_control_points = source_points.shape[1]

    # reshape destination offsets to be (num_batch, 2, num_control_points)
    # and add to source_points
    dest_points = source_points + T.reshape(
        dest_offsets, (num_batch, 2, num_control_points))

    # Solve as in ref [2]
    coefficients = T.dot(dest_points, L_inv[:, 3:].T)

    if precompute_grid:
        # Transform each point on the source grid (image_size x image_size)
        right_mat = T.tile(right_mat.dimshuffle('x', 0, 1), (num_batch, 1, 1))
        transformed_points = T.batched_dot(coefficients, right_mat)
    else:
        # Transformed grid
        out_height = T.cast(height / downsample_factor[0], 'int64')
        out_width = T.cast(width / downsample_factor[1], 'int64')
        orig_grid = _meshgrid(out_height, out_width)
        orig_grid = orig_grid[0:2, :]
        orig_grid = T.tile(orig_grid, (num_batch, 1, 1))

        # Transform each point on the source grid (image_size x image_size)
        transformed_points = _get_transformed_points_tps(
            orig_grid, source_points, coefficients, num_control_points,
            num_batch)

    # Get out new points
    x_transformed = transformed_points[:, 0].flatten()
    y_transformed = transformed_points[:, 1].flatten()

    # dimshuffle input to (bs, height, width, channels)
    input_dim = input.dimshuffle(0, 2, 3, 1)
    input_transformed = _interpolate(
        input_dim, x_transformed, y_transformed, out_height, out_width)

    output = T.reshape(input_transformed,
                       (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
    return output
def decode_to_probs(self, activations, relative_position, low_bound, high_bound):
    assert (low_bound%12==0) and (high_bound-low_bound == self.num_octaves*12), "Circle of thirds must evenly divide into octaves"
    squashed = T.reshape(activations, (-1, self.RAW_ENCODING_WIDTH))

    rsp = T.nnet.softmax(squashed[:, :3])
    c1 = T.nnet.softmax(squashed[:, 3:7])
    c2 = T.nnet.softmax(squashed[:, 7:10])
    octave_choice = T.nnet.softmax(squashed[:, 10:])
    octave_notes = T.tile(c1, (1, 3)) * T.tile(c2, (1, 4))
    full_notes = T.reshape(T.shape_padright(octave_choice) * T.shape_padaxis(octave_notes, 1), (-1, 12*self.num_octaves))
    full_probs = T.concatenate([rsp[:, :2], T.shape_padright(rsp[:, 2])*full_notes], 1)

    newshape = T.concatenate([activations.shape[:-1], [2+high_bound-low_bound]], 0)

    fixed = T.reshape(full_probs, newshape, ndim=activations.ndim)
    return fixed
def logp_theano_claims(l, nObs, T, Z, L, X, O_on):

    #O_on = O_on.astype(np.bool)

    # tempVec is 1-X*Z
    tempVec = (1. - X.reshape((nObs, 1, X.shape[1]))*(Z.T).reshape((1, Z.shape[1], Z.shape[0])))
    # Add the contribution from O = 1
    logLike = TT.log(1-(1-TT.tile(L[np.newaxis, :], (nObs, 1))[O_on.nonzero()])*TT.prod(tempVec[O_on.nonzero()], axis=1, no_zeros_in_input=True)).sum()
    #logLike = TT.log(1-(1-TT.tile(L[np.newaxis,:],(nObs,1))[O_on.nonzero()])*tempVec[O_on.nonzero()].prod(axis=1,no_zeros_in_input=True)).sum()
    #logLike = TT.log(1-(1-TT.tile(L[np.newaxis,:],(nObs,1))[O_on.nonzero()])*tempVec[O_on.nonzero()].prod(axis=1)).sum()
    # Add the contribution from O = 0
    logLike += TT.log((1-TT.tile(L[np.newaxis, :], (nObs, 1))[(1-O_on).nonzero()])*TT.prod(tempVec[(1-O_on).nonzero()], axis=1, no_zeros_in_input=True)).sum()
    #logLike += TT.log((1-TT.tile(L[np.newaxis,:],(nObs,1))[(1-O_on).nonzero()])*tempVec[(1-O_on).nonzero()].prod(axis=1)).sum()

    return logLike
def def_invert(self, model, batch_size=1, d_weight=0.5, nc=1, lr=0.1, b1=0.9, nz=100, use_bin=True):
    d_weight_r = sharedX(d_weight)
    x_c = T.tensor4()
    m_c = T.tensor4()
    x_e = T.tensor4()
    m_e = T.tensor4()
    z0 = T.matrix()
    z = sharedX(floatX(np_rng.uniform(-1., 1., size=(batch_size, nz))))
    gx = model.model_G(z)

    # input: im_c: 255: no edge; 0: edge; transform=> 1: no edge, 0: edge
    if nc == 1:  # gx, range [0, 1] => edge, 1
        gx3 = 1.0-gx  # T.tile(gx, (1, 3, 1, 1))
    else:
        gx3 = gx

    mm_c = T.tile(m_c, (1, gx3.shape[1], 1, 1))
    color_all = T.mean(T.sqr(gx3 - x_c) * mm_c, axis=(1, 2, 3)) / (T.mean(m_c, axis=(1, 2, 3)) + sharedX(1e-5))
    gx_edge = self.hog.get_hog(gx3)
    x_edge = self.hog.get_hog(x_e)
    mm_e = T.tile(m_e, (1, gx_edge.shape[1], 1, 1))
    sum_e = T.sum(T.abs_(mm_e))
    sum_x_edge = T.sum(T.abs_(x_edge))
    edge_all = T.mean(T.sqr(x_edge - gx_edge) * mm_e, axis=(1, 2, 3)) / (T.mean(m_e, axis=(1, 2, 3)) + sharedX(1e-5))
    rec_all = color_all + edge_all * sharedX(0.2)
    z_const = sharedX(5.0)
    init_all = T.mean(T.sqr(z0 - z)) * z_const

    if d_weight > 0:
        print('using D')
        p_gen = model.model_D(gx)
        real_all = T.nnet.binary_crossentropy(p_gen, T.ones(p_gen.shape)).T
        cost_all = rec_all + d_weight_r * real_all[0] + init_all
    else:
        print('without D')
        cost_all = rec_all + init_all
        real_all = T.zeros(cost_all.shape)

    cost = T.sum(cost_all)
    d_updater = updates.Adam(lr=sharedX(lr), b1=sharedX(b1))
    output = [gx, cost, cost_all, rec_all, real_all, init_all, sum_e, sum_x_edge]

    print('COMPILING...')
    t = time()

    z_updates = d_updater([z], cost)
    _invert = theano.function(inputs=[x_c, m_c, x_e, m_e, z0], outputs=output, updates=z_updates)
    print('%.2f seconds to compile _invert function' % (time() - t))
    return [_invert, z_updates, z, d_weight_r, z_const]
def flow(init_W, init_b, nData):
    import theano
    import theano.tensor as T

    n_layers = len(init_b)
    bias = []
    weights = []
    muStates = []
    for layer_i in xrange(n_layers):
        bias.append(theano.shared(value=init_b[layer_i], name='b'+str(layer_i), borrow=True))
        weights.append(theano.shared(value=init_W[layer_i], name='W'+str(layer_i), borrow=True))
        muStates.append(T.matrix('mu'+str(layer_i)))

    flows = 0.  # accumulate the flow contribution of each layer
    for layer_i in xrange(n_layers):
        diffe = T.tile(bias[layer_i].copy(), (nData, 1))
        # All layers except top
        if layer_i < (n_layers-1):
            W_h = weights[layer_i].dot(muStates[layer_i+1].T).T
            diffe += W_h
        if layer_i > 0:
            vT_W = muStates[layer_i-1].dot(weights[layer_i-1])
            diffe += vT_W
        exK = muStates[layer_i]*T.exp(.5*-diffe) + (1.-muStates[layer_i])*T.exp(.5*diffe)
        flows += exK.sum()
    return flows
def get_regs(self, states_0_, states, M):
    """
    Additional regularization terms.

    """
    regs = 0

    if self.L1_Wrec > 0:
        W = self.params['Wrec']
        regs += self.L1_Wrec * tensor.mean(abs(W))

    if self.L2_Wrec > 0:
        W = self.params['Wrec']
        regs += self.L2_Wrec * tensor.mean(tensor.sqr(W))

    #---------------------------------------------------------------------------------
    # Firing rates
    #---------------------------------------------------------------------------------

    if self.L2_r > 0:
        baseline = 0.

        M_ = (tensor.tile(M.T, (states.shape[-1], 1, 1))).T
        states_all = tensor.concatenate(
            [states_0_.reshape((1, states_0_.shape[0], states_0_.shape[1])), states],
            axis=0
            )
        r = self.f_hidden(states_all)
        regs += self.L2_r * tensor.sum(tensor.sqr(r - baseline)*M_)/tensor.sum(M_)

    #---------------------------------------------------------------------------------

    return regs
def _create_maximum_activation_update(output, record, streamindex, topn):
    """
    Calculates update of the topn maximums for one batch of outputs.
    """
    dims, maximums, indices, snapshot = record
    counters = tensor.tile(tensor.shape_padright(
        tensor.arange(output.shape[0]) + streamindex), (1, output.shape[1]))
    if len(dims) == 1:
        # output is a 2d tensor, (cases, units) -> activation
        tmax = output
        # counters is a 2d tensor broadcastable (cases, units) -> case_index
        tind = counters
    else:
        # output is a 4d tensor: fmax flattens it to 3d
        fmax = output.flatten(ndim=3)
        # fargmax is a 2d tensor containing rolled maximum locations
        fargmax = fmax.argmax(axis=2)
        # fetch the maximum. tmax is 2d, (cases, units) -> activation
        tmax = _apply_index(fmax, fargmax, axis=2)
        # targmax is a tuple that separates rolled-up location into (x, y)
        targmax = divmod(fargmax, dims[2])
        # tind is a 3d tensor (cases, units, 3) -> case_index, maxloc
        # this will match indices which is a 3d tensor also
        tind = tensor.stack((counters, ) + targmax, axis=2)
    cmax = tensor.concatenate((maximums, tmax), axis=0)
    cind = tensor.concatenate((indices, tind), axis=0)
    cargsort = (-cmax).argsort(axis=0)[:topn]
    newmax = _apply_perm(cmax, cargsort, axis=0)
    newind = _apply_perm(cind, cargsort, axis=0)
    updates = [(maximums, newmax), (indices, newind)]
    if snapshot:
        csnap = tensor.concatenate((snapshot, output), axis=0)
        newsnap = _apply_perm(csnap, cargsort, axis=0)
        updates.append((snapshot, newsnap))
    return updates
def nin(X, param):
    w1, w2, w3, b1, b2, b3 = param
    X = X.dimshuffle(0, 1, 'x', 2, 3)  # (n,32,1,r,c)
    w1 = w1.dimshuffle(0, 1, 2, 'x', 3, 4)  # (64,32,16,1,3,3)
    w2 = w2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    w3 = w3.dimshuffle(0, 1, 2, 'x', 'x')  # (64,2,32,1,1)
    b1 = b1.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,16,1,1)
    b2 = b2.dimshuffle(0, 1, 'x', 2, 'x', 'x')  # (64,32,1,1,1,1)
    b3 = b3.dimshuffle(0, 'x', 1, 'x', 'x')  # (64,1,2,1,1)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    indexi = T.repeat(indexi, w1.shape[1], axis=0)
    indexj = T.arange(w1.shape[1], dtype='int32')  # (0:64)
    indexj = T.tile(indexj, w1.shape[0])
    results, updates = scan(fn=metaOp1,
                            sequences=[indexi, indexj],
                            outputs_info=None,
                            non_sequences=[X, w1, w2, b1, b2],
                            strict=True)  # (64*32,n,1,r,c)
    metaShape1 = results.shape[-4], results.shape[-2], results.shape[-1]
    reshaped1 = results.reshape((w1.shape[0], w1.shape[1]) + metaShape1)  # (64,32,n,r,c)
    permuted1 = T.transpose(reshaped1, axes=(0, 2, 1, 3, 4))  # (64,n,32,r,c)
    indexi = T.arange(w1.shape[0], dtype='int32')  # (0:64)
    results, updates = scan(fn=metaOp2,
                            sequences=[indexi],
                            outputs_info=None,
                            non_sequences=[permuted1, w3, b3],
                            strict=True)  # (64,n,2,r,c)
    permuted2 = T.transpose(results, axes=(1, 0, 2, 3, 4))  # (n,64,2,r,c)
    metaShape2 = permuted2.shape[-2], permuted2.shape[-1]
    reshaped2 = permuted2.reshape((permuted2.shape[0], -1) + metaShape2)  # (n,128,r,c)
    return reshaped2
def build_model(tparams, options):
    """
    @function: build the model
    """
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # encoder
    x, ctx = build_encoder(tparams, options, trng, use_noise, x_mask, sampling=False)
    n_samples = x.shape[1]
    n_timesteps_trg = y.shape[0]

    if options['use_dropout']:
        retain_probability_emb = 1-options['dropout_embedding']
        retain_probability_hidden = 1-options['dropout_hidden']
        retain_probability_target = 1-options['dropout_target']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout_d = shared_dropout_layer((5, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
        emb_dropout_d = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
        ctx_dropout_d = shared_dropout_layer((4, n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)
        target_dropout = shared_dropout_layer((n_timesteps_trg, n_samples, 1), use_noise, trng, retain_probability_target, scaled)
        target_dropout = tensor.tile(target_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout_d = theano.shared(numpy.array([1.]*5, dtype='float32'))
        emb_dropout_d = theano.shared(numpy.array([1.]*2, dtype='float32'))
        ctx_dropout_d = theano.shared(numpy.array([1.]*4, dtype='float32'))

    # mean of the context (across time) will be used to intialize decoder rnn
    ctx_mean = (ctx*x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward+backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1],projr[0][-1]],axis=proj[0].ndim-2)

    if options['use_dropout']:
        ctx_mean *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)

    # initial decoder state
    init_state = fflayer(tparams, ctx_mean, options,
                         prefix='ff_state', activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    if options['use_dropout']:
        emb *= target_dropout

    # decoder - pass through the decoder conditional gru with attention
    proj = gru_cond_layer(tparams, emb, options,
                          prefix='decoder',
                          mask=y_mask, context=ctx,
                          context_mask=x_mask,
                          one_step=False,
                          init_state=init_state,
                          emb_dropout=emb_dropout_d,
                          ctx_dropout=ctx_dropout_d,
                          rec_dropout=rec_dropout_d,
                          profile=profile)

    # hidden states of the decoder gru
    proj_h = proj[0]
    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    if options['use_dropout']:
        proj_h *= shared_dropout_layer((n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
        emb *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
        ctxs *= shared_dropout_layer((n_samples, 2*options['dim']), use_noise, trng, retain_probability_hidden, scaled)

    # weights (alignment matrix) #####LIUCAN: this is where the attention vector is.
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = fflayer(tparams, proj_h, options,
                         prefix='ff_logit_lstm', activ='linear')
    logit_prev = fflayer(tparams, emb, options,
                         prefix='ff_logit_prev', activ='linear')
    logit_ctx = fflayer(tparams, ctxs, options,
                        prefix='ff_logit_ctx', activ='linear')
    logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)

    if options['use_dropout']:
        logit *= shared_dropout_layer((n_samples, options['dim_word']), use_noise, trng, retain_probability_hidden, scaled)

    # generate t_j, used to obtain the quality vector
    tt = logit

    logit = fflayer(tparams, logit, options,
                    prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words_tgt'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, ctx, tt
def build_encoder(tparams, options, trng, use_noise, x_mask=None, sampling=False):

    x = tensor.matrix('x', dtype='int64')
    x.tag.test_value = (numpy.random.rand(5, 10)*100).astype('int64')

    # for the backward rnn, we just need to invert x
    xr = x[::-1]
    # differs here from: xr = x[:, ::-1]
    if x_mask is None:  # at test time
        xr_mask = None
    else:
        xr_mask = x_mask[::-1]

    # number of timesteps and number of samples
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1-options['dropout_embedding']
        retain_probability_hidden = 1-options['dropout_hidden']
        retain_probability_source = 1-options['dropout_source']
        if sampling:
            if options['model_version'] < 0.1:
                rec_dropout = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
                rec_dropout_r = theano.shared(numpy.array([retain_probability_hidden]*2, dtype='float32'))
                emb_dropout = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
                emb_dropout_r = theano.shared(numpy.array([retain_probability_emb]*2, dtype='float32'))
                source_dropout = theano.shared(numpy.float32(retain_probability_source))
            else:
                rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
                rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
                emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
                emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
                source_dropout = theano.shared(numpy.float32(1.))
        else:
            if options['model_version'] < 0.1:
                scaled = False
            else:
                scaled = True
            rec_dropout = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
            rec_dropout_r = shared_dropout_layer((2, n_samples, options['dim']), use_noise, trng, retain_probability_hidden, scaled)
            emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
            emb_dropout_r = shared_dropout_layer((2, n_samples, options['dim_word']), use_noise, trng, retain_probability_emb, scaled)
            source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1), use_noise, trng, retain_probability_source, scaled)
            source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
        rec_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.]*2, dtype='float32'))
        emb_dropout_r = theano.shared(numpy.array([1.]*2, dtype='float32'))

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]  # differs here
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb *= source_dropout

    proj = gru_layer(tparams, emb, options,
                     prefix='encoder',
                     mask=x_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=profile)

    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    if options['use_dropout']:
        if sampling:
            embr *= source_dropout
        else:
            embr *= source_dropout[::-1]

    projr = gru_layer(tparams, embr, options,
                      prefix='encoder_r',
                      mask=xr_mask,
                      emb_dropout=emb_dropout_r,
                      rec_dropout=rec_dropout,
                      profile=profile)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)

    return x, ctx
def _setup_functions(self):

    # Actual parameter lengths.
    #sh_w_n = (self.n_state + self.n_actions + 1, self.n_state + 1, self.n_state)
    #print("sh_w_n", sh_w_n)
    sh_w_n = (self.n_actions + 1, self.n_state + 1, self.n_state)
    print("sh_w_n", sh_w_n)
    sh_w_t = (self.n_tex + 1, self.n_state + 1, self.n_ray)
    print("sh_w_t", sh_w_t)
    sh_l1 = (self.n_ray + self.n_key, self.n_interaction)
    print("sh_l1", sh_l1)
    sh_l2 = (self.n_interaction, 1)
    print("sh_l2", sh_l2)

    # Memory cells.
    sh_mk = (self.n_scene, self.n_key)
    sh_mc = (self.n_scene, 4)
    print("sh_mk", sh_mk)
    print("sh_mc", sh_mc)

    if not hasattr(self, "params"):
        print('generating weights')
        # (A+1)x(S+1)xS
        wn = uniform(sh_w_n, scale=0.2)
        # (P+1)x(S+1)xR
        wt = uniform(sh_w_t, scale=0.2)
        # (R+K)xH
        wl1 = uniform(sh_l1, scale=0.2)
        # H
        wb1 = shared0s((self.n_interaction, ))
        # Hx1
        wl2 = uniform(sh_l2, scale=0.2)
        # MxK
        wmk = uniform(sh_mk, scale=0.2)
        # MxC
        wmc = uniform(sh_mc, scale=0.2)
        self.params = [wn, wt, wl1, wb1, wl2, wmk, wmc]
    else:
        wn, wt, wl1, wb1, wl2, wmk, wmc = self.params

    #TxNxA
    A = sharedX(np.zeros((2, 2, 2)), name="A")
    #TxNxP
    P = sharedX(np.zeros((2, 2, 2)), name="P")
    #TxNxC
    y = sharedX(np.zeros((2, 2, 2)), name="y")

    self.inputs = {"A": A, "P": P, "y": y}

    # Inputs: NxS, NxA
    def state_transform(a_, s_):
        # Nx(S+1)xS
        temp_ = T.tensordot(T.concatenate(
            [a_, T.ones((s_.shape[0], 1))], axis=1), wn, axes=[1, 0])
        # NxS
        return T.sum(
            temp_ * T.concatenate([s_, T.ones(
                (s_.shape[0], 1))], axis=1).dimshuffle([0, 1, 'x']),
            axis=1)
        #return s_

    # TxNxS
    S, _ = theano.scan(fn=state_transform,
                       outputs_info=[T.zeros([A.shape[1], self.n_state])],
                       sequences=[A])

    # TxNx(S+1)xR
    temp_ = T.tensordot(T.concatenate(
        [P, T.ones([S.shape[0], S.shape[1], 1])], axis=2), wt, axes=[2, 0])

    # TxNxR Ray Elements.
    R = T.sum(temp_ * T.concatenate([S, T.ones((S.shape[0], S.shape[1], 1))],
                                    axis=2).dimshuffle([0, 1, 2, 'x']),
              axis=2)

    # TxNxMx(R+K) Transformation input.
    R_2 = T.concatenate([
        T.tile(R.dimshuffle([0, 1, 'x', 2]), [1, 1, self.n_scene, 1]),
        T.tile(wmk.dimshuffle(['x', 'x', 0, 1]), [R.shape[0], R.shape[1], 1, 1])
    ], axis=3)

    # TxNxMxH
    L1 = sigmoid(
        T.tensordot(R_2, wl1, axes=[3, 0]) + wb1.dimshuffle(['x', 'x', 'x', 0]))

    # TxNxM Soft attention weights.
    Att_temp = T.exp(T.tensordot(L1, wl2, axes=[3, 0]).sum(axis=3))
    Att = Att_temp / (T.sum(Att_temp, axis=2, keepdims=True) + 0.01)
    #Att = sigmoid( T.tensordot(L1, wl2, axes=[3,0]).sum( axis=3 ) )

    # TxNxC final colors.
    Col = T.tensordot(Att, wmc, axes=[2, 0])

    rec_cost = T.sum(T.sqr(Col - y))  # / T.cast(X.shape[0], 'float32')

    cost = rec_cost

    print('getting updates')
    #updates = Adam([wt,wn,wmk,wl1,wb1,wl2,wmc], cost)
    updates = Adam(self.params, cost)

    print('compiling')
    self._fit_function = theano.function([], cost, updates=updates)
    theano.printing.debugprint(self._fit_function)
    #self._predict = theano.function([A, P], Col, allow_input_downcast=True)
    self._predict = theano.function([], Col, allow_input_downcast=True)
    #self._next_state = theano.function([A], S, allow_input_downcast=True)
    self._next_state = theano.function([], S, allow_input_downcast=True)
    #self._predict_attn = theano.function([A, P], Att, allow_input_downcast=True)
    self._predict_attn = theano.function([], Att, allow_input_downcast=True)

    # Output just the cost to check with a test set.
    #self._cost = theano.function([A,P,y], cost, allow_input_downcast=True)
    self._cost = theano.function([], cost, allow_input_downcast=True)
def LSTM(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'):
    np.random.seed(1234)
    rng = np.random.RandomState(1234)

    W_i = initialize_matrix(n_input, n_hidden, 'W_i', rng)
    W_f = initialize_matrix(n_input, n_hidden, 'W_f', rng)
    W_c = initialize_matrix(n_input, n_hidden, 'W_c', rng)
    W_o = initialize_matrix(n_input, n_hidden, 'W_o', rng)
    U_i = initialize_matrix(n_hidden, n_hidden, 'U_i', rng)
    U_f = initialize_matrix(n_hidden, n_hidden, 'U_f', rng)
    U_c = initialize_matrix(n_hidden, n_hidden, 'U_c', rng)
    U_o = initialize_matrix(n_hidden, n_hidden, 'U_o', rng)
    V_o = initialize_matrix(n_hidden, n_hidden, 'V_o', rng)
    b_i = theano.shared(np.zeros((n_hidden,), dtype=theano.config.floatX))
    b_f = theano.shared(np.ones((n_hidden,), dtype=theano.config.floatX))
    b_c = theano.shared(np.zeros((n_hidden,), dtype=theano.config.floatX))
    b_o = theano.shared(np.zeros((n_hidden,), dtype=theano.config.floatX))
    h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    state_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng)
    out_bias = theano.shared(np.zeros((n_output,), dtype=theano.config.floatX))
    parameters = [W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                  b_i, b_f, b_c, b_o, h_0, state_0, out_mat, out_bias]

    x, y = initialize_data_nodes(loss_function, input_type, out_every_t)

    def recurrence(x_t, y_t, h_prev, state_prev, cost_prev, acc_prev,
                   W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                   b_i, b_f, b_c, b_o, out_mat, out_bias):

        if loss_function == 'CE':
            x_t_W_i = W_i[x_t]
            x_t_W_c = W_c[x_t]
            x_t_W_f = W_f[x_t]
            x_t_W_o = W_o[x_t]
        else:
            x_t_W_i = T.dot(x_t, W_i)
            x_t_W_c = T.dot(x_t, W_c)
            x_t_W_f = T.dot(x_t, W_f)
            x_t_W_o = T.dot(x_t, W_o)

        input_t = T.nnet.sigmoid(x_t_W_i + T.dot(h_prev, U_i) + b_i.dimshuffle('x', 0))
        candidate_t = T.tanh(x_t_W_c + T.dot(h_prev, U_c) + b_c.dimshuffle('x', 0))
        forget_t = T.nnet.sigmoid(x_t_W_f + T.dot(h_prev, U_f) + b_f.dimshuffle('x', 0))

        state_t = input_t * candidate_t + forget_t * state_prev

        output_t = T.nnet.sigmoid(x_t_W_o + T.dot(h_prev, U_o) + T.dot(state_t, V_o) + b_o.dimshuffle('x', 0))

        h_t = output_t * T.tanh(state_t)

        if out_every_t:
            lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0)
            cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t)
        else:
            cost_t = theano.shared(np.float32(0.0))
            acc_t = theano.shared(np.float32(0.0))

        return h_t, state_t, cost_t, acc_t

    non_sequences = [W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                     b_i, b_f, b_c, b_o, out_mat, out_bias]

    h_0_batch = T.tile(h_0, [x.shape[1], 1])
    state_0_batch = T.tile(state_0, [x.shape[1], 1])

    if out_every_t:
        sequences = [x, y]
    else:
        sequences = [x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1])]

    outputs_info = [h_0_batch, state_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0))]

    [hidden_states, states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence,
                                                                          sequences=sequences,
                                                                          non_sequences=non_sequences,
                                                                          outputs_info=outputs_info)

    if not out_every_t:
        lin_output = T.dot(hidden_states[-1, :, :], out_mat) + out_bias.dimshuffle('x', 0)
        costs = compute_cost_t(lin_output, loss_function, y)
    else:
        cost = cost_steps.mean()
        accuracy = acc_steps.mean()
        costs = [cost, accuracy]

    return [x, y], parameters, costs
def train_auto(fun, train, transform, testdir, outdir, num_epochs_mse=30, num_epochs_ILD=10, model="1.pkl",
               scale_factor=0.3, load=False, skip_train_mse=False, skip_train_ILD=False, skip_sep=False,
               chunk_size=60, chunk_overlap=2, nsamples=40, batch_size=32, batch_memory=50, time_context=30,
               overlap=25, nprocs=4, mult_factor_in=0.3, mult_factor_out=0.3, mix_type='mixture'):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features_DSD100.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor

    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """
    logging.info("Building Autoencoder")
    input_var = T.tensor4('inputs')
    input_mask = T.tensor4('input_mask')
    target_var = T.tensor4('targets')

    theano_rng = RandomStreams(128)

    eps = 1e-12
    sources = ['vocals', 'bass', 'drums', 'other']

    nchannels = int(train.channels_in)
    nsources = int(train.channels_out / train.channels_in)

    print 'nchannels: ', nchannels
    print 'nsources: ', nsources

    input_size = int(float(transform.frameSize) / 2 + 1)

    rand_num = theano_rng.normal(size=(batch_size, nsources, time_context, input_size),
                                 avg=0.0, std=0.1, dtype=theano.config.floatX)

    net = fun(input_var=input_var, batch_size=batch_size, time_context=time_context,
              feat_size=input_size, nchannels=nchannels, nsources=nsources)
    network = net['l_out']
    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network, params)

    prediction = lasagne.layers.get_output(network, deterministic=True)

    sourceall = []
    errors_insts = []
    loss = 0

    sep_chann = []

    # prediction example for 2 sources in 2 channels:
    # 0, 1 source 0 in channel 0 and 1
    # 2, 3 source 1 in channel 0 and 1
    for j in range(nchannels):
        masksum = T.sum(prediction[:, j::nchannels, :, :], axis=1)
        temp = T.tile(masksum.dimshuffle(0, 'x', 1, 2), (1, nsources, 1, 1))
        mask = prediction[:, j::nchannels, :, :] / (temp + eps * rand_num)
        source = mask * T.tile(input_var[:, j:j + 1, :, :], (1, nsources, 1, 1)) + eps * rand_num
        sourceall.append(source)

        sep_chann.append(source)
        train_loss_recon = lasagne.objectives.squared_error(source, target_var[:, j::nchannels, :, :])

        errors_inst = abs(train_loss_recon.sum(axis=(0, 2, 3)))
        errors_insts.append(errors_inst)

        loss = loss + abs(train_loss_recon.sum())

    params1 = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn_mse = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    train_fn1 = theano.function([input_var, target_var], errors_insts, allow_input_downcast=True)

    #----------NEW ILD LOSS CONDITION----------
    rand_num2 = theano_rng.normal(size=(batch_size, nsources, time_context, input_size),
                                  avg=0.0, std=0.1, dtype=theano.config.floatX)
    # nsources in the first dim?

    # estimate
    interaural_spec_est = sep_chann[0] / (sep_chann[1] + eps * rand_num2)

    alpha_est = 20 * np.log10(abs(interaural_spec_est + eps * rand_num2))
    alpha_est_mean = alpha_est.mean(axis=(0, 1, 2))

    # groundtruth
    interaural_spec_gt = target_var[:, 0::nchannels, :, :] / (target_var[:, 1::nchannels, :, :] + eps * rand_num2)

    alpha_gt = 20 * np.log10(abs(interaural_spec_gt + eps * rand_num2))
    alpha_gt_mean = alpha_gt.mean(axis=(0, 1, 2))

    train_loss_ild = lasagne.objectives.squared_error(alpha_est_mean, alpha_gt_mean)

    loss = loss + (abs(train_loss_ild.sum()) / 500)
    #------------------------------------------

    predict_function = theano.function([input_var], sourceall, allow_input_downcast=True)

    losser = []

    if not skip_train_mse:
        logging.info("1st MSE training stage...")
        for epoch in range(num_epochs_mse):

            train_err = 0
            train_batches = 0
            errs = np.zeros((nchannels, nsources))
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_mse(inputs, target)
                errs += np.array(train_fn1(inputs, target))
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs_mse, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err / train_batches))
            for j in range(nchannels):
                for i in range(nsources):
                    logging.info("  training loss for " + sources[i] + " in mic " + str(j) +
                                 ":\t\t{:.6f}".format(errs[j][i] / train_batches))

            model_noILD = model[:-4] + '_noILD' + model[-4:]
            print 'model_noILD: ', model_noILD
            save_model(model_noILD, network)
            losser.append(train_err / train_batches)

    # NEW ILD TRAINING---------------------------------------------------------
    if not skip_train_ILD:
        if not skip_train_mse:
            params = load_model(model_noILD)
            lasagne.layers.set_all_param_values(network, params)
        params1 = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.adadelta(loss, params1)
        train_fn_ILD = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

        logging.info("ILD training stage...")

        for epoch in range(num_epochs_ILD):

            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_ILD(inputs, target)
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs_ILD, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err / train_batches))

            model_ILD = model[:-4] + '_ILD' + model[-4:]
            print 'model_ILD: ', model_ILD
            save_model(model_ILD, network)
            losser.append(train_err / train_batches)

    if not skip_train_mse:
        logging.info("2nd MSE training stage...")
        params = load_model(model_ILD)
        lasagne.layers.set_all_param_values(network, params)
        params1 = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.adadelta(loss, params1)

        for epoch in range(num_epochs_mse):
            train_err = 0
            train_batches = 0
            errs = np.zeros((nchannels, nsources))
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_mse(inputs, target)
                errs += np.array(train_fn1(inputs, target))
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs_mse, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err / train_batches))
            for j in range(nchannels):
                for i in range(nsources):
                    logging.info("  training loss for " + sources[i] + " in mic " + str(j) +
                                 ":\t\t{:.6f}".format(errs[j][i] / train_batches))

            model_ILD_extra_mse = model[:-4] + '_ILD_extra_mse' + model[-4:]
            print 'model_ILD_extra_mse: ', model_ILD_extra_mse
            save_model(model_ILD_extra_mse, network)
            losser.append(train_err / train_batches)

    if not skip_sep:

        logging.info("Separating")
        subsets = ['Dev', 'Test']
        for sub in subsets:
            for d in sorted(os.listdir(os.path.join(db, 'Mixtures', sub))):
                if not d.startswith('.'):
                    print os.path.join(db, 'Mixtures', sub, d, mix_type + '.wav')
                    audio, sampleRate, bitrate = util.readAudioScipy(
                        os.path.join(db, 'Mixtures', sub, d, mix_type + '.wav'))
                    nsamples = audio.shape[0]
                    sep_audio = np.zeros((nsamples, len(sources), audio.shape[1]))

                    mag, ph = transform.compute_transform(audio, phase=True)
                    mag = scale_factor * mag.astype(np.float32)
                    nframes = mag.shape[-2]

                    batches_mag, nchunks = util.generate_overlapadd(
                        mag, input_size=mag.shape[-1], time_context=train.time_context,
                        overlap=train.overlap, batch_size=train.batch_size, sampleRate=sampleRate)
                    mag = None

                    output = []
                    for b in range(len(batches_mag)):
                        output.append(predict_function(batches_mag[b]))
                    output = np.array(output)

                    for j in range(audio.shape[1]):
                        mm = util.overlapadd_multi(np.swapaxes(output[:, j:j + 1, :, :, :, :], 1, 3),
                                                   batches_mag, nchunks, overlap=train.overlap)
                        for i in range(len(sources)):
                            audio_out = transform.compute_inverse(mm[i, :ph.shape[1], :] / scale_factor, ph[j])
                            sep_audio[:, i, j] = audio_out[:len(sep_audio)]

                    print 'Saving separation: ', outdir
                    if not os.path.exists(os.path.join(outdir)):
                        os.makedirs(os.path.join(outdir))
                        print 'Creating model folder'
                    if not os.path.exists(os.path.join(outdir, 'Sources')):
                        os.makedirs(os.path.join(outdir, 'Sources'))
                        print 'Creating Sources folder: ', os.path.join(outdir, 'Sources')
                    if not os.path.exists(os.path.join(outdir, 'Sources', sub)):
                        os.makedirs(os.path.join(outdir, 'Sources', sub))
                        print 'Creating subset folder'
                    if not os.path.exists(os.path.join(outdir, 'Sources', sub, d)):
                        os.makedirs(os.path.join(outdir, 'Sources', sub, d))
                        print 'Creating song folder', os.path.join(outdir, 'Sources', sub, d)

                    for i in range(len(sources)):
                        print 'Final audio file: ', i, os.path.join(outdir, 'Sources', sub, d, sources[i] + '.wav'), \
                            'nsamples: ', nsamples, 'len sep_audio :', len(sep_audio)
                        util.writeAudioScipy(os.path.join(outdir, 'Sources', sub, d, sources[i] + '.wav'),
                                             sep_audio[:nsamples, i, :], sampleRate, bitrate)

    return losser
def hmetad_rm1way(data: dict, sample_model: bool = True, **kwargs: int): """Compute hierachical meta-d' at the subject level. This is an internal function. The repeated measures model must be called using :py:func:`metadPy.hierarchical.hmetad`. Parameters ---------- data : dict Response data. sample_model : boolean If `False`, only the model is returned without sampling. **kwargs : keyword arguments All keyword arguments are passed to `func::pymc3.sampling.sample`. Returns ------- model : :py:class:`pymc3.Model` instance The pymc3 model. Encapsulates the variables and likelihood factors. trace : :py:class:`pymc3.backends.base.MultiTrace` or :py:class:`arviz.InferenceData` A `MultiTrace` or `ArviZ InferenceData` object that contains the samples. References ---------- .. [#] Fleming, S.M. (2017) HMeta-d: hierarchical Bayesian estimation of metacognitive efficiency from confidence ratings, Neuroscience of Consciousness, 3(1) nix007, https://doi.org/10.1093/nc/nix007 """ nSubj = data["nSubj"] nCond = data["nCond"] nRatings = data["nRatings"] hits = data["hits"].reshape(nSubj, 2) falsealarms = data["falsealarms"].reshape(nSubj, 2) counts = data["counts"] Tol = data["Tol"] cr = data["cr"].reshape(nSubj, 2) m = data["m"].reshape(nSubj, 2) c1 = data["c1"].reshape(nSubj, 2, 1) d1 = data["d1"].reshape(nSubj, 2, 1) with Model() as model: ############# # Hyperpriors ############# mu_c2 = Normal("mu_c2", tau=0.01, shape=(1, ), testval=np.random.rand() * 0.1) sigma_c2 = HalfNormal("sigma_c2", tau=0.01, shape=(1, ), testval=np.random.rand() * 0.1) mu_D = Normal("mu_D", tau=0.001, shape=(1), testval=np.random.rand() * 0.1) sigma_D = HalfNormal("sigma_D", tau=0.1, shape=(1), testval=np.random.rand() * 0.1) mu_Cond1 = Normal("mu_Cond1", mu=0, tau=0.001, shape=(1), testval=np.random.rand() * 0.1) sigma_Cond1 = HalfNormal("sigma_Cond1", tau=0.1, shape=(1), testval=np.random.rand() * 0.1) ############################# # Hyperpriors - Subject level ############################# dbase_tilde = Normal( "dbase_tilde", mu=0, sigma=1, shape=(nSubj, 1, 1), ) dbase = Deterministic("dbase", mu_D + sigma_D * dbase_tilde) Bd_Cond1_tilde = Normal( "Bd_Cond1_tilde", mu=0, sigma=1, shape=(nSubj, 1, 1), ) Bd_Cond1 = Deterministic( "Bd_Cond1", mu_Cond1 + sigma_Cond1 * Bd_Cond1_tilde, ) lambda_logMratio = Gamma( "lambda_logMratio", alpha=0.001, beta=0.001, shape=(nSubj, 1, 1), ) sigma_logMratio = Deterministic("sigma_logMratio", 1 / math.sqrt(lambda_logMratio)) ############################### # Hypterprior - Condition level ############################### mu_regression = [dbase + (Bd_Cond1 * c) for c in range(nCond)] log_mRatio_tilde = Normal("log_mRatio_tilde", mu=0, sigma=1, shape=(nSubj, 1, 1)) log_mRatio = Deterministic( "log_mRatio", tt.stack(mu_regression, axis=1)[:, :, :, 0] + tt.tile(log_mRatio_tilde, (1, 2, 1)) * tt.tile(sigma_logMratio, (1, 2, 1)), ) mRatio = Deterministic("mRatio", tt.exp(log_mRatio)) # Means of SDT distributions metad = Deterministic("metad", mRatio * d1) S2mu = Deterministic("S2mu", metad / 2) S1mu = Deterministic("S1mu", -metad / 2) # TYPE 2 SDT MODEL (META-D) # Multinomial likelihood for response counts # Specify ordered prior on criteria # bounded above and below by Type 1 c cS1_hn = Normal( "cS1_hn", mu=0, sigma=1, shape=(nSubj, nCond, nRatings - 1), testval=np.linspace(-1.5, -0.5, nRatings - 1).reshape( 1, 1, nRatings - 1).repeat(nSubj, axis=0).repeat(nCond, axis=1), ) cS1 = Deterministic("cS1", -mu_c2 + (cS1_hn * sigma_c2)) cS2_hn = Normal( "cS2_hn", mu=0, sigma=1, shape=(nSubj, nCond, 
nRatings - 1), testval=np.linspace(0.5, 1.5, nRatings - 1).reshape( 1, 1, nRatings - 1).repeat(nSubj, axis=0).repeat(nCond, axis=1), ) cS2 = Deterministic("cS2", mu_c2 + (cS2_hn * sigma_c2)) # Calculate normalisation constants C_area_rS1 = cumulative_normal(c1 - S1mu) I_area_rS1 = cumulative_normal(c1 - S2mu) C_area_rS2 = 1 - cumulative_normal(c1 - S2mu) I_area_rS2 = 1 - cumulative_normal(c1 - S1mu) # Get nC_rS1 probs nC_rS1 = cumulative_normal(cS1 - S1mu) / C_area_rS1 nC_rS1 = Deterministic( "nC_rS1", math.concatenate( ([ cumulative_normal(cS1[:, :, 0].reshape((nSubj, 2, 1)) - S1mu) / C_area_rS1, nC_rS1[:, :, 1:] - nC_rS1[:, :, :-1], ((cumulative_normal(c1 - S1mu) - cumulative_normal(cS1[:, :, (nRatings - 2)].reshape( (nSubj, 2, 1)) - S1mu)) / C_area_rS1), ]), axis=2, ), ) # Get nI_rS2 probs nI_rS2 = (1 - cumulative_normal(cS2 - S1mu)) / I_area_rS2 nI_rS2 = Deterministic( "nI_rS2", math.concatenate( ([ ((1 - cumulative_normal(c1 - S1mu)) - (1 - cumulative_normal(cS2[:, :, 0].reshape( (nSubj, nCond, 1)) - S1mu))) / I_area_rS2, nI_rS2[:, :, :-1] - (1 - cumulative_normal(cS2[:, :, 1:] - S1mu)) / I_area_rS2, (1 - cumulative_normal(cS2[:, :, nRatings - 2].reshape( (nSubj, nCond, 1)) - S1mu)) / I_area_rS2, ]), axis=2, ), ) # Get nI_rS1 probs nI_rS1 = (-cumulative_normal(cS1 - S2mu)) / I_area_rS1 nI_rS1 = Deterministic( "nI_rS1", math.concatenate( ([ cumulative_normal(cS1[:, :, 0].reshape((nSubj, nCond, 1)) - S2mu) / I_area_rS1, nI_rS1[:, :, :-1] + (cumulative_normal(cS1[:, :, 1:] - S2mu)) / I_area_rS1, (cumulative_normal(c1 - S2mu) - cumulative_normal(cS1[:, :, nRatings - 2].reshape( (nSubj, nCond, 1)) - S2mu)) / I_area_rS1, ]), axis=2, ), ) # Get nC_rS2 probs nC_rS2 = (1 - cumulative_normal(cS2 - S2mu)) / C_area_rS2 nC_rS2 = Deterministic( "nC_rS2", math.concatenate( ([ ((1 - cumulative_normal(c1 - S2mu)) - (1 - cumulative_normal(cS2[:, :, 0].reshape( (nSubj, nCond, 1)) - S2mu))) / C_area_rS2, nC_rS2[:, :, :-1] - ((1 - cumulative_normal(cS2[:, :, 1:] - S2mu)) / C_area_rS2), (1 - cumulative_normal(cS2[:, :, nRatings - 2].reshape( (nSubj, nCond, 1)) - S2mu)) / C_area_rS2, ]), axis=2, ), ) # Avoid underflow of probabilities nC_rS1 = math.switch(nC_rS1 < Tol, Tol, nC_rS1) nI_rS2 = math.switch(nI_rS2 < Tol, Tol, nI_rS2) nI_rS1 = math.switch(nI_rS1 < Tol, Tol, nI_rS1) nC_rS2 = math.switch(nC_rS2 < Tol, Tol, nC_rS2) for c in range(nCond): Multinomial( f"CR_counts_{c}", n=cr[:, c], p=nC_rS1[:, c, :], observed=counts[:, c, :nRatings], shape=(nSubj, nRatings), ) Multinomial( f"H_counts_{c}", n=hits[:, c], p=nC_rS2[:, c, :], observed=counts[:, c, nRatings * 3:nRatings * 4], shape=(nSubj, nRatings), ) Multinomial( f"FA_counts_{c}", n=falsealarms[:, c], p=nI_rS2[:, c, :], observed=counts[:, c, nRatings:nRatings * 2], shape=(nSubj, nRatings), ) Multinomial( f"M_counts_{c}", n=m[:, c], p=nI_rS1[:, c, :], observed=counts[:, c, nRatings * 2:nRatings * 3], shape=(nSubj, nRatings), ) if sample_model is True: trace = sample(return_inferencedata=True, **kwargs) return model, trace else: return model
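# Minimal sketch of the tile-based broadcasting used in the model above: a
# per-subject draw of shape (nSubj, 1, 1) is replicated over the condition axis with
# tt.tile before entering a Deterministic node. Toy shapes only, not the full
# meta-d' model.
import theano.tensor as tt
import pymc3 as pm

nSubj_toy, nCond_toy = 4, 2
with pm.Model() as toy_model:
    subj_effect = pm.Normal("subj_effect", mu=0.0, sigma=1.0,
                            shape=(nSubj_toy, 1, 1))
    per_cond = pm.Deterministic("per_cond",
                                tt.tile(subj_effect, (1, nCond_toy, 1)))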
def IRNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'): np.random.seed(1234) rng = np.random.RandomState(1234) x, y = initialize_data_nodes(loss_function, input_type, out_every_t) inputs = [x, y] h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX)) V = initialize_matrix(n_input, n_hidden, 'V', rng) W = theano.shared(np.identity(n_hidden, dtype=theano.config.floatX)) out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng) hidden_bias = theano.shared( np.zeros((n_hidden, ), dtype=theano.config.floatX)) out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX)) parameters = [h_0, V, W, out_mat, hidden_bias, out_bias] def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, V, W, hidden_bias, out_mat, out_bias): if loss_function == 'CE': data_lin_output = V[x_t] else: data_lin_output = T.dot(x_t, V) h_t = T.nnet.relu( T.dot(h_prev, W) + data_lin_output + hidden_bias.dimshuffle('x', 0)) if out_every_t: lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: cost_t = theano.shared(np.float32(0.0)) acc_t = theano.shared(np.float32(0.0)) return h_t, cost_t, acc_t non_sequences = [V, W, hidden_bias, out_mat, out_bias] h_0_batch = T.tile(h_0, [x.shape[1], 1]) if out_every_t: sequences = [x, y] else: sequences = [ x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1]) ] outputs_info = [ h_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0)) ] [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) if not out_every_t: lin_output = T.dot(hidden_states[-1, :, :], out_mat) + out_bias.dimshuffle('x', 0) costs = compute_cost_t(lin_output, loss_function, y) else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return inputs, parameters, costs
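# Sketch of the initial-state broadcasting used by IRNN (and the uRNN variants
# below): a single learned h_0 of shape (1, n_hidden) is tiled to one row per
# sequence in the batch before being handed to theano.scan as outputs_info.
import numpy as np
import theano
import theano.tensor as T

n_hidden_toy = 8
h_0_toy = theano.shared(np.zeros((1, n_hidden_toy), dtype=theano.config.floatX))
x_toy = T.tensor3('x_toy')                            # (time, batch, features)
h_0_batch_toy = T.tile(h_0_toy, [x_toy.shape[1], 1])  # (batch, n_hidden)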
def UKRNN(n_input, n_hidden, partition, n_output, input_type='real', out_every_t=False, loss_function='CE'): np.random.seed(1234) rng = np.random.RandomState(1234) # Initialize parameters: theta, V_re, V_im, hidden_bias, U, out_bias, h_0 V = initialize_matrix(n_input, 2*n_hidden, 'V', rng) U = initialize_matrix(2 * n_hidden, n_output, 'U', rng) hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01, high=0.01, size=(n_hidden,)), dtype=theano.config.floatX), name='hidden_bias') kron_manifold = UnitaryKron(partition) MANIFOLD_NAMES = [manifold.str_id for manifold in kron_manifold._manifolds] UK = [theano.shared(value=manifold.rand_np(), name=manifold.str_id) for manifold in kron_manifold._manifolds] manifolds = {manifold.str_id: manifold for manifold in kron_manifold._manifolds} out_bias = theano.shared(np.zeros((n_output,), dtype=theano.config.floatX), name='out_bias') bucket = np.sqrt(3. / 2 / n_hidden) h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket, high=bucket, size=(1, 2 * n_hidden)), dtype=theano.config.floatX), name='h_0') parameters = [V, U, hidden_bias] + UK + [out_bias, h_0] x, y = initialize_data_nodes(loss_function, input_type, out_every_t) swap_re_im = np.concatenate((np.arange(n_hidden, 2*n_hidden), np.arange(n_hidden))) # define the recurrence used by theano.scan def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, V, hidden_bias, out_bias, U, *UK): #unitary_step = unitary_transform(h_prev, n_hidden, unitary_matrix) unitary_step = unitary_kron_transform(h_prev, n_hidden, UK) hidden_lin_output = unitary_step # Compute data linear transform if loss_function == 'CE': data_lin_output = V[T.cast(x_t, INT_STR)] else: data_lin_output = T.dot(x_t, V) # Total linear output lin_output = hidden_lin_output + data_lin_output # Apply non-linearity ---------------------------- # scale RELU nonlinearity modulus = T.sqrt(lin_output**2 + lin_output[:, swap_re_im]**2) rescale = T.maximum(modulus + T.tile(hidden_bias, [2]).dimshuffle('x', 0), 0.) / (modulus + 1e-5) h_t = lin_output * rescale if out_every_t: lin_output = T.dot(h_t, U) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: cost_t = theano.shared(NP_FLOAT(0.0)) acc_t = theano.shared(NP_FLOAT(0.0)) return h_t, cost_t, acc_t # compute hidden states h_0_batch = T.tile(h_0, [x.shape[1], 1]) non_sequences = [V, hidden_bias, out_bias, U] + UK if out_every_t: sequences = [x, y] else: sequences = [x, T.tile(theano.shared(np.zeros((1,1), dtype=theano.config.floatX)), [x.shape[0], 1, 1])] outputs_info=[h_0_batch, theano.shared(NP_FLOAT(0.0)), theano.shared(NP_FLOAT(0.0))] [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) if not out_every_t: lin_output = T.dot(hidden_states[-1,:,:], U) + out_bias.dimshuffle('x', 0) costs = compute_cost_t(lin_output, loss_function, y) else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return [x, y], parameters, costs, manifolds
def __init__(self, layer_input, mask, shape, is_predicting, beam_decoding): prefix = "WordDecoderLayer_" self.y_emb, self.context, self.init_state, self.xidx, self.state_z = layer_input self.x_mask, self.y_mask = mask self.dim_y, self.hidden_size, self.ctx_size, self.batch_size, self.updated_batch_size, self.latent_size = shape self.is_predicting = is_predicting self.W = init_weights((self.dim_y, self.hidden_size), prefix + "W", num_concatenate=2, axis_concatenate=1) self.U = init_weights((self.hidden_size, self.hidden_size), prefix + "U", "ortho", num_concatenate=2, axis_concatenate=1) self.b = init_bias(self.hidden_size, prefix + "b", num_concatenate=2) self.Wx = init_weights((self.dim_y, self.hidden_size), prefix + "Wx") self.Wxz = init_weights((self.dim_y, self.latent_size), prefix + "Wxz") self.bxz = init_bias(self.latent_size, prefix + "bxz") self.Ux = init_weights((self.hidden_size, self.hidden_size), prefix + "Ux", "ortho") self.bx = init_bias(self.hidden_size, prefix + "bx") self.Wc_att = init_weights((self.ctx_size, self.ctx_size), prefix + "Wc_att", "ortho") self.b_att = init_bias(self.ctx_size, prefix + "b_att") self.W_comb_att = init_weights((self.hidden_size, self.ctx_size), prefix + "W_comb_att") self.U_att = init_weights((self.ctx_size, 1), prefix + "U_att") self.U_nl = init_weights((self.hidden_size, self.hidden_size), prefix + "U_nl", "ortho", num_concatenate=2, axis_concatenate=1) self.b_nl = init_bias(self.hidden_size, prefix + "b_nl", num_concatenate=2) self.Ux_nl = init_weights((self.hidden_size, self.hidden_size), prefix + "Ux_nl", "ortho") self.bx_nl = init_bias(self.hidden_size, prefix + "bx_nl") self.Wc = init_weights((self.ctx_size, self.hidden_size), prefix + "Wc", num_concatenate=2, axis_concatenate=1) self.Wcx = init_weights((self.ctx_size, self.hidden_size), prefix + "Wcx") self.W_hz = init_weights((self.hidden_size, self.hidden_size), prefix + "W_hz") self.W_zz = init_weights((self.latent_size, self.hidden_size), prefix + "W_zz") self.W_hu = init_weights((self.hidden_size, self.latent_size), prefix + "W_hu") self.b_hu = init_bias(self.latent_size, prefix + "b_hu") self.W_hsigma = init_weights((self.hidden_size, self.latent_size), prefix + "W_hsigma") self.b_hsigma = init_bias(self.latent_size, prefix + "b_hsigma") z_params = [self.W_hu, self.b_hu, self.W_hsigma, self.b_hsigma] self.params = [ self.W, self.U, self.b, self.Wx, self.Ux, self.bx, self.U_nl, self.b_nl, self.Ux_nl, self.bx_nl, self.Wc, self.Wcx, self.Wc_att, self.b_att, self.W_comb_att, self.U_att, self.W_hz, self.W_zz, self.W_hu, self.b_hu, self.W_hsigma, self.b_hsigma, self.Wxz, self.bxz ] if is_predicting: if beam_decoding: self.x_mask = T.tile(self.x_mask, (1, self.batch_size, 1)) self.y_mask = T.ones((self.batch_size, 1)) self.pctx = T.dot(self.context, self.Wc_att) + self.b_att self.x = T.dot(self.y_emb, self.W) + self.b self.xx = T.dot(self.y_emb, self.Wx) + self.bx self.xxz = T.dot(self.y_emb, self.Wxz) + self.bxz def _slice(x, n): if x.ndim == 3: return x[:, :, n * self.hidden_size:(n + 1) * self.hidden_size] return x[:, n * self.hidden_size:(n + 1) * self.hidden_size] def _get_word_atten(pctx, h1, W_comb_att, U_att, x_mask): unreg_att = T.tanh(pctx + T.dot(h1, W_comb_att)) * x_mask unreg_att = T.dot(unreg_att, U_att) word_atten = T.exp( unreg_att - T.max(unreg_att, axis=0, keepdims=True)) * x_mask sum_word_atten = T.sum(word_atten, axis=0, keepdims=True) word_atten = T.switch(T.eq(word_atten, 0.0), 0.0, word_atten / sum_word_atten) word_atten = T.addbroadcast(word_atten, word_atten.ndim 
- 1) return word_atten def _active(x, xx, xxz, y_mask, pre_h, pre_z, pctx, context, x_mask, U, Ux, U_nl, Ux_nl, b_nl, bx_nl, Wc, Wcx, W_comb_att, U_att, W_hz, W_zz, W_hu, b_hu, W_hsigma, b_hsigma, xidx): tmp1 = T.nnet.sigmoid(T.dot(pre_h, U) + x) r1 = _slice(tmp1, 0) u1 = _slice(tmp1, 1) h1 = T.tanh(T.dot(pre_h * r1, Ux) + xx) h1 = u1 * pre_h + (1.0 - u1) * h1 h1 = y_mask * h1 + (1.0 - y_mask) * pre_h # recurrent-vae encoder xh_z = T.nnet.sigmoid(T.dot(pre_z, W_zz) + T.dot(h1, W_hz) + xxz) mu = T.dot(xh_z, W_hu) + b_hu log_var = T.dot(xh_z, W_hsigma) + b_hsigma var = T.exp(log_var) sigma = T.sqrt(var) eps = 0.0 if not self.is_predicting: eps = floatX( np.random.normal( 0, 1, (self.updated_batch_size, self.latent_size))) eps = T.reshape(eps, mu.shape) eps = T.clip(eps, -5, 5) z = mu + sigma * eps # len(x) * batch_size * 1 word_atten = _get_word_atten(pctx, h1, W_comb_att, U_att, x_mask) atted_ctx = T.sum(word_atten * context, axis=0) tmp2 = T.nnet.sigmoid( T.dot(atted_ctx, Wc) + T.dot(h1, U_nl) + b_nl) r2 = _slice(tmp2, 0) u2 = _slice(tmp2, 1) h2 = T.tanh(T.dot(atted_ctx, Wcx) + T.dot(h1 * r2, Ux_nl) + bx_nl) h2 = u2 * h1 + (1.0 - u2) * h2 h2 = y_mask * h2 + (1.0 - y_mask) * h1 cp_idx = T.argmax(word_atten, axis=0).reshape((self.batch_size, 1)) cp_idx = xidx[cp_idx[:, 0], T.arange(self.batch_size)] return h2, z, atted_ctx, cp_idx, mu, var sequences = [self.x, self.xx, self.xxz, self.y_mask] non_sequences = [ self.pctx, self.context, self.x_mask, self.U, self.Ux, self.U_nl, self.Ux_nl, self.b_nl, self.bx_nl, self.Wc, self.Wcx, self.W_comb_att, self.U_att, self.W_hz, self.W_zz, self.W_hu, self.b_hu, self.W_hsigma, self.b_hsigma, self.xidx.reshape((self.xidx.shape[0], self.batch_size)) ] if self.is_predicting: print "use one-step decoder" hs, zs, ac, cp_idx, mu, var = _active( *(sequences + [self.init_state, self.state_z] + non_sequences)) else: init_z = T.zeros((self.batch_size, self.latent_size), dtype=theano.config.floatX) [hs, zs, ac, cp_idx, mu, var], _ = theano.scan( _active, sequences=sequences, outputs_info=[self.init_state, init_z, None, None, None, None], non_sequences=non_sequences, allow_gc=False, strict=True) self.hidden_status = hs self.atted_context = ac self.word_atten = None self.cp_idx = cp_idx self.dec_z = zs self.dec_mu = mu self.dec_var = var self.z_params = z_params
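# Standalone sketch of the masked, max-shifted softmax computed by _get_word_atten
# above: scores are stabilised by subtracting their per-column maximum, padded
# source positions are zeroed by x_mask, and the switch avoids dividing 0 by 0.
# Shapes are assumed to be (src_len, batch, 1) for both scores and mask.
import theano.tensor as T

def masked_softmax_over_time(scores, x_mask):
    e = T.exp(scores - T.max(scores, axis=0, keepdims=True)) * x_mask
    denom = T.sum(e, axis=0, keepdims=True)
    return T.switch(T.eq(e, 0.0), 0.0, e / denom)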
def get_output_for(self, input, **kwargs):
    return T.tile(
        input.reshape((input.shape[0], input.shape[1], input.shape[2], 1)),
        (1, 1, 1, self.n))
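# Numpy check of the layer above, assuming a 3D input and self.n = 4: every element
# is repeated along a new trailing axis of length n.
import numpy as np

x_check = np.arange(2 * 3 * 5).reshape((2, 3, 5))
out_check = np.tile(x_check.reshape((2, 3, 5, 1)), (1, 1, 1, 4))
assert out_check.shape == (2, 3, 5, 4)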
th_g_pred_s = th_g_pred_s + th_g_dot * dt predict = tt.set_subtensor(predict[counter:counter + 1], th_g_pred_s) return predict # In[ ]: model_C = pm.Model() alpha1 = 3. beta1 = 0.05 alpha2 = 1.0 # define the distribution with model_C: sigma2s = pm.InverseGamma('sigma2s', alpha=alpha1, beta=beta1, shape=1) sigma2 = pm.Deterministic('sigma2', tt.tile(sigma2s, th.shape[0])) gamma2 = pm.Exponential(name='gamma2', lam=alpha2) ln_k_guess = pm.Normal(name='ln_k_guess', mu=0, sigma=tt.sqrt(gamma2), shape=1) y_mean = pm.Deterministic('y_mean', Solver(ln_k_guess)) y = pm.Normal(name='y', mu=y_mean, sigma=tt.sqrt(sigma2), observed=thg) # In[12]: with model_C: mcmc_res_C = pm.sample(draws=5000, step=pm.NUTS()) #_=pm.plot_posterior(mcmc_res_C, var_names=['ln_k_guess'])
def calc_log_gauss_fun_theano(self, Y, mean, covs):
    n_samples, n_dim = Y.shape
    Yc = Y - T.tile(mean, (Y.shape[0], 1))
    exp_val = -0.5 * T.sum(Yc**2 / T.tile(covs, (Y.shape[0], 1)), 1)
    norm_scal = -0.5 * T.log(2 * np.pi) * n_dim - 0.5 * T.sum(T.log(covs))
    return exp_val + norm_scal
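# Numpy sketch of the diagonal-Gaussian log-density computed above; plain
# broadcasting stands in for the explicit T.tile of the mean and covariances.
import numpy as np

def log_gauss_diag(Y, mean, covs):
    n_dim = Y.shape[1]
    Yc = Y - mean                                   # (n, d) - (d,) broadcasts
    exp_val = -0.5 * np.sum(Yc**2 / covs, axis=1)
    norm_scal = -0.5 * n_dim * np.log(2 * np.pi) - 0.5 * np.sum(np.log(covs))
    return exp_val + norm_scal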
def get_other3(x, next_modal):
    # NOTE: the tiled copy is discarded immediately; fea_other ends up being x.T,
    # and fea_single is the column of x.T selected by next_modal.
    fea_other = tensor.tile(x, (maxlen, 1))
    fea_other = x.T
    fea_single = fea_other[:, next_modal]
    return fea_other, fea_single
deterministic=False) output_before_softmax_gen = ll.get_output(disc_layers[-1], gen_dat, deterministic=False) l_lab = output_before_softmax_lab[T.arange(args.batch_size), labels] l_unl = nn.log_sum_exp(output_before_softmax_unl) l_gen = nn.log_sum_exp(output_before_softmax_gen) loss_lab = -T.mean(l_lab) + T.mean( T.mean(nn.log_sum_exp(output_before_softmax_lab))) loss_unl = -0.5 * T.mean(l_unl) + 0.5 * T.mean( T.nnet.softplus(l_unl)) + 0.5 * T.mean(T.nnet.softplus(l_gen)) # Gradient for disc z_delta_disc = T.tile(z_jacobian, (args.batch_size, 1)) * args.z_delta z_d_disc = T.sum(z_jacobian, axis=1).dimshuffle('x', 0) * args.z_delta x_disc_jacobian_lab = x_lab.repeat(sample_dim, axis=0) labels_jacobian = labels.repeat(sample_dim) gen_dat_del_lab = ll.get_output(gen_layers[-1], { gen_img_input: x_disc_jacobian_lab, gen_noise_input: z_delta_disc }, deterministic=False) gen_dat_zero_lab = ll.get_output(gen_layers[-1], { gen_img_input: x_disc_jacobian_lab, gen_noise_input: T.zeros_like(z_delta_disc) }, deterministic=False) disc_dat_delta_lab = ll.get_output(disc_layers[-1],
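# Sketch of the unlabeled discriminator loss assembled above (the semi-supervised
# GAN formulation of Salimans et al.): with l = log-sum-exp over the K class
# logits, D(x) = exp(l) / (exp(l) + 1), so -log D(x) = softplus(l) - l and
# -log(1 - D(G(z))) = softplus(l_gen).
import theano.tensor as T

def unlabeled_loss(l_unl, l_gen):
    real_term = T.mean(T.nnet.softplus(l_unl) - l_unl)  # -log D(x_real)
    fake_term = T.mean(T.nnet.softplus(l_gen))          # -log(1 - D(G(z)))
    return 0.5 * real_term + 0.5 * fake_term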
def cov_gradients(self, verbose=0): """ Create covariance function for the gradients Returns: theano.tensor.matrix: covariance of the gradients. Shape number of points in dip_pos x number of points in dip_pos """ # Euclidean distances sed_dips_dips = self.squared_euclidean_distances( self.dips_position_tiled, self.dips_position_tiled) if 'sed_dips_dips' in self.verbose: sed_dips_dips = theano.printing.Print('sed_dips_dips')( sed_dips_dips) # Cartesian distances between dips positions h_u = T.vertical_stack( T.tile( self.dips_position[:, 0] - self.dips_position[:, 0].reshape( (self.dips_position[:, 0].shape[0], 1)), self.n_dimensions), T.tile( self.dips_position[:, 1] - self.dips_position[:, 1].reshape( (self.dips_position[:, 1].shape[0], 1)), self.n_dimensions), T.tile( self.dips_position[:, 2] - self.dips_position[:, 2].reshape( (self.dips_position[:, 2].shape[0], 1)), self.n_dimensions)) # Transpose h_v = h_u.T # Perpendicularity matrix. Boolean matrix to separate cross-covariance and # every gradient direction covariance (block diagonal) perpendicularity_matrix = T.zeros_like(sed_dips_dips) # Cross-covariances of x perpendicularity_matrix = T.set_subtensor( perpendicularity_matrix[0:self.dips_position.shape[0], 0:self.dips_position.shape[0]], 1) # Cross-covariances of y perpendicularity_matrix = T.set_subtensor( perpendicularity_matrix[ self.dips_position.shape[0]:self.dips_position.shape[0] * 2, self.dips_position.shape[0]:self.dips_position.shape[0] * 2], 1) # Cross-covariances of z perpendicularity_matrix = T.set_subtensor( perpendicularity_matrix[self.dips_position.shape[0] * 2:self.dips_position.shape[0] * 3, self.dips_position.shape[0] * 2:self.dips_position.shape[0] * 3], 1) # Covariance matrix for gradients at every xyz direction and their cross-covariances C_G = T.switch( T.eq(sed_dips_dips, 0), # This is the condition 0, # If true it is equal to 0. This is how a direction affect another ( # else, following Chiles book (h_u * h_v / sed_dips_dips**2) * (((sed_dips_dips < self.a_T) * # first derivative (-self.c_o_T * ((-14 / self.a_T**2) + 105 / 4 * sed_dips_dips / self.a_T**3 - 35 / 2 * sed_dips_dips**3 / self.a_T**5 + 21 / 4 * sed_dips_dips**5 / self.a_T**7))) + (sed_dips_dips < self.a_T) * # Second derivative self.c_o_T * 7 * (9 * sed_dips_dips**5 - 20 * self.a_T**2 * sed_dips_dips**3 + 15 * self.a_T**4 * sed_dips_dips - 4 * self.a_T**5) / (2 * self.a_T**7)) - ( perpendicularity_matrix * (sed_dips_dips < self.a_T) * # first derivative self.c_o_T * ((-14 / self.a_T**2) + 105 / 4 * sed_dips_dips / self.a_T**3 - 35 / 2 * sed_dips_dips**3 / self.a_T**5 + 21 / 4 * sed_dips_dips**5 / self.a_T**7)))) # Setting nugget effect of the gradients # TODO: This function can be substitued by simply adding the nugget effect to the diag if I remove the condition C_G += T.eye(C_G.shape[0]) * self.nugget_effect_grad_T # Add name to the theano node C_G.name = 'Covariance Gradient' if verbose > 1: theano.printing.pydotprint(C_G, outfile="graphs/" + sys._getframe().f_code.co_name + ".png", var_with_name_simple=True) if str(sys._getframe().f_code.co_name) in self.verbose: C_G = theano.printing.Print('Cov Gradients')(C_G) return C_G
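# Numpy sketch of the block-diagonal "perpendicularity" mask built above: for n dip
# positions the (3n, 3n) matrix is 1 inside each of the three n x n diagonal blocks
# (x-x, y-y and z-z gradient covariances) and 0 elsewhere.
import numpy as np

def perpendicularity_mask(n):
    mask = np.zeros((3 * n, 3 * n))
    for k in range(3):
        mask[k * n:(k + 1) * n, k * n:(k + 1) * n] = 1.0
    return mask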
def build_model(tparams, options): ''' x: traing data y: traing label x3: neighbor data, datanum * neighbornum * featuredim y2: neighbor label ''' trng = RandomStreams(SEED) # Used for dropout. use_noise = theano.shared(numpy_floatX(0.)) maskl = tensor.matrix('maskl', dtype=config.floatX) y = tensor.vector('y', dtype='int32') x = tensor.matrix('x', dtype=config.floatX) n_samples = x.shape[0] dim_proj = x.shape[1] maxlen = options['maxlen'] x3 = tensor.tensor3('x3', dtype=config.floatX) y2 = tensor.matrix('y2', dtype='int32') neigh_num = x3.shape[1] x_nerghbors = tensor.reshape(x3, [n_samples * neigh_num, dim_proj]) modal_cost = tensor.vector('modal_cost', dtype=config.floatX) max_cost = tensor.scalar('max_cost', dtype=config.floatX) h = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj) c = tensor.alloc(numpy_floatX(0.), n_samples, dim_proj) h_n = tensor.alloc(numpy_floatX(0.), n_samples * neigh_num, dim_proj) c_n = tensor.alloc(numpy_floatX(0.), n_samples * neigh_num, dim_proj) cost = 0 cost1_mean = [] cost2_mean = [] cost3_mean = [] next_mean = [] mask = tensor.ones_like( x[:, 0], dtype=config.floatX) # maks whether instance enter the ith iter mask_n = tensor.ones_like(x_nerghbors[:, 0], dtype=config.floatX) masks = [] projs = [] masks.append(mask) next_modal = tensor.zeros_like(x[:, 0], dtype='int32') next_modal_n = tensor.zeros_like(x_nerghbors[:, 0], dtype='int32') # cost_vector = tensor.alloc(numpy_floatX(0.),n_samples,1) cost_vector = tensor.alloc(numpy_floatX(0.), 1, n_samples) f_pred_set = [] f_pred_seq_set = [] f_pred_seq_prob_set = [] f_get_fea_set = [] f_fea_other_set = [] def get_other3(x, next_modal): fea_other = tensor.tile(x, (maxlen, 1)) fea_other = x.T fea_single = fea_other[:, next_modal] return fea_other, fea_single def get_other(x): # change the feature x from dim to the form of maxlen * dim fea_other = [] for i in range(maxlen): fea_other.append(x * maskl[i]) return tensor.stack(fea_other) def get_single(x, next_modal): # get the current modal' feature fea_single = x * maskl[next_modal] return fea_single def compute_dist(neighbor, pred_neighbor, fea_single, pred, mask, y, y2): ''' minimize same label neighbor's distance, maximize different label neighbor's distance neighbor: neighbor's feature pred_neighbor: neighbor's netmodal's prediction fea_single: current instance's feature pred: current instance's prediction mask: whether current instance stops y: current instance's label y2: neighbor instance's label ''' loss = 0 if mask: ifsamelabel = -1 for i in range(3): if y == y2[i]: ifsamelabel = 1 else: ifsamelabel = -1 dist = tensor.dot(get_other(neighbor[i]).T, pred_neighbor[i]) - tensor.dot( get_other(fea_single).T, pred) loss += ifsamelabel * tensor.dot(dist, dist.T) return loss / 3 costs = tensor.tile(modal_cost, (n_samples, 1)) xs = [] for i in range(recyl_maxlen): # set high cost for modal that has been used to prevent predict same modal costs = tensor.set_subtensor( costs[tensor.arange(n_samples), next_modal], 1) feas, update = theano.scan( fn=get_single, sequences=[x, next_modal], ) fea_single_n, update_n = theano.scan( fn=get_single, sequences=[x_nerghbors, next_modal_n], ) fea_single = feas max_coefficients_supported = 10000 xs.append(fea_single) [h, c] = get_layer(options['encoder'])[1](tparams, fea_single, options, prefix=options['encoder'], mask=mask, h_before=h, c_before=c) [h_n, c_n] = get_layer(options['encoder'])[1](tparams, fea_single_n, options, prefix=options['encoder'], mask=mask_n, h_before=h_n, c_before=c_n) proj = h proj_n = h_n 
projs.append(proj) projsmatrix = tensor.stack(projs) proj_pred = tensor.stack(projs) * tensor.stack(masks)[:, :, None] proj_pred = tensor.transpose(proj_pred, (1, 0, 2)) proj_pred = tensor.reshape(proj_pred, [ projsmatrix.shape[1], projsmatrix.shape[0] * projsmatrix.shape[2] ]) # print('h_n.shape', h_n.shape) if options['use_dropout']: proj_pred = dropout_layer(proj_pred, use_noise, trng) pred = tensor.nnet.softmax( tensor.dot(proj_pred, tparams['U_' + str(i)]) + tparams['b_' + str(i)]) print('i', i) f_pred_prob = theano.function([x, maskl, modal_cost, max_cost], pred, name='f_pred_prob', on_unused_input='ignore', allow_input_downcast=True) f_pred = theano.function([x, maskl, modal_cost, max_cost], pred.argmax(axis=1), name='f_pred', on_unused_input='ignore', allow_input_downcast=True) f_pred_set.append(f_pred) off = 1e-8 if pred.dtype == 'float16': off = 1e-6 pred_seq = tensor.nnet.softmax( tensor.dot(proj, tparams['U_seq_' + str(i)]) + tparams['b_seq_' + str(i)]) pred_seq_n = tensor.nnet.softmax( tensor.dot(proj_n, tparams['U_seq_' + str(i)]) + tparams['b_seq_' + str(i)]) f_pred_seq = theano.function([x, maskl, modal_cost, max_cost], pred_seq.argmax(axis=1), name='f_pred_seq', on_unused_input='ignore', allow_input_downcast=True) f_pred_seq_set.append(f_pred_seq) pred_seq_index = pred_seq.argmax(axis=1) next_modal = pred_seq_index next_modal_n = pred_seq_n.argmax(axis=1) next_mean.append(next_modal) cost1_vector = tensor.log(pred[tensor.arange(n_samples), y] + off) cost1 = (cost1_vector * mask).sum() / (mask.sum() + 1) pred_seq_n3 = tensor.reshape(pred_seq_n, [n_samples, neigh_num, maxlen]) result_loss2, update = theano.scan( fn=compute_dist, sequences=[x3, pred_seq_n3, x, pred_seq, mask, y, y2], ) cost2 = result_loss2.mean() cost3 = (costs * pred_seq).mean() cost1_mean.append(cost1) cost2_mean.append(cost2) cost3_mean.append(cost3) lamda1 = 0.001 lamda2 = 0.1 if i == recyl_maxlen - 1: lamda1 = 0.000000001 lamda2 = 0.000000001 cost += -cost1 + lamda1 * cost2 + lamda2 * cost3 # cost += -cost1 # f_fea_other = theano.function([x, x3, y, maskl, modal_cost, max_cost],[nnext, D,cost1,cost2,cost3,mask.sum(),next_modal, fea_single, fea_other, fea_single3, fea_other3], on_unused_input='ignore') # f_fea_other_set.append(f_fea_other) result, update = theano.scan(lambda b, a: a[b], sequences=pred_seq_index, non_sequences=modal_cost) if i == 0: cost_vector = result else: cost_vector += result # mask the instance if its cost larger than max_cost choice = tensor.nonzero(tensor.gt(-cost_vector, -max_cost))[0] mask = tensor.zeros_like(x[:, 0], dtype=config.floatX) mask = theano.tensor.set_subtensor(mask[choice], 1.) masks.append(mask) if i < recyl_maxlen: cost -= (2 * (1 - mask) * cost1_vector).sum() / (mask.sum() + 1) else: cost -= cost1 f_fea_other = theano.function([x, x3, y2, y, maskl, modal_cost, max_cost], [ tensor.stack(cost1_mean), tensor.stack(cost2_mean), tensor.stack(cost3_mean) ], on_unused_input='ignore') return use_noise, x, x3, y2, maskl, y, cost, modal_cost, max_cost, f_pred_set, f_pred_seq_set, f_fea_other
def complex_RNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'): np.random.seed(1234) rng = np.random.RandomState(1234) # Initialize parameters: theta, V_re, V_im, hidden_bias, U, out_bias, h_0 V = initialize_matrix(n_input, 2 * n_hidden, 'V', rng) U = initialize_matrix(2 * n_hidden, n_output, 'U', rng) hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01, high=0.01, size=(n_hidden, )), dtype=theano.config.floatX), name='hidden_bias') reflection = initialize_matrix(2, 2 * n_hidden, 'reflection', rng) out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX), name='out_bias') theta = theano.shared(np.asarray(rng.uniform(low=-np.pi, high=np.pi, size=(3, n_hidden)), dtype=theano.config.floatX), name='theta') bucket = np.sqrt(3. / 2 / n_hidden) h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket, high=bucket, size=(1, 2 * n_hidden)), dtype=theano.config.floatX), name='h_0') parameters = [V, U, hidden_bias, reflection, out_bias, theta, h_0] x, y = initialize_data_nodes(loss_function, input_type, out_every_t) index_permute = np.random.permutation(n_hidden) index_permute_long = np.concatenate( (index_permute, index_permute + n_hidden)) swap_re_im = np.concatenate((np.arange(n_hidden, 2 * n_hidden), np.arange(n_hidden))) # define the recurrence used by theano.scan def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, theta, V, hidden_bias, out_bias, U): # Compute hidden linear transform step1 = times_diag(h_prev, n_hidden, theta[0, :], swap_re_im) step2 = do_fft(step1, n_hidden) step3 = times_reflection(step2, n_hidden, reflection[0, :]) step4 = vec_permutation(step3, index_permute_long) step5 = times_diag(step4, n_hidden, theta[1, :], swap_re_im) step6 = do_ifft(step5, n_hidden) step7 = times_reflection(step6, n_hidden, reflection[1, :]) step8 = times_diag(step7, n_hidden, theta[2, :], swap_re_im) hidden_lin_output = step8 # Compute data linear transform if loss_function == 'CE': data_lin_output = V[T.cast(x_t, 'int32')] else: data_lin_output = T.dot(x_t, V) # Total linear output lin_output = hidden_lin_output + data_lin_output # Apply non-linearity ---------------------------- # scale RELU nonlinearity modulus = T.sqrt(lin_output**2 + lin_output[:, swap_re_im]**2) rescale = T.maximum( modulus + T.tile(hidden_bias, [2]).dimshuffle('x', 0), 0.) / (modulus + 1e-5) h_t = lin_output * rescale if out_every_t: lin_output = T.dot(h_t, U) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: cost_t = theano.shared(np.float32(0.0)) acc_t = theano.shared(np.float32(0.0)) return h_t, cost_t, acc_t # compute hidden states h_0_batch = T.tile(h_0, [x.shape[1], 1]) non_sequences = [theta, V, hidden_bias, out_bias, U] if out_every_t: sequences = [x, y] else: sequences = [ x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1]) ] outputs_info = [ h_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0)) ] [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) if not out_every_t: lin_output = T.dot(hidden_states[-1, :, :], U) + out_bias.dimshuffle( 'x', 0) costs = compute_cost_t(lin_output, loss_function, y) else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return [x, y], parameters, costs
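# Standalone sketch of the scaled-ReLU ("modReLU"-style) nonlinearity shared by
# complex_RNN and UKRNN above: each complex unit, stored as the [real | imag]
# halves of a 2*n_hidden vector, is rescaled by max(|z| + b, 0) / (|z| + eps).
import numpy as np
import theano.tensor as T

def mod_relu(lin_output, hidden_bias, n_hidden, eps=1e-5):
    swap_re_im = np.concatenate((np.arange(n_hidden, 2 * n_hidden),
                                 np.arange(n_hidden)))
    modulus = T.sqrt(lin_output**2 + lin_output[:, swap_re_im]**2)
    rescale = T.maximum(modulus + T.tile(hidden_bias, [2]).dimshuffle('x', 0),
                        0.) / (modulus + eps)
    return lin_output * rescale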
def f_getid_from_replicated(ids, n_samples):
    return T.repeat(ids * n_samples, n_samples) + T.tile(
        T.arange(n_samples), ids.shape[0])
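# Numpy check of f_getid_from_replicated above: for ids = [0, 2] and n_samples = 3
# it returns the flat indices of every replica of those ids.
import numpy as np

ids_check, n_check = np.array([0, 2]), 3
out_ids = np.repeat(ids_check * n_check, n_check) + np.tile(np.arange(n_check),
                                                            len(ids_check))
assert out_ids.tolist() == [0, 1, 2, 6, 7, 8]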
def compute_log_averaged_ei(self, x, X, randomness, incumbent): # We compute the old predictive mean at x Kzz = compute_kernel( self.lls, self.lsf, self.z, self.z) + T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf) KzzInv = T.nlinalg.MatrixInversePSD()(Kzz) LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost)) covCavityInv = KzzInv + LLt * casting( self.n_points - self.set_for_training) / casting(self.n_points) covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv) meanCavity = T.dot( covCavity, casting(self.n_points - self.set_for_training) / casting(self.n_points) * self.mParamPost) KzzInvmeanCavity = T.dot(KzzInv, meanCavity) Kxz = compute_kernel(self.lls, self.lsf, x, self.z) m_old_x = T.dot(Kxz, KzzInvmeanCavity) # We compute the old predictive mean at X KXz = compute_kernel(self.lls, self.lsf, X, self.z) m_old_X = T.dot(KXz, KzzInvmeanCavity) # We compute the required cross covariance matrices KXX = compute_kernel(self.lls, self.lsf, X, X) - T.dot( T.dot(KXz, KzzInv), KXz.T) + T.eye(X.shape[0]) * self.jitter * T.exp(self.lsf) KXXInv = T.nlinalg.MatrixInversePSD()(KXX) KxX = compute_kernel(self.lls, self.lsf, x, X) xX = T.concatenate([x, X], 0) KxXz = compute_kernel(self.lls, self.lsf, xX, self.z) KxX = KxX - T.dot(T.dot(KxXz[0:x.shape[0], :], KzzInv), KxXz[x.shape[0]:xX.shape[0], :].T) # We compute the new posterior mean samples_internal = T.dot(MatrixChol()(KXX), randomness) new_predictive_mean = T.tile(m_old_x, [1, randomness.shape[1]]) + T.dot( KxX, T.dot(KXXInv, samples_internal)) # We compute the new posterior variance z_expanded = T.concatenate([self.z, X], 0) Kxz_expanded = compute_kernel(self.lls, self.lsf, x, z_expanded) Kzz_expanded = compute_kernel( self.lls, self.lsf, z_expanded, z_expanded) + T.eye( z_expanded.shape[0]) * self.jitter * T.exp(self.lsf) Kzz_expandedInv = T.nlinalg.MatrixInversePSD()(Kzz_expanded) v_out = T.exp(self.lsf) - T.dot( Kxz_expanded * T.dot(Kxz_expanded, Kzz_expandedInv), T.ones_like(z_expanded[:, 0:1])) new_predictive_var = T.tile(v_out, [1, randomness.shape[1]]) s = (incumbent - new_predictive_mean) / T.sqrt(new_predictive_var) log_ei = T.log((incumbent - new_predictive_mean) * ratio(s) + T.sqrt(new_predictive_var)) + log_n_pdf(s) return T.mean(LogSumExp(log_ei, 1), 1)
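# Sketch of what the log-space expression above evaluates: with
# s = (incumbent - mu) / sigma, ratio(s) = Phi(s) / phi(s) and log_n_pdf = log phi,
# log_ei reduces to the log of the standard expected improvement
#     EI = (incumbent - mu) * Phi(s) + sigma * phi(s).
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, incumbent):
    s = (incumbent - mu) / sigma
    return (incumbent - mu) * norm.cdf(s) + sigma * norm.pdf(s)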
def tile(x, n): return T.tile(x, n)
def get_output_for(self, inputs, **kwargs): C = inputs[0] q = inputs[1] input_sentence = inputs[2] input_time = inputs[3] C = C + self.T_[input_time].dimshuffle(('x',0,1)) C_reshaped = T.reshape(C,(-1,C.shape[1],1,self.hid_state_size)) tiled_q = T.tile(T.reshape( q,(-1,1,1,self.hid_state_size)),(1,C.shape[1],1,1)) input_sentence_mask = self.sentence_mask_mat[input_sentence-1,:C.shape[1]] W_in_stacked = T.concatenate([self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hid_update], axis=1) W_hid_stacked = T.concatenate([self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hid_update], axis=1) b_stacked = T.concatenate([self.b_resetgate, self.b_updategate, self.b_hid_update], axis=0) def Ep_Gate(c, m, q, Wb, W1, W2, b1, b2): z = T.concatenate([c,m,q,c*q,c*m,T.abs_(c-q),T.abs_(c-m),c*Wb*q,c*Wb*m], axis=2) #g = (T.dot(W2, nonlin.tanh(T.dot(z, W1) + b1)) + b2) <- (big mistake :) g = (T.dot(nonlin.tanh(T.dot(W1, z) + b1), W2) + b2) return g def slice_w(x, n): return x[:, n*self.hid_state_size:(n+1)*self.hid_state_size] def step(hid_previous): tiled_hid_prev = T.tile(T.reshape( hid_previous,(-1,1,1,self.hid_state_size)),(1,C.shape[1],1,1)) g = Ep_Gate(C_reshaped, tiled_hid_prev, tiled_q, self.Wb, self.W1, self.W2, self.b1, self.b2) g = T.reshape(g,(-1,C.shape[1])) g = T.switch(T.eq(input_sentence_mask, 1), g, np.float32(-np.inf)) g = nonlin.softmax(g) e = T.sum(T.reshape(g,(g.shape[0],g.shape[1],1)) * C, axis=1) input_n = e hid_input = T.dot(hid_previous, W_hid_stacked) input_n = T.dot(input_n, W_in_stacked) + b_stacked resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) hid_update_in = slice_w(input_n, 2) hid_update_hid = slice_w(hid_input, 2) hid_update = hid_update_in + resetgate*hid_update_hid hid_update = self.nonlinearity_hid(hid_update) hid = (1 - updategate)*hid_previous + updategate+hid_update return (hid, g) hid = q G = [] for i in xrange(self.n_pass): hid, g = step(hid) G.append(T.reshape(g, (-1,1,C.shape[1]))) return T.reshape(T.concatenate(G, axis=1), (-1,C.shape[1]))
def tile(self, x, n): return T.tile(x, n)
def get_output_for(self, inputs, **kwargs): # input_sentence: sentence size # input_time : sentence position C = inputs[0] q = inputs[1] input_sentence = inputs[2] input_time = inputs[3] # Apply time embedding C = C + self.T_[input_time].dimshuffle(('x',0,1)) # Reshape for parallelizing computation of gates C_reshaped = T.reshape(C,(-1,C.shape[1],1,self.hid_state_size)) tiled_q = T.tile(T.reshape( q,(-1,1,1,self.hid_state_size)),(1,C.shape[1],1,1)) input_sentence_mask = self.sentence_mask_mat[input_sentence-1,:C.shape[1]] W_in_stacked = T.concatenate([self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hid_update], axis=1) W_hid_stacked = T.concatenate([self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hid_update], axis=1) b_stacked = T.concatenate([self.b_resetgate, self.b_updategate, self.b_hid_update], axis=0) def Ep_Gate(c, m, q, Wb, W1, W2, b1, b2): z = T.concatenate([c,m,q,c*q,c*m,T.abs_(c-q),T.abs_(c-m),c*Wb*q,c*Wb*m], axis=2) #g = (T.dot(W2, nonlin.tanh(T.dot(z, W1) + b1)) + b2) <- (big mistake :) g = (T.dot(nonlin.tanh(T.dot(W1, z) + b1), W2) + b2) return g def slice_w(x, n): return x[:, n*self.hid_state_size:(n+1)*self.hid_state_size] # Step for computing summarized episodes recurrently def step(hid_previous): # Computing a summarized episode. tiled_hid_prev = T.tile(T.reshape( hid_previous,(-1,1,1,self.hid_state_size)),(1,C.shape[1],1,1)) g = Ep_Gate(C_reshaped, tiled_hid_prev, tiled_q, self.Wb, self.W1, self.W2, self.b1, self.b2) g = T.reshape(g,(-1,C.shape[1])) g = T.switch(T.eq(input_sentence_mask, 1), g, np.float32(-np.inf)) g = nonlin.softmax(g) e = T.sum(T.reshape(g,(g.shape[0],g.shape[1],1)) * C, axis=1) # After computing the episode, now it is typical GRU. input_n = e hid_input = T.dot(hid_previous, W_hid_stacked) input_n = T.dot(input_n, W_in_stacked) + b_stacked resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) hid_update_in = slice_w(input_n, 2) hid_update_hid = slice_w(hid_input, 2) hid_update = hid_update_in + resetgate*hid_update_hid hid_update = self.nonlinearity_hid(hid_update) hid = (1 - updategate)*hid_previous + updategate+hid_update return hid hid = q # Repeat step process in n_pass times. for i in xrange(self.n_pass): hid = step(hid) return hid
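# For reference, the conventional GRU state update is a convex combination of the
# previous state and the candidate. Note that both `step` functions above compute
# `(1 - updategate) * hid_previous + updategate + hid_update`, i.e. they add the
# candidate where the standard formulation multiplies it by the update gate.
def gru_combine(hid_previous, hid_update, updategate):
    return (1 - updategate) * hid_previous + updategate * hid_update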
def sample_hier_rbf(model_matrix, sample_kwargs=None): # load the data x_mu_rbf = model_matrix['x_mu_rbf'] x_sd_rbf = model_matrix['x_sd_rbf'] x_sc = model_matrix['x_sc'] subj_idx = model_matrix['subj_idx'] y = model_matrix['y'] n_subj = model_matrix['n_subj'] # fit the first model n, d = x_mu_rbf.shape if sample_kwargs is None: # Here, we use specify NUTS as our sampler (implicitly this is the default) # and use variational inference to initialize sample_kwargs = dict(draws=2000, njobs=2, tune=2000, init='advi+adapt_diag') # to do inference, all we have to do is write down the model in our # probabilistic programming language (PYMC3) and the software will # do inference over it (we can control how this happens, e.g. with # Gibbs sampling, MCMC, Variational Inference, but PYMC3 will default # to hamiltonian-MCMC with the No U-turn sampler ("NUTS")) with pm.Model() as hier_rbf: # here, we write down the model # Define hierarchical parameters # (normal means and standard deviation for regression weights) mu_1 = pm.Normal('mu_beta_rbf_mean', mu=0., sd=100.) mu_2 = pm.Normal('mu_beta_rbf_stdv', mu=0., sd=100.) mu_3 = pm.Normal('mu_beta_stick', mu=0., sd=100.) sigma_1 = pm.HalfCauchy('sigma_rbf_means', beta=100) sigma_2 = pm.HalfCauchy('sigma_rbf_stdev', beta=100) sigma_3 = pm.HalfCauchy('sigma_stick', beta=100) # define subject predictor variables (i.e. regression parameters, # 1 per subject per condition with a hierarchical prior) b_1 = pm.Normal('beta_rbf_mu', mu=mu_1, sd=sigma_1, shape=n_subj) b_2 = pm.Normal('beta_rbf_std', mu=mu_2, sd=sigma_2, shape=n_subj) b_3 = pm.Normal('beta_sc', mu=mu_3, sd=sigma_3, shape=n_subj) # linearly combine the predictors with the subject-specific coefficients # as a scaling factor. In practice, the coefficients have to be broadcast # in to an NxD matric via theano for element-wise multiplication rho = \ tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_mu_rbf + \ tt.tile(tt.reshape(b_2[subj_idx], (n, 1)), d) * x_sd_rbf + \ tt.tile(tt.reshape(b_3[subj_idx], (n, 1)), d) * x_sc # pass the resultant vector through a softmax to convert to a probability # distribution. Note, we don't need an additional noise parameter as that # would be collinear with the coefficients. p_hat = softmax(rho) # Data likelihood yl = pm.Categorical('yl', p=p_hat, observed=y) # inference! trace_rbf = pm.sample(**sample_kwargs) return hier_rbf, trace_rbf
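# Numpy sketch of the broadcasting idiom used above: the per-subject coefficient
# selected by subj_idx is tiled across the d columns of the design matrix so it can
# scale every predictor element-wise; plain broadcasting gives the same result.
import numpy as np

n_toy, d_toy, n_subj_toy = 6, 3, 2
x_toy = np.random.rand(n_toy, d_toy)
b_toy = np.random.rand(n_subj_toy)
subj_idx_toy = np.array([0, 0, 0, 1, 1, 1])
scaled = np.tile(b_toy[subj_idx_toy].reshape((n_toy, 1)), d_toy) * x_toy
assert np.allclose(scaled, b_toy[subj_idx_toy][:, None] * x_toy)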
def correction_factor(x_diff, y_diff, x_intersection, y_intersection, n_mc = 40000): """Function calculates the correction factor for given model""" # The intersection model x_sq = x_intersection ** 2 x_input = np.concatenate((x_intersection, x_sq), axis = 1) #MCMC model - correction factors and shif x_shared = theano.shared(x_input) gp_mean_coeff = np.array([0, epsilon, c]) gamma_alpha = 1 gamma_beta = 10 inv_gamma_alpha = 1 inv_gamma_beta = 10 with pm.Model() as gp_posteriors_model: #Priors tau_sq = pm.InverseGamma("tau_sq", alpha = inv_gamma_alpha, beta = inv_gamma_beta) sigma_sq = pm.InverseGamma("sigma_sq", alpha = 10, beta= 1) lamb_sq = pm.Gamma("lamb_sq", alpha = gamma_alpha, beta = gamma_beta, shape = 2) theta = pm.Normal("theta", mu= 0, sd = 1) #Shared variables for the input x_input_theta = tt.concatenate([x_shared, tt.tile(theta, (len(x_input), 1))], axis = 1) #GP definition #Mean mean_gp = pm.gp.mean.Linear(coeffs = gp_mean_coeff, intercept = 0) #Covariance cov_gp = tau_sq * pm.gp.cov.ExpQuad(x_input.shape[1] + 1, ls = tt.sqrt(lamb_sq) / 4, active_dims = [0,2]) #GP gp_model = pm.gp.Marginal(mean_func=mean_gp, cov_func= cov_gp) #Marginal likelihoods y_ = gp_model.marginal_likelihood("y_", X = x_input_theta, y = y_intersection, noise = tt.sqrt(sigma_sq)) trace_priors = pm.sample(n_mc, tune = 10000, chains = 1) # The complement model x_sq = x_diff ** 2 x_input = np.concatenate((x_diff, x_sq), axis = 1) #MCMC model - correction factors and shif x_shared = theano.shared(x_input) gp_mean_coeff = np.array([0, epsilon, c]) gamma_alpha = 1 gamma_beta = 10 inv_gamma_alpha = 1 inv_gamma_beta = 10 with pm.Model() as pymc3_model: #Priors tau_sq = pm.InverseGamma("tau_sq", alpha = inv_gamma_alpha, beta = inv_gamma_beta) sigma_sq = pm.InverseGamma("sigma_sq", alpha = 10, beta= 1) lamb_sq = pm.Gamma("lamb_sq", alpha = gamma_alpha, beta = gamma_beta, shape = 2) theta = pm.Normal("theta", mu= 0, sd = 1) #Shared variables for the input x_input_theta = tt.concatenate([x_shared, tt.tile(theta, (len(x_input), 1))], axis = 1) #GP definition #Mean mean_gp = pm.gp.mean.Linear(coeffs = gp_mean_coeff, intercept = 0) #Covariance cov_gp = tau_sq * pm.gp.cov.ExpQuad(x_input.shape[1] + 1, ls = tt.sqrt(lamb_sq) / 4, active_dims = [0,2]) #GP gp_model = pm.gp.Marginal(mean_func=mean_gp, cov_func= cov_gp) #Marginal likelihoods y_gp = gp_model.marginal_likelihood("y_", X = x_input_theta, y = y_diff, noise = tt.sqrt(sigma_sq)) log_likelihood = np.empty(0) mc_integral = np.empty(n_mc) logp = y_gp.logp for i in tqdm(range(n_mc), desc = "Log likelihood eval"): log_likelihood = np.append(log_likelihood, logp(trace_priors[i])) for i in range(n_mc): m = max(log_likelihood[:(i + 1)]) mc_integral[i] = (np.exp(m) * np.sum(np.exp(log_likelihood[:(i + 1)] - m))) / (i + 1) return log_likelihood, mc_integral
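# Sketch of the input-augmentation pattern used in both GP models above: the scalar
# calibration parameter theta is appended as one constant extra column, so the GP
# sees inputs of shape (n, original_dims + 1). Numpy stands in for the tt version.
import numpy as np

x_input_toy = np.random.rand(5, 2)
theta_toy = 0.3
x_aug = np.concatenate([x_input_toy, np.tile(theta_toy, (len(x_input_toy), 1))],
                       axis=1)
assert x_aug.shape == (5, 3)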
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) ''' combine train and dev ''' train_sents = np.concatenate([train_sents, dev_sents], axis=0) train_masks = np.concatenate([train_masks, dev_masks], axis=0) train_labels = np.concatenate([train_labels, dev_labels], axis=0) train_size = train_size + dev_size test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] # NN_para = multiCNN_para+ACNN_para conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1) LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, 
sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1) LR_att_input_size = hidden_size[0] + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_att_a = create_ensemble_para( rng, 12, LR_att_input_size) # the weight matrix hidden_size*2 LR_att_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_att_para = [U_att_a, LR_att_b] layer_att_LR = LogisticRegression( rng, input=LR_att_input, n_in=LR_att_input_size, n_out=12, W=U_att_a, b=LR_att_b ) #basically it is a multiplication between weight matrix and input feature vector att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax) #batch * 12 att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix, att_score_matrix) att_loss = -T.mean(T.log(att_prob_pos)) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l acnn_LR_input = T.concatenate( [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1) acnn_LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a = create_ensemble_para( rng, 12, acnn_LR_input_size) # the weight matrix hidden_size*2 acnn_LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) 
#(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para # put all model parameters together cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' ensemble_NN_scores = T.max(T.concatenate([ att_score_matrix.dimshuffle('x', 0, 1), score_matrix.dimshuffle('x', 0, 1), acnn_score_matrix.dimshuffle('x', 0, 1) ], axis=0), axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * ( cosine_score_matrix + top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) cost_i = 0.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
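# Sketch of the multi-label loss used in evaluate_lenet5 above: each of the 12 type
# scores passes through a sigmoid, and the log-probability of the correct 0/1
# decision is averaged over all (example, label) pairs.
import theano.tensor as T

def multilabel_nll(before_softmax, labels):
    score = T.nnet.sigmoid(before_softmax)                 # (batch, 12)
    prob_correct = T.where(labels < 1, 1.0 - score, score)
    return -T.mean(T.log(prob_correct))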
# Model prior definitions gamma_alpha = 1 gamma_beta = 10 inv_gamma_alpha = 1 inv_gamma_beta = 10 with pm.Model() as gp_toy_model: #Priors tau_sq = pm.InverseGamma("tau_sq", alpha = inv_gamma_alpha, beta = inv_gamma_beta) sigma_sq = pm.InverseGamma("sigma_sq", alpha = 10, beta= 1) lamb_sq = pm.Gamma("lamb_sq", alpha = gamma_alpha, beta = gamma_beta, shape = 2) theta = pm.Normal("theta", mu= 0, sd = 1) #Shared variables for the input x_input_theta = tt.concatenate([x_shared, tt.tile(theta, (len(x_input), 1))], axis = 1) #GP definition #Mean mean_gp = pm.gp.mean.Linear(coeffs = gp_mean_coeff, intercept = 0) #Covariance cov_gp = tau_sq * pm.gp.cov.ExpQuad(x_input.shape[1] + 1, ls = tt.sqrt(lamb_sq) / 4, active_dims = [0,2]) #GP gp_model = pm.gp.Marginal(mean_func=mean_gp, cov_func= cov_gp) #Marginal likelihoods y_gp = gp_model.marginal_likelihood("y_", X = x_input_theta, y = y_train_a, noise = tt.sqrt(sigma_sq)) # In[28]:
def get_output(self, train=False): X = self.get_input(train) print 'ball model X:', X, X.ndim self.middle = X initial = [[27, 0, 15], [9, 0, 10], [-9, 0, 10], [-27, 0, 10], [27, 0, 30], [9, 0, 30], [-9, 0, 30], [-27, 0, 30], [27, 0, 50], [9, 0, 50], [-9, 0, 50], [-27, 0, 50], [28, 0, 70], [8, 0, 70], [-11, 0, 70], [-28, 0, 70], [41, 0, 15], [45, 0, 15], [49, 0, 15], [53, 0, 15], [67, 0, 15], [81, 0, 15], [94, 0, 15], [105, 0, 15], [28, 0, 88], [28, 0, 104], [28, 0, 119], [28, 0, 133], [28, 0, 145], [28, 0, 156], [8, 0, 90], [8, 0, 110], [8, 0, 127], [8, 0, 141], [8, 0, 155], [8, 0, 169], [-11, 0, 88], [-11, 0, 104], [-11, 0, 119], [-11, 0, 133], [-11, 0, 145], [-11, 0, 156], [-28, 0, 85], [-28, 0, 95], [-28, 0, 105], [-28, 0, 116], [-28, 0, 127], [-28, 0, 135]] outputans = [] for batch in range(0, self.batchsize): d = self.middle[batch].reshape((26, )) ##ndim=0 ro = T.dot(T.dot(self._getrx(d[3]), self._getry(d[4])), self._getrz(d[5])) to1to16 = theano.shared(np.zeros((3, 1), dtype='float32')) ###o'=(d1,d2,d3) to1to16 = T.set_subtensor(to1to16[0], d[0]) to1to16 = T.set_subtensor(to1to16[1], d[1]) to1ti16 = T.set_subtensor(to1to16[2], d[2]) o1to16 = T.tile(to1ti16, (1, 16)) # xi1to16 = theano.shared(np.zeros((16, 3), dtype='float32')) xi1to16 = T.dot(ro, np.array(initial[0:16]).T) + o1to16 ####################################Thumb First ###17-20 r16 = T.dot(self._getry(d[6]), self._getrz(d[7])) ###x16origin rotation joint is sphere 1 txori16 = theano.shared(np.zeros((3, 1), dtype='float32')) txori16 = T.set_subtensor(txori16[0], initial[0][0]) txori16 = T.set_subtensor(txori16[1], initial[0][1]) txori16 = T.set_subtensor(txori16[2], initial[0][2]) xori16 = T.tile(txori16, (1, 4)) to17to20 = theano.shared(np.zeros((3, 1), dtype='float32')) to17to20 = T.set_subtensor(to17to20[0], xi1to16[0][0]) to17to20 = T.set_subtensor(to17to20[1], xi1to16[1][0]) to17to20 = T.set_subtensor(to17to20[2], xi1to16[2][0]) o17to20 = T.tile(to17to20, (1, 4)) xi17to20 = T.dot(T.dot(ro, r16), np.array(initial[16:20]).T - xori16) + o17to20 ######################################Thumb Second ###21-22 r20 = self._getrz(d[8]) txori20 = theano.shared(np.zeros((3, 1), dtype='float32')) txori20 = T.set_subtensor(txori20[0], initial[19][0]) txori20 = T.set_subtensor(txori20[1], initial[19][1]) txori20 = T.set_subtensor(txori20[2], initial[19][2]) xori20 = T.tile(txori20, (1, 2)) to21to22 = theano.shared(np.zeros((3, 1), dtype='float32')) to21to22 = T.set_subtensor(to21to22[0], xi17to20[0][3]) to21to22 = T.set_subtensor(to21to22[1], xi17to20[1][3]) to21to22 = T.set_subtensor(to21to22[2], xi17to20[2][3]) o21to22 = T.tile(to21to22, (1, 2)) xi21to22 = T.dot(T.dot(T.dot(ro, r16), r20), np.array(initial[20:22]).T - xori20) + o21to22 ######################################Thumb Third ###23-24 r22 = self._getrz(d[9]) txori22 = theano.shared(np.zeros((3, 1), dtype='float32')) txori22 = T.set_subtensor(txori22[0], initial[21][0]) txori22 = T.set_subtensor(txori22[1], initial[21][1]) txori22 = T.set_subtensor(txori22[2], initial[21][2]) xori22 = T.tile(txori22, (1, 2)) to23to24 = theano.shared(np.zeros((3, 1), dtype='float32')) to23to24 = T.set_subtensor(to23to24[0], xi21to22[0][1]) to23to24 = T.set_subtensor(to23to24[1], xi21to22[1][1]) to23to24 = T.set_subtensor(to23to24[2], xi21to22[2][1]) o23to24 = T.tile(to23to24, (1, 2)) xi23to24 = T.dot(T.dot(T.dot(T.dot(ro, r16), r20), r22), np.array(initial[22:24]).T - xori22) + o23to24 ######################################First Joint Index ###25-26 r13 = T.dot(self._getrx(d[10]), 
self._getry(d[11])) txori13 = theano.shared(np.zeros((3, 1), dtype='float32')) txori13 = T.set_subtensor(txori13[0], initial[12][0]) txori13 = T.set_subtensor(txori13[1], initial[12][1]) txori13 = T.set_subtensor(txori13[2], initial[12][2]) xori13 = T.tile(txori13, (1, 2)) to25to26 = theano.shared(np.zeros((3, 1), dtype='float32')) to25to26 = T.set_subtensor(to25to26[0], xi1to16[0][12]) to25to26 = T.set_subtensor(to25to26[1], xi1to16[1][12]) to25to26 = T.set_subtensor(to25to26[2], xi1to16[2][12]) o25to26 = T.tile(to25to26, (1, 2)) xi25to26 = T.dot(T.dot(ro, r13), np.array(initial[24:26]).T - xori13) + o25to26 ######################################First Joint Middle ###31-32 r14 = T.dot(self._getrx(d[14]), self._getry(d[15])) txori14 = theano.shared(np.zeros((3, 1), dtype='float32')) txori14 = T.set_subtensor(txori14[0], initial[13][0]) txori14 = T.set_subtensor(txori14[1], initial[13][1]) txori14 = T.set_subtensor(txori14[2], initial[13][2]) xori14 = T.tile(txori14, (1, 2)) to31to32 = theano.shared(np.zeros((3, 1), dtype='float32')) to31to32 = T.set_subtensor(to31to32[0], xi1to16[0][13]) to31to32 = T.set_subtensor(to31to32[1], xi1to16[1][13]) to31to32 = T.set_subtensor(to31to32[2], xi1to16[2][13]) o31to32 = T.tile(to31to32, (1, 2)) xi31to32 = T.dot(T.dot(ro, r14), np.array(initial[30:32]).T - xori14) + o31to32 ######################################First Joint Ring ###37-38 r15 = T.dot(self._getrx(d[18]), self._getry(d[19])) txori15 = theano.shared(np.zeros((3, 1), dtype='float32')) txori15 = T.set_subtensor(txori15[0], initial[14][0]) txori15 = T.set_subtensor(txori15[1], initial[14][1]) txori15 = T.set_subtensor(txori15[2], initial[14][2]) xori15 = T.tile(txori15, (1, 2)) to37to38 = theano.shared(np.zeros((3, 1), dtype='float32')) to37to38 = T.set_subtensor(to37to38[0], xi1to16[0][14]) to37to38 = T.set_subtensor(to37to38[1], xi1to16[1][14]) to37to38 = T.set_subtensor(to37to38[2], xi1to16[2][14]) o37to38 = T.tile(to37to38, (1, 2)) xi37to38 = T.dot(T.dot(ro, r15), np.array(initial[36:38]).T - xori15) + o37to38 ######################################First Joint Little ###43-44 r16 = T.dot(self._getrx(d[22]), self._getry(d[23])) txori16 = theano.shared(np.zeros((3, 1), dtype='float32')) txori16 = T.set_subtensor(txori16[0], initial[15][0]) txori16 = T.set_subtensor(txori16[1], initial[15][1]) txori16 = T.set_subtensor(txori16[2], initial[15][2]) xori16 = T.tile(txori16, (1, 2)) to43to44 = theano.shared(np.zeros((3, 1), dtype='float32')) to43to44 = T.set_subtensor(to43to44[0], xi1to16[0][15]) to43to44 = T.set_subtensor(to43to44[1], xi1to16[1][15]) to43to44 = T.set_subtensor(to43to44[2], xi1to16[2][15]) o43to44 = T.tile(to43to44, (1, 2)) xi43to44 = T.dot(T.dot(ro, r16), np.array(initial[42:44]).T - xori16) + o43to44 ######################################Second Joint Index ###27-28 r26 = self._getrx(d[12]) txori26 = theano.shared(np.zeros((3, 1), dtype='float32')) txori26 = T.set_subtensor(txori26[0], initial[25][0]) txori26 = T.set_subtensor(txori26[1], initial[25][1]) txori26 = T.set_subtensor(txori26[2], initial[25][2]) xori26 = T.tile(txori26, (1, 2)) to27to28 = theano.shared(np.zeros((3, 1), dtype='float32')) to27to28 = T.set_subtensor(to27to28[0], xi25to26[0][1]) to27to28 = T.set_subtensor(to27to28[1], xi25to26[1][1]) to27to28 = T.set_subtensor(to27to28[2], xi25to26[2][1]) o27to28 = T.tile(to27to28, (1, 2)) xi27to28 = T.dot(T.dot(T.dot(ro, r13), r26), np.array(initial[26:28]).T - xori26) + o27to28 ######################################Second Joint Middle ###33-34 r32 = self._getrx(d[16]) 
txori32 = theano.shared(np.zeros((3, 1), dtype='float32')) txori32 = T.set_subtensor(txori32[0], initial[31][0]) txori32 = T.set_subtensor(txori32[1], initial[31][1]) txori32 = T.set_subtensor(txori32[2], initial[31][2]) xori32 = T.tile(txori32, (1, 2)) to33to34 = theano.shared(np.zeros((3, 1), dtype='float32')) to33to34 = T.set_subtensor(to33to34[0], xi31to32[0][1]) to33to34 = T.set_subtensor(to33to34[1], xi31to32[1][1]) to33to34 = T.set_subtensor(to33to34[2], xi31to32[2][1]) o33to34 = T.tile(to33to34, (1, 2)) xi33to34 = T.dot(T.dot(T.dot(ro, r14), r32), np.array(initial[32:34]).T - xori32) + o33to34 ######################################Second Joint Ring ###39-40 r38 = self._getrx(d[20]) txori38 = theano.shared(np.zeros((3, 1), dtype='float32')) txori38 = T.set_subtensor(txori38[0], initial[37][0]) txori38 = T.set_subtensor(txori38[1], initial[37][1]) txori38 = T.set_subtensor(txori38[2], initial[37][2]) xori38 = T.tile(txori38, (1, 2)) to39to40 = theano.shared(np.zeros((3, 1), dtype='float32')) to39to40 = T.set_subtensor(to39to40[0], xi37to38[0][1]) to39to40 = T.set_subtensor(to39to40[1], xi37to38[1][1]) to39to40 = T.set_subtensor(to39to40[2], xi37to38[2][1]) o39to40 = T.tile(to39to40, (1, 2)) xi39to40 = T.dot(T.dot(T.dot(ro, r15), r38), np.array(initial[38:40]).T - xori38) + o39to40 ######################################Second Joint Little ###45-46 r44 = self._getrx(d[24]) txori44 = theano.shared(np.zeros((3, 1), dtype='float32')) txori44 = T.set_subtensor(txori44[0], initial[43][0]) txori44 = T.set_subtensor(txori44[1], initial[43][1]) txori44 = T.set_subtensor(txori44[2], initial[43][2]) xori44 = T.tile(txori44, (1, 2)) to45to46 = theano.shared(np.zeros((3, 1), dtype='float32')) to45to46 = T.set_subtensor(to45to46[0], xi43to44[0][1]) to45to46 = T.set_subtensor(to45to46[1], xi43to44[1][1]) to45to46 = T.set_subtensor(to45to46[2], xi43to44[2][1]) o45to46 = T.tile(to45to46, (1, 2)) xi45to46 = T.dot(T.dot(T.dot(ro, r16), r44), np.array(initial[44:46]).T - xori44) + o45to46 #####################################Third Joint Index ###29-30 r28 = self._getrx(d[13]) txori28 = theano.shared(np.zeros((3, 1), dtype='float32')) txori28 = T.set_subtensor(txori28[0], initial[27][0]) txori28 = T.set_subtensor(txori28[1], initial[27][1]) txori28 = T.set_subtensor(txori28[2], initial[27][2]) xori28 = T.tile(txori28, (1, 2)) to29to30 = theano.shared(np.zeros((3, 1), dtype='float32')) to29to30 = T.set_subtensor(to29to30[0], xi27to28[0][1]) to29to30 = T.set_subtensor(to29to30[1], xi27to28[1][1]) to29to30 = T.set_subtensor(to29to30[2], xi27to28[2][1]) o29to30 = T.tile(to29to30, (1, 2)) xi29to30 = T.dot(T.dot(T.dot(T.dot(ro, r13), r26), r28), np.array(initial[28:30]).T - xori28) + o29to30 #####################################Third Joint Middle ###35-36 r34 = self._getrx(d[17]) txori34 = theano.shared(np.zeros((3, 1), dtype='float32')) txori34 = T.set_subtensor(txori34[0], initial[33][0]) txori34 = T.set_subtensor(txori34[1], initial[33][1]) txori34 = T.set_subtensor(txori34[2], initial[33][2]) xori34 = T.tile(txori34, (1, 2)) to35to36 = theano.shared(np.zeros((3, 1), dtype='float32')) to35to36 = T.set_subtensor(to35to36[0], xi33to34[0][1]) to35to36 = T.set_subtensor(to35to36[1], xi33to34[1][1]) to35to36 = T.set_subtensor(to35to36[2], xi33to34[2][1]) o35to36 = T.tile(to35to36, (1, 2)) xi35to36 = T.dot(T.dot(T.dot(T.dot(ro, r14), r32), r34), np.array(initial[34:36]).T - xori34) + o35to36 #####################################Third Joint Ring ###41-42 r40 = self._getrx(d[21]) txori40 = theano.shared(np.zeros((3, 
1), dtype='float32')) txori40 = T.set_subtensor(txori40[0], initial[39][0]) txori40 = T.set_subtensor(txori40[1], initial[39][1]) txori40 = T.set_subtensor(txori40[2], initial[39][2]) xori40 = T.tile(txori40, (1, 2)) to41to42 = theano.shared(np.zeros((3, 1), dtype='float32')) to41to42 = T.set_subtensor(to41to42[0], xi39to40[0][1]) to41to42 = T.set_subtensor(to41to42[1], xi39to40[1][1]) to41to42 = T.set_subtensor(to41to42[2], xi39to40[2][1]) o41to42 = T.tile(to41to42, (1, 2)) xi41to42 = T.dot(T.dot(T.dot(T.dot(ro, r15), r38), r40), np.array(initial[40:42]).T - xori40) + o41to42 #####################################Third Joint Little ###47-48 r46 = self._getrx(d[25]) txori46 = theano.shared(np.zeros((3, 1), dtype='float32')) txori46 = T.set_subtensor(txori46[0], initial[45][0]) txori46 = T.set_subtensor(txori46[1], initial[45][1]) txori46 = T.set_subtensor(txori46[2], initial[45][2]) xori46 = T.tile(txori46, (1, 2)) to47to48 = theano.shared(np.zeros((3, 1), dtype='float32')) to47to48 = T.set_subtensor(to47to48[0], xi45to46[0][1]) to47to48 = T.set_subtensor(to47to48[1], xi45to46[1][1]) to47to48 = T.set_subtensor(to47to48[2], xi45to46[2][1]) o47to48 = T.tile(to47to48, (1, 2)) xi47to48 = T.dot(T.dot(T.dot(T.dot(ro, r16), r44), r46), np.array(initial[46:48]).T - xori46) + o47to48 ret = T.concatenate([ xi1to16, xi17to20, xi21to22, xi23to24, xi25to26, xi27to28, xi29to30, xi31to32, xi33to34, xi35to36, xi37to38, xi39to40, xi41to42, xi43to44, xi45to46, xi47to48 ], axis=1) ret = ret.reshape((144, )) outputans.append(ret) outputans = T.reshape(outputans, (self.batchsize, 144)) return outputans
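# get_output above repeatedly calls self._getrx / self._getry / self._getrz to build per-joint
# rotation matrices from the pose vector d. Those helpers are not shown in this snippet; the
# following is only a plausible sketch (an assumption about their shape, not the original code)
# for the x-axis case:
def _getrx_sketch(angle):
    # 3x3 symbolic rotation about the x axis, built from a scalar Theano angle
    c = T.cos(angle)
    s = T.sin(angle)
    R = T.eye(3)
    R = T.set_subtensor(R[1, 1], c)
    R = T.set_subtensor(R[1, 2], -s)
    R = T.set_subtensor(R[2, 1], s)
    R = T.set_subtensor(R[2, 2], c)
    return R
# _getry and _getrz would follow the same pattern about the y and z axes.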
def build_gauss_model_theano(self, X):
    # Diagonal Gaussian model: per-dimension mean and (regularized) variance of the rows of X
    mean = T.mean(X, 0)
    Xc = X - T.tile(mean, (X.shape[0], 1))
    covs = T.sum(Xc ** 2, 0) / Xc.shape[0] + self.min_cov
    return mean, covs
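# A small usage sketch (assumed, not from the original file): compile the symbolic mean and
# diagonal variance returned above and check them against numpy on random data. `model` stands
# for whatever object defines build_gauss_model_theano and carries the min_cov regularizer.
def check_gauss_model_sketch(model):
    X_sym = T.fmatrix('X')
    mean_sym, covs_sym = model.build_gauss_model_theano(X_sym)
    gauss_fn = theano.function([X_sym], [mean_sym, covs_sym])
    X_np = np.random.randn(100, 5).astype('float32')
    m, v = gauss_fn(X_np)
    # m should match X_np.mean(0); v should match X_np.var(0) + model.min_cov
    return m, v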
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/' test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_onlyMT_BBN_epoch4.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_lines, word2id = load_official_testData_only_MT( word2id, maxSentLen, test_file_path) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) ''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) # test_labels=np.asarray(all_labels[2], dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + '100k-ENG-multicca.300.ENG.txt', emb_root + '100k-IL9-multicca.d300.IL9.txt' ], 300) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, 
input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + 
HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], [binarize_prob, ensemble_scores, sum_tensor3], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 # max_meanf1_test=0.0 # max_weightf1_test=0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_types = [] pred_confs = [] pred_others = [] for i, test_batch_id in enumerate( test_batch_start): # for each test batch pred_types_i, pred_conf_i, pred_fields_i = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) if i < len(test_batch_start) - 1: pred_types.append(pred_types_i) pred_confs.append(pred_conf_i) pred_others.append(pred_fields_i) else: pred_types.append(pred_types_i[-n_test_remain:]) pred_confs.append(pred_conf_i[-n_test_remain:]) pred_others.append(pred_fields_i[-n_test_remain:]) pred_types = np.concatenate(pred_types, axis=0) pred_confs = np.concatenate(pred_confs, axis=0) pred_others = np.concatenate(pred_others, axis=0) mean_frame = generate_2018_official_output( test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame) if mean_frame < min_mean_frame: min_mean_frame = mean_frame print '\t\t\t test over, min_mean_frame:', min_mean_frame print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, 
('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
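# Gradient_Cost_Para is imported from the project's shared utilities and is not shown here.
# As a hedged sketch only (the real helper may well use AdaGrad or another adaptive rule), a
# plain-SGD version consistent with how it is called above would be:
def Gradient_Cost_Para_sgd_sketch(cost, params, learning_rate):
    grads = T.grad(cost, params)
    updates = [(p, p - learning_rate * g) for p, g in zip(params, grads)]
    return updates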