def sample_sticky_only(model_matrix, sample_kwargs=None):
    # load the data
    x_sc = model_matrix['x_sc']
    subj_idx = model_matrix['subj_idx']
    y = model_matrix['y']
    n_subj = model_matrix['n_subj']

    n, d = model_matrix['x_mu_kal'].shape

    if sample_kwargs is None:
        sample_kwargs = dict(draws=2000, njobs=2, tune=2000,
                             init='advi+adapt_diag')

    with pm.Model() as hier_sticky:
        mu_1 = pm.Normal('mu_beta_stick', mu=0., sd=100.)
        sigma_1 = pm.HalfCauchy('sigma_stick', beta=100)

        b_1 = pm.Normal('beta_sticky', mu=mu_1, sd=sigma_1, shape=n_subj)

        # broadcast each subject's coefficient across the d response options
        rho = tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_sc

        p_hat = softmax(rho)

        # Data likelihood
        yl = pm.Categorical('yl', p=p_hat, observed=y)

        # inference!
        trace_sticky = pm.sample(**sample_kwargs)

    return hier_sticky, trace_sticky
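# The tt.tile(tt.reshape(...)) idiom above (and in the models below) simply
# scales every row of an (n, d) predictor matrix by that trial's
# subject-specific coefficient. A minimal NumPy sketch of the same broadcast,
# with made-up sizes, for intuition:
import numpy as np

n, d, n_subj = 6, 4, 2
b = np.array([0.5, 2.0])                  # one coefficient per subject
subj_idx = np.array([0, 0, 0, 1, 1, 1])   # subject of each trial
X = np.ones((n, d))                       # hypothetical predictor matrix

rho = np.tile(b[subj_idx].reshape(n, 1), d) * X
# identical to plain broadcasting: b[subj_idx][:, None] * X
assert np.allclose(rho, b[subj_idx][:, None] * X)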
def forward(x_t, c_tm1, s_tm1, Wxi, Wsi, Wxf, Wsf, Wxo, Wso, Wxg, Wsg,
            Wsy, bi, bf, bo, bg, by):
    # input, forget and output gates
    i = sigmoid(T.dot(x_t, Wxi) + T.dot(s_tm1, Wsi) + bi)
    f = sigmoid(T.dot(x_t, Wxf) + T.dot(s_tm1, Wsf) + bf)
    o = sigmoid(T.dot(x_t, Wxo) + T.dot(s_tm1, Wso) + bo)
    # candidate cell state
    g = tanh(T.dot(x_t, Wxg) + T.dot(s_tm1, Wsg) + bg)
    # new cell state and hidden state
    c = c_tm1 * f + g * i
    s = tanh(c) * o
    y = softmax(T.dot(s, Wsy) + by)
    return [c, s, y]
def forward(x_t, c_tm1, s_tm1, Wx, Ws, Wy, b, by):
    # single fused pre-activation for all four gates
    preact = T.dot(x_t, Wx) + T.dot(s_tm1, Ws) + b
    i = sigmoid(_slice(preact, 0))  # input gate
    f = sigmoid(_slice(preact, 1))  # forget gate
    o = sigmoid(_slice(preact, 2))  # output gate
    g = tanh(_slice(preact, 3))     # candidate cell state
    c = c_tm1 * f + g * i
    s = tanh(c) * o
    y = softmax(T.dot(s, Wy) + by)
    return [c, s, y]
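# `_slice` is not defined in the snippets above; in the classic Theano LSTM
# tutorial it carves the n-th gate block out of the fused pre-activation.
# A minimal sketch, assuming the hidden size `dim` is known (the two-argument
# calls above would close over `dim` instead of passing it in):
def _slice(x, n, dim):
    # columns [n*dim, (n+1)*dim): the n-th of the stacked gate blocks
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    return x[:, n * dim:(n + 1) * dim]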
def forward(x_t, h_tm1, Wx, Wh, bh, am, ax, ah, Wy, by):
    h_t = 1
    # combined multiplicative / additive pre-activation
    preact = am * T.dot(x_t, Wx) * T.dot(h_tm1, Wh) \
        + ax * T.dot(x_t, Wx) \
        + ah * T.dot(h_tm1, Wh) \
        + bh
    # hidden state is the product of the activated order-wise slices
    # (`act`, `_slice` and `self.order` come from the enclosing class scope)
    for i in range(self.order):
        h_t = h_t * act(_slice(preact, i))
    y_t = softmax(T.dot(h_t, Wy) + by)
    return h_t, y_t, preact
def sample_hier_rbf_kal(model_matrix, sample_kwargs=None):
    # load the data
    x_mu_rbf = model_matrix['x_mu_rbf']
    x_sd_rbf = model_matrix['x_sd_rbf']
    x_mu_kal = model_matrix['x_mu_kal']
    x_sd_kal = model_matrix['x_sd_kal']
    x_sc = model_matrix['x_sc']
    subj_idx = model_matrix['subj_idx']
    y = model_matrix['y']
    n_subj = model_matrix['n_subj']

    n, d = x_mu_rbf.shape

    if sample_kwargs is None:
        sample_kwargs = dict(draws=2000, njobs=2, tune=2000,
                             init='advi+adapt_diag')

    with pm.Model() as hier_rbf_kal:
        # group-level means and scales for the regression weights
        mu_1 = pm.Normal('mu_beta_rbf_mean', mu=0., sd=100.)
        mu_2 = pm.Normal('mu_beta_rbf_stdv', mu=0., sd=100.)
        mu_3 = pm.Normal('mu_beta_kal_mean', mu=0., sd=100.)
        mu_4 = pm.Normal('mu_beta_kal_stdv', mu=0., sd=100.)
        mu_5 = pm.Normal('mu_beta_stick', mu=0., sd=100.)

        sigma_1 = pm.HalfCauchy('sigma_rbf_means', beta=100)
        sigma_2 = pm.HalfCauchy('sigma_rbf_stdev', beta=100)
        sigma_3 = pm.HalfCauchy('sigma_kal_means', beta=100)
        sigma_4 = pm.HalfCauchy('sigma_kal_stdev', beta=100)
        sigma_5 = pm.HalfCauchy('sigma_stick', beta=100)

        # subject-level regression weights
        b_1 = pm.Normal('beta_rbf_mu', mu=mu_1, sd=sigma_1, shape=n_subj)
        b_2 = pm.Normal('beta_rbf_std', mu=mu_2, sd=sigma_2, shape=n_subj)
        b_3 = pm.Normal('beta_kal_mu', mu=mu_3, sd=sigma_3, shape=n_subj)
        b_4 = pm.Normal('beta_kal_std', mu=mu_4, sd=sigma_4, shape=n_subj)
        b_5 = pm.Normal('beta_sc', mu=mu_5, sd=sigma_5, shape=n_subj)

        rho = \
            tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_mu_rbf + \
            tt.tile(tt.reshape(b_2[subj_idx], (n, 1)), d) * x_sd_rbf + \
            tt.tile(tt.reshape(b_3[subj_idx], (n, 1)), d) * x_mu_kal + \
            tt.tile(tt.reshape(b_4[subj_idx], (n, 1)), d) * x_sd_kal + \
            tt.tile(tt.reshape(b_5[subj_idx], (n, 1)), d) * x_sc

        p_hat = softmax(rho)

        # Data likelihood
        yl = pm.Categorical('yl', p=p_hat, observed=y)

        # inference!
        trace_gprbf_kal = pm.sample(**sample_kwargs)

    return hier_rbf_kal, trace_gprbf_kal
def recurrence1(wrut, wrct, urx_pre1, cpt_pre1):
    # ResNet update
    ur_t = relu(T.dot(wrut, urx_pre1.T).T + urx_pre1)   # (batch_size, d)
    cp_t = relu(T.dot(cpt_pre1, wrct) + cpt_pre1)       # (batch_size, set_size, d)
    # attention: build the context vector
    ur_t_emb = T.dot(wa2, ur_t.T).T.dimshuffle(0, 'x', 1)
    e_t = T.dot(tanh(ur_t_emb + T.dot(cp_t, wa3)), wa1)  # shape=(batch_size, set_size)
    a_t = softmax(e_t)
    c_t = T.sum(cp_t * a_t.dimshuffle(0, 1, 'x'), axis=1)
    return [ur_t, cp_t, c_t]  # (batch_size, d), (batch_size, set_size, d), (batch_size, d)
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because we
    # need them later. We add one additional element for the initial hidden
    # state, which we set to 0.
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indexing U by x[t]. This is the same as
        # multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]
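# A hypothetical companion method (not in the original snippet) showing how
# forward_propagation is typically used: pick the highest-probability word at
# each time step.
def predict(self, x):
    # run forward propagation and return the index of the most likely output
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)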
def test_softmax_optimizations():
    from theano.tensor.nnet.nnet import softmax, crossentropy_categorical_1hot
    x = tensor.fmatrix('x')
    one_of_n = tensor.lvector('one_of_n')
    op = crossentropy_categorical_1hot
    xe = op(x, one_of_n)
    env = theano.gof.Env([x, one_of_n], [op(softmax(x), one_of_n)])
    assert env.outputs[0].owner.op == op
    mode_with_gpu.optimizer.optimize(env)
    assert str(env.outputs[0].owner.op) == 'OutputGuard'
    assert env.outputs[0].owner.inputs[0].owner.op == cuda.host_from_gpu
    assert env.outputs[0].owner.inputs[0].owner.inputs[0].owner.op == \
        cuda.nnet.gpu_crossentropy_softmax_argmax_1hot_with_bias
def sample_hier_scram_kal(model_matrix, sample_kwargs=None):
    # load the data and scramble the Kalman filter predictors
    x_mu_kal_scrambled = np.random.permutation(model_matrix['x_mu_kal'])
    x_sd_kal_scrambled = np.random.permutation(model_matrix['x_sd_kal'])
    x_sc = model_matrix['x_sc']
    subj_idx = model_matrix['subj_idx']
    y = model_matrix['y']
    n_subj = model_matrix['n_subj']

    n, d = x_mu_kal_scrambled.shape

    if sample_kwargs is None:
        sample_kwargs = dict(draws=2000, njobs=2, tune=2000,
                             init='advi+adapt_diag')

    with pm.Model() as hier_kal_scrambled:
        mu_1 = pm.Normal('mu_beta_kal_sc_mean', mu=0., sd=100.)
        mu_2 = pm.Normal('mu_beta_kal_sc_stdv', mu=0., sd=100.)
        mu_3 = pm.Normal('mu_beta_stick', mu=0., sd=100.)

        sigma_1 = pm.HalfCauchy('sigma_kal_sc_means', beta=100)
        sigma_2 = pm.HalfCauchy('sigma_kal_sc_stdev', beta=100)
        sigma_3 = pm.HalfCauchy('sigma_stick', beta=100)

        b_1 = pm.Normal('beta_kal_sc_mu', mu=mu_1, sd=sigma_1, shape=n_subj)
        b_2 = pm.Normal('beta_kal_sc_std', mu=mu_2, sd=sigma_2, shape=n_subj)
        b_3 = pm.Normal('beta_sc', mu=mu_3, sd=sigma_3, shape=n_subj)

        rho = \
            tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_mu_kal_scrambled + \
            tt.tile(tt.reshape(b_2[subj_idx], (n, 1)), d) * x_sd_kal_scrambled + \
            tt.tile(tt.reshape(b_3[subj_idx], (n, 1)), d) * x_sc

        p_hat = softmax(rho)

        # Data likelihood
        yl = pm.Categorical('yl', p=p_hat, observed=y)

        # inference!
        trace_kal_scram = pm.sample(**sample_kwargs)

    return hier_kal_scrambled, trace_kal_scram
def test_softmax_optimizations():
    from theano.tensor.nnet.nnet import softmax, crossentropy_categorical_1hot

    x = tensor.fmatrix("x")
    one_of_n = tensor.lvector("one_of_n")
    op = crossentropy_categorical_1hot
    xe = op(x, one_of_n)
    fgraph = theano.gof.FunctionGraph([x, one_of_n],
                                      [op(softmax(x), one_of_n)])
    assert fgraph.outputs[0].owner.op == op

    mode_with_gpu.optimizer.optimize(fgraph)

    assert str(fgraph.outputs[0].owner.op) == "OutputGuard"
    assert fgraph.outputs[0].owner.inputs[0].owner.op == cuda.host_from_gpu
    assert (
        fgraph.outputs[0].owner.inputs[0].owner.inputs[0].owner.op
        == cuda.nnet.gpu_crossentropy_softmax_argmax_1hot_with_bias
    )
def __init__(self, rng, input, n_in, n_out, W=None, b=None, layer_index=0):
    self.layername = 'Softmax' + str(layer_index)
    self.input = input
    # W
    if W is None:
        W_bound = numpy.sqrt(6. / (n_in + n_out))
        self.W = theano.shared(numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=(n_in, n_out)),
            dtype=theano.config.floatX), borrow=True)
    else:
        self.W = theano.shared(value=W.astype(theano.config.floatX),
                               borrow=True)
    self.W.name = self.layername + '#W'
    # b
    if b is None:
        self.b = theano.shared(numpy.zeros((n_out,),
                                           dtype=theano.config.floatX),
                               borrow=True)
    else:
        self.b = theano.shared(value=b.astype(theano.config.floatX),
                               borrow=True)
    self.b.name = self.layername + '#b'

    output = relu(T.dot(input, self.W) + self.b)
    self.softmax_output = softmax(output)
    self.pred = self.softmax_output.argmax(axis=1)
    # store parameters of this layer
    self.params = [self.W, self.b]
def __theano_train__(self):
    """
    Run one pass over the training sequence (training phase).
    """
    # self.alpha_lambda = ['alpha', 'lambda']
    # user/item inputs
    uidxs = T.ivector()    # n users
    pqidxs = T.imatrix()   # (2, n). Row 0: n positive samples. Row 1: negative samples.
    cidxs = T.imatrix()    # (n, set_size)
    mask = T.ivector()     # mask for the current step: which users' actions are valid/invalid.
    urxs = self.ux[uidxs]  # shape=(n, d)
    xpqs = self.lx[pqidxs]  # shape=(2, n, d)
    cpts = self.lc[cidxs]   # shape=(n, set_size, d)
    cpqs = self.lc[pqidxs]  # shape=(2, n, d)
    actual_batch_size = mask.shape[0]

    # one item_set is fed per step; take the unique indices
    ncpqs = T.concatenate((cidxs, pqidxs.T), axis=1)  # concatenate first, shape=(n, set_size+2)
    uiq_cps = Unique(False, False, False)(ncpqs)      # deduplicate
    uiq_c = self.lc[uiq_cps]                          # the corresponding item features

    # weight matrices. [Note: uniform convention, weight * variable]
    lay = self.layer
    wru, wrc, wrl = self.wru, self.wrc, self.wrl   # resnet
    wa1, wa2, wa3 = self.wa1, self.wa2, self.wa3   # first-order attention
    wb1, wb2 = self.wb1, self.wb2                  # second-order attention
    """
    Given the positive/negative samples at time t, compute the current loss
    and update the user / positive / negative samples. The time index t is
    omitted in the formulas.
    # By the property that T.dot((n, ), (n, )) yields (1, 1):
        uij = user * (xp - xq)
        upq = log(sigmoid(uij))
    """
    # ==========================================================================
    # score 1
    uij_x = T.sum(urxs * (xpqs[0] - xpqs[1]), axis=1)  # shape=(n, )

    # ==========================================================================
    # score 2
    # layer-0 attention: obtain a (batch_size, d) attention vector.
    urx_emb = T.dot(wa2, urxs.T).T.dimshuffle(0, 'x', 1)  # shape=(batch_size, 1, d)
    e0 = T.dot(tanh(urx_emb + T.dot(cpts, wa3)), wa1)     # shape=(batch_size, set_size)
    a0 = softmax(e0)                                      # (batch_size, set_size)
    c0 = T.sum(cpts * a0.dimshuffle(0, 1, 'x'), axis=1)   # shape=(batch_size, d), broadcast

    # score 2: attention inside the ResNet
    def recurrence1(wrut, wrct, urx_pre1, cpt_pre1):
        # ResNet update
        ur_t = relu(T.dot(wrut, urx_pre1.T).T + urx_pre1)   # (batch_size, d)
        cp_t = relu(T.dot(cpt_pre1, wrct) + cpt_pre1)       # (batch_size, set_size, d)
        # attention: build the context vector
        ur_t_emb = T.dot(wa2, ur_t.T).T.dimshuffle(0, 'x', 1)
        e_t = T.dot(tanh(ur_t_emb + T.dot(cp_t, wa3)), wa1)  # shape=(batch_size, set_size)
        a_t = softmax(e_t)
        c_t = T.sum(cp_t * a_t.dimshuffle(0, 1, 'x'), axis=1)
        return [ur_t, cp_t, c_t]  # (batch_size, d), (batch_size, set_size, d), (batch_size, d)

    [urs, cps, cs], _ = theano.scan(  # cs.shape = (layer, batch_size, d)
        fn=recurrence1,
        sequences=[wru, wrc],
        outputs_info=[urxs, cpts, None],
        n_steps=lay,
        truncate_gradient=-1)

    # score 2: second-order attention
    c0 = c0.dimshuffle(0, 'x', 1)                  # (batch_size, 1, d)
    cs = cs.dimshuffle(1, 0, 2)                    # (batch_size, layer, d)
    context = T.concatenate((c0, cs), axis=1)      # (batch_size, layer+1, d)
    e1 = T.dot(tanh(T.dot(context, wb2)), wb1)     # shape=(batch_size, layer+1)
    a1 = softmax(e1)
    c1 = T.sum(context * a1.dimshuffle(0, 1, 'x'), axis=1)  # shape=(batch_size, d)
    # score 2
    uij_c = T.sum(c1 * (cpqs[0] - cpqs[1]), axis=1)  # shape=(n, )

    # ==========================================================================
    # score 3
    # run a fresh ResNet on top of the ResNet output c1
    def recurrence2(wrlt, h_pre1):
        # ResNet update
        hl_t = relu(T.dot(wrlt, h_pre1.T).T + h_pre1)  # shape=(batch_size, d)
        return hl_t

    hls, _ = theano.scan(
        fn=recurrence2,
        sequences=wrl,
        outputs_info=c1,
        n_steps=lay,
        truncate_gradient=-1)
    # score 3
    uij_l = T.sum(hls[-1] * (cpqs[0] - cpqs[1]), axis=1)  # shape=(n, )

    # ==========================================================================
    # total score
    loss = T.log(sigmoid(uij_x + uij_c + uij_l))  # shape=(n, )
    # loss *= mask   # it is enough to multiply the loss by the 0/1 mask vector here

    # --------------------------------------------------------------------------
    # cost, gradients, learning rate, L2 regularization
    lr, l2 = self.alpha_lambda[0], self.alpha_lambda[1]
    l2_sqr = T.sum([T.sum(par ** 2) for par in
                    [urxs, xpqs, cpts, cpqs, wru, wrc, wrl,
                     wa1, wa2, wa3, wb1, wb2]])
    upq = T.sum(loss) / actual_batch_size
    costs = -upq + 0.5 * l2 * l2_sqr
    # self.params
    grads = T.grad(costs, self.params)
    updates = [(par, par - lr * gra) for par, gra in zip(self.params, grads)]
    # One user, two items: this style of update is the fastest to
    # differentiate. Differentiate with respect to the subtensor directly,
    # not the full parameter.
    subs_pars_idxs = [[urxs, self.ux, uidxs],
                      [xpqs, self.lx, pqidxs],
                      [uiq_c, self.lc, uiq_cps]]
    tmp = [(par, T.set_subtensor(sub, sub - lr * T.grad(costs, par)[idx]))
           for sub, par, idx in subs_pars_idxs]
    updates.extend(tmp)
    # --------------------------------------------------------------------------
    # Given the users, positive/negative samples and the other inputs,
    # update the variables and return the loss.
    self.train = theano.function(
        inputs=[uidxs, pqidxs, cidxs, mask],
        outputs=-upq,
        updates=updates,
        on_unused_input='warning')
def __theano_predict__(self):
    """
    At test time, run over the training sequence again to obtain the hidden
    layers. Use the full data to compute all users' representations in one pass.
    """
    # weight matrices. [Note: uniform convention, weight * variable]
    lay = self.layer
    wru, wrc, wrl = self.wru, self.wrc, self.wrl   # resnet
    wa1, wa2, wa3 = self.wa1, self.wa2, self.wa3   # first-order attention
    wb1, wb2 = self.wb1, self.wb2                  # second-order attention

    # data is supplied via givens
    start_end = T.ivector()
    tra_mask = T.imatrix()   # shape=(n, 157)
    actual_batch_size = tra_mask.shape[0]

    # user vector
    urxs = T.fmatrix()       # shape=(batch_size, d)
    cps_idxs = T.itensor3()  # shape=(batch_size, each user's sequence of item sets)
    cpt_idxs = cps_idxs[     # shape=(batch_size, set_size)
        T.arange(actual_batch_size),   # fancy indexing: take the last set of item_idxs in each user's sequence.
        T.sum(tra_mask, axis=1) - 1]
    # item vectors (one set per user)
    cpts = self.lc[cpt_idxs]  # shape=(batch_size, set_size, d)

    # ==========================================================================
    # score 2
    # layer-0 attention: obtain a (batch_size, d) attention vector.
    urx_emb = T.dot(wa2, urxs.T).T.dimshuffle(0, 'x', 1)  # shape=(batch_size, 1, d)
    e0 = T.dot(tanh(urx_emb + T.dot(cpts, wa3)), wa1)     # shape=(batch_size, set_size)
    a0 = softmax(e0)                                      # (batch_size, set_size)
    c0 = T.sum(cpts * a0.dimshuffle(0, 1, 'x'), axis=1)   # shape=(batch_size, d), broadcast

    # score 2: attention inside the ResNet
    def recurrence1(wrut, wrct, urx_pre1, cpt_pre1):
        # ResNet update
        ur_t = relu(T.dot(wrut, urx_pre1.T).T + urx_pre1)   # (batch_size, d)
        cp_t = relu(T.dot(cpt_pre1, wrct) + cpt_pre1)       # (batch_size, set_size, d)
        # attention: build the context vector
        ur_t_emb = T.dot(wa2, ur_t.T).T.dimshuffle(0, 'x', 1)
        e_t = T.dot(tanh(ur_t_emb + T.dot(cp_t, wa3)), wa1)  # shape=(batch_size, set_size)
        a_t = softmax(e_t)
        c_t = T.sum(cp_t * a_t.dimshuffle(0, 1, 'x'), axis=1)
        return [ur_t, cp_t, c_t]  # (batch_size, d), (batch_size, set_size, d), (batch_size, d)

    [urs, cps, cs], _ = theano.scan(  # cs.shape = (layer, batch_size, d)
        fn=recurrence1,
        sequences=[wru, wrc],
        outputs_info=[urxs, cpts, None],
        n_steps=lay,
        truncate_gradient=-1)

    # score 2: second-order attention
    c0 = c0.dimshuffle(0, 'x', 1)                 # (batch_size, 1, d)
    cs = cs.dimshuffle(1, 0, 2)                   # (batch_size, layer, d)
    context = T.concatenate((c0, cs), axis=1)     # (batch_size, layer+1, d)
    e1 = T.dot(tanh(T.dot(context, wb2)), wb1)    # shape=(batch_size, layer+1)
    a1 = softmax(e1)
    c1 = T.sum(context * a1.dimshuffle(0, 1, 'x'), axis=1)  # shape=(batch_size, d)

    # ==========================================================================
    # score 3
    # run a fresh ResNet on top of the ResNet output c1
    def recurrence2(wrlt, h_pre1):
        # ResNet update
        hl_t = relu(T.dot(wrlt, h_pre1.T).T + h_pre1)  # shape=(batch_size, d)
        return hl_t

    hls, _ = theano.scan(
        fn=recurrence2,
        sequences=wrl,
        outputs_info=c1,
        n_steps=lay,
        truncate_gradient=-1)

    # ==========================================================================
    # the final overall user vectors: the parts computed via the ResNets
    usr_vec_c = c1
    usr_vec_l = hls[-1]

    self.seq_predict = theano.function(
        inputs=[start_end],
        outputs=[usr_vec_c, usr_vec_l],  # shape=(batch_size, d)
        givens={
            urxs: self.trained_usr_x[start_end],  # shape=(batch_size, d)
            tra_mask: self.tra_masks[start_end],
            cps_idxs: self.tra_set_masks[start_end]})
def exp_shifted_mKalman(sample_kwargs=None):
    clustering_data = pd.read_pickle(
        'Data/exp_shifted/exp_shifted_clustering_means_std.pkl')
    clustering_data.index = range(len(clustering_data))

    lin_gp_data = pd.read_csv('Data/exp_shifted/gplinshifted.csv')
    lin_gp_data.index = range(len(lin_gp_data))

    rbf_gp_data = pd.read_csv('Data/exp_shifted/gprbfshifted.csv')
    rbf_gp_data.index = range(len(rbf_gp_data))

    kalman_data = pd.read_pickle('Data/exp_shifted/kalmanshifted.pkl')
    kalman_data.index = range(len(kalman_data))

    bayes_gp_data = pd.read_pickle('Data/exp_shifted/bayes_gp_exp_shifted.pkl')
    bayes_gp_data.index = range(len(bayes_gp_data))

    raw_data = pd.read_csv('Data/exp_shifted/datashifted_withoffset.csv',
                           header=0)

    # the GP-RBF can fail if a subject always chooses the same response.
    # For simplicity, we are dropping those subjects.
    subjects_to_drop = set()
    for s in set(raw_data.id):
        if s not in set(rbf_gp_data.id):
            subjects_to_drop.add(s)

    for s in subjects_to_drop:
        clustering_data = clustering_data[clustering_data['Subject'] != s].copy()
        lin_gp_data = lin_gp_data[lin_gp_data.id != s].copy()
        raw_data = raw_data[raw_data.id != s].copy()
        kalman_data = kalman_data[kalman_data.Subject != s].copy()
        bayes_gp_data = bayes_gp_data[bayes_gp_data['Subject'] != s].copy()

    # construct a sticky choice predictor. This is the same for all of the models
    x_sc = construct_sticky_choice(raw_data)

    # PYMC3 doesn't care about the actual subject numbers, so remap these to
    # a sequential list
    subj_idx = construct_subj_idx(lin_gp_data)
    n_subj = len(set(subj_idx))

    intercept = raw_data['int'].values

    # prep the predictor vectors
    x_mu_cls = np.array(
        [clustering_data.loc[:, 'mu_%d' % ii].values for ii in range(8)]).T
    x_sd_cls = np.array(
        [clustering_data.loc[:, 'std_%d' % ii].values for ii in range(8)]).T

    x_mu_bayes_gp = np.array(
        [bayes_gp_data.loc[:, 'mu_%d' % ii].values for ii in range(8)]).T
    x_sd_bayes_gp = np.array(
        [bayes_gp_data.loc[:, 'std_%d' % ii].values for ii in range(8)]).T

    x_mu_lin = np.array(
        [lin_gp_data.loc[:, 'mu_%d' % ii].values + intercept for ii in range(8)]).T
    x_sd_lin = np.array(
        [lin_gp_data.loc[:, 'std_%d' % ii].values for ii in range(8)]).T

    x_mu_rbf = np.array(
        [rbf_gp_data.loc[:, 'mu_%d' % ii].values + intercept for ii in range(8)]).T
    x_sd_rbf = np.array(
        [rbf_gp_data.loc[:, 'std_%d' % ii].values for ii in range(8)]).T

    x_mu_kal = np.array(
        [kalman_data.loc[:, 'mu_%d' % ii].values + intercept for ii in range(8)]).T
    x_sd_kal = np.array(
        [kalman_data.loc[:, 'std_%d' % ii].values for ii in range(8)]).T

    y = raw_data['arm'].values - 1  # convert to 0 indexing

    n, d = x_mu_kal.shape

    if sample_kwargs is None:
        sample_kwargs = dict(draws=2000, njobs=2, tune=2000,
                             init='advi+adapt_diag')

    with pm.Model() as hier_kal:
        mu_1 = pm.Normal('mu_beta_kal_mean', mu=0., sd=100.)
        mu_2 = pm.Normal('mu_beta_kal_stdv', mu=0., sd=100.)
        mu_3 = pm.Normal('mu_beta_stick', mu=0., sd=100.)

        sigma_1 = pm.HalfCauchy('sigma_rbf_means', beta=100)
        sigma_2 = pm.HalfCauchy('sigma_rbf_stdev', beta=100)
        sigma_3 = pm.HalfCauchy('sigma_stick', beta=100)

        b_1 = pm.Normal('beta_rbf_mu', mu=mu_1, sd=sigma_1, shape=n_subj)
        b_2 = pm.Normal('beta_rbf_std', mu=mu_2, sd=sigma_2, shape=n_subj)
        b_3 = pm.Normal('beta_sc', mu=mu_3, sd=sigma_3, shape=n_subj)

        rho = \
            tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_mu_kal + \
            tt.tile(tt.reshape(b_2[subj_idx], (n, 1)), d) * x_sd_kal + \
            tt.tile(tt.reshape(b_3[subj_idx], (n, 1)), d) * x_sc

        p_hat = softmax(rho)

        # Data likelihood
        yl = pm.Categorical('yl', p=p_hat, observed=y)

        # inference!
        trace_kal = pm.sample(**sample_kwargs)

    # posterior predictive checks: simulate choices and save each draw
    ppc = pm.sample_ppc(trace_kal, samples=500, model=hier_kal)
    for ii in range(500):
        sim_draws = raw_data.copy()
        sim_draws['arm_sim'] = ppc['yl'][ii, :] + 1
        sim_draws.to_pickle('./Data/PPC/exp_shifted/sim_kal_%d.pkl' % ii)
def forward(x_t, h_tm1, Wx, Wh, bh, am, ax, ah, Wy, by):
    # vanilla RNN step: affine pre-activation, nonlinearity, softmax readout
    # (am, ax, ah are unused in this additive variant; they belong to the
    # multiplicative variant above, which shares the same signature)
    preact = T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh
    h_t = act(preact)
    y_t = softmax(T.dot(h_t, Wy) + by)
    return h_t, y_t, preact
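# A minimal self-contained sketch (not from the original code) of how step
# functions like the `forward` definitions above are plugged into theano.scan.
# All sizes and initial values below are made up for illustration.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import softmax

n_in, n_h, n_out = 5, 8, 3
rng = np.random.RandomState(0)

def shared(*shape):
    # small random shared weight of the given shape
    return theano.shared(rng.randn(*shape).astype(theano.config.floatX))

Wx, Wh, Wy = shared(n_in, n_h), shared(n_h, n_h), shared(n_h, n_out)
bh, by = shared(n_h), shared(n_out)

def step(x_t, h_tm1):
    # same shape logic as the `forward` functions above
    h_t = T.tanh(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh)
    y_t = softmax(T.dot(h_t, Wy) + by)
    return h_t, y_t

x_seq = T.tensor3('x_seq')                 # (n_steps, batch, n_in)
h0 = T.zeros((x_seq.shape[1], n_h))        # initial hidden state
# only h_t feeds back into the next step, so y gets outputs_info=None
(h_seq, y_seq), _ = theano.scan(step, sequences=x_seq, outputs_info=[h0, None])
f = theano.function([x_seq], y_seq)
print(f(rng.randn(7, 2, n_in).astype(theano.config.floatX)).shape)  # (7, 2, 3)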
# standardize the x's
x_s = (x_s - x_s.mean(0)) / x_s.std(0)

# get the number of groups
groups_number = len(np.unique(iris["species"]))

# --------------- specify the probabilistic model ------------------------- #
with pm.Model() as softmax_model:
    # one category is pinned to zero below, so only groups_number - 1 free
    # intercepts and coefficient columns are needed (identifiability)
    alpha = pm.Normal("alpha", mu=0, sd=10, shape=groups_number - 1)
    beta = pm.Normal("beta", mu=0, sd=10,
                     shape=(x_s.shape[1], groups_number - 1))
    alpha_f = tt.concatenate([[0], alpha])
    beta_f = tt.concatenate([np.zeros((x_s.shape[1], 1)), beta], axis=1)

    # get the mu
    mu = pm.Deterministic("mu", alpha_f + pm.math.dot(x_s, beta_f))

    # apply the softmax function to the mu
    theta = softmax(mu)

    # specify the likelihood of the data
    y_obs = pm.Categorical("y_obs", p=theta, observed=y_s)

    # inference step
    trace = pm.sample()

# -------------- check how many cases are classified correctly ----------- #
data_pred = trace["mu"].mean(0)
log.info("The data pred is: %s", data_pred)

y_pred = [np.exp(point) / np.sum(np.exp(point), axis=0)
          for point in data_pred]
print(f'{np.sum(y_s == np.argmax(y_pred, axis=1)) / len(y_s):.2f}')
def sample_hier_rbf(model_matrix, sample_kwargs=None):
    # load the data
    x_mu_rbf = model_matrix['x_mu_rbf']
    x_sd_rbf = model_matrix['x_sd_rbf']
    x_sc = model_matrix['x_sc']
    subj_idx = model_matrix['subj_idx']
    y = model_matrix['y']
    n_subj = model_matrix['n_subj']

    # fit the first model
    n, d = x_mu_rbf.shape

    if sample_kwargs is None:
        # Here, we specify NUTS as our sampler (implicitly, this is the
        # default) and use variational inference to initialize.
        sample_kwargs = dict(draws=2000, njobs=2, tune=2000,
                             init='advi+adapt_diag')

    # To do inference, all we have to do is write down the model in our
    # probabilistic programming language (PYMC3) and the software will do
    # inference over it. We can control how this happens (e.g. with Gibbs
    # sampling, MCMC, or variational inference), but PYMC3 defaults to
    # Hamiltonian MCMC with the No-U-Turn Sampler ("NUTS").
    with pm.Model() as hier_rbf:
        # here, we write down the model

        # Define hierarchical parameters (normal means and standard
        # deviations for the regression weights)
        mu_1 = pm.Normal('mu_beta_rbf_mean', mu=0., sd=100.)
        mu_2 = pm.Normal('mu_beta_rbf_stdv', mu=0., sd=100.)
        mu_3 = pm.Normal('mu_beta_stick', mu=0., sd=100.)

        sigma_1 = pm.HalfCauchy('sigma_rbf_means', beta=100)
        sigma_2 = pm.HalfCauchy('sigma_rbf_stdev', beta=100)
        sigma_3 = pm.HalfCauchy('sigma_stick', beta=100)

        # define subject-level predictor variables (i.e. regression
        # parameters, one per subject per condition, with a hierarchical prior)
        b_1 = pm.Normal('beta_rbf_mu', mu=mu_1, sd=sigma_1, shape=n_subj)
        b_2 = pm.Normal('beta_rbf_std', mu=mu_2, sd=sigma_2, shape=n_subj)
        b_3 = pm.Normal('beta_sc', mu=mu_3, sd=sigma_3, shape=n_subj)

        # Linearly combine the predictors with the subject-specific
        # coefficients as scaling factors. In practice, the coefficients have
        # to be broadcast into an NxD matrix via theano for element-wise
        # multiplication.
        rho = \
            tt.tile(tt.reshape(b_1[subj_idx], (n, 1)), d) * x_mu_rbf + \
            tt.tile(tt.reshape(b_2[subj_idx], (n, 1)), d) * x_sd_rbf + \
            tt.tile(tt.reshape(b_3[subj_idx], (n, 1)), d) * x_sc

        # Pass the resulting matrix through a softmax to convert it to a
        # probability distribution. Note that we don't need an additional
        # noise parameter, as it would be collinear with the coefficients.
        p_hat = softmax(rho)

        # Data likelihood
        yl = pm.Categorical('yl', p=p_hat, observed=y)

        # inference!
        trace_rbf = pm.sample(**sample_kwargs)

    return hier_rbf, trace_rbf
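# A hypothetical usage sketch (names, shapes, and the random data are assumed,
# not from the original code): assemble the model_matrix dict that
# sample_hier_rbf and the related samplers above expect, then fit and
# summarize the trace with the PyMC3 API of the same era.
import numpy as np
import pymc3 as pm

n, d, n_subj = 300, 8, 10
model_matrix = dict(
    x_mu_rbf=np.random.randn(n, d),          # per-option GP-RBF means
    x_sd_rbf=np.abs(np.random.randn(n, d)),  # per-option GP-RBF stdevs
    x_sc=np.random.randint(0, 2, (n, d)),    # sticky-choice indicator
    subj_idx=np.repeat(np.arange(n_subj), n // n_subj),
    y=np.random.randint(0, d, n),            # observed choices, 0-indexed
    n_subj=n_subj,
)

model, trace = sample_hier_rbf(model_matrix,
                               sample_kwargs=dict(draws=500, tune=500))
print(pm.summary(trace, varnames=['mu_beta_rbf_mean', 'mu_beta_stick']))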