def prep_student(dataset, verbose, alpha, temperature):
    # if the student is not saved beforehand, train and save it
    model_path = '{}/models/student/'.format(dataset)
    if not isfile(model_path + 'model/model.h5') or not isfile(model_path + 'model/model.json'):
        print('Student model does not exist, training it...')
        # load dataset and teacher logits
        _dataset = get_data(dataset)
        x_train, y_train, x_test, y_test = _dataset.get_data()
        train_features = np.load('{}/models/teacher/npy/train_logits.npy'.format(dataset))
        test_features = np.load('{}/models/teacher/npy/test_logits.npy'.format(dataset))
        # temperature-normalized output, shape = (num_samples, num_classes)
        y_train_soft = softmax(train_features / temperature)
        y_test_soft = softmax(test_features / temperature)
        # concatenated output labels, shape = (num_samples, 2 * num_classes)
        y_train_new = np.concatenate([y_train, y_train_soft], axis=1)
        y_test_new = np.concatenate([y_test, y_test_soft], axis=1)
        # build student model
        student = get_model(_dataset, 'distillation', is_dropout=True)
        # remove softmax to get at the features
        student.layers.pop()
        logits = student.layers[-1].output
        # normal softmax output
        probs = Activation('softmax')(logits)
        # softmax output with temperature
        logits_T = Lambda(lambda x: x / temperature)(logits)
        probs_T = Activation('softmax')(logits_T)
        # concatenate both outputs
        output = concatenate([probs, probs_T])
        # this is our new student model
        student = Model(student.input, output)
        compile_model(student,
                      loss=distillation_loss(_dataset.num_classes, alpha),
                      metrics=[acc_distillation(_dataset.num_classes)])
        # create a new dataset with the generated labels
        dataset_s = DatasetCls(x_train, y_train_new, x_test, y_test_new,
                               dataset_name=dataset)
        # train student
        student = train(dataset_s, student,
                        PARAMS[dataset]['epochs'] * 2,
                        PARAMS[dataset]['batch_size'],
                        log_dir=model_path,
                        callbacks=[
                            early_stop(patience=PARAMS[dataset]['patience'],
                                       monitor='val_loss',
                                       verbose=verbose)
                        ],
                        verbose=verbose)
        # save output files
        save_model_outputs(student, _dataset, model_path)
    K.clear_session()
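# Hedged standalone sketch (not part of prep_student): how the temperature-scaled
# softmax used above behaves. Assumes only numpy; `logits` is a made-up example.
import numpy as np

def softmax_demo(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

logits = np.array([[2.0, 1.0, 0.1]])
for T in (1.0, 5.0):
    # A higher temperature flattens the distribution, which is what exposes the
    # teacher's relative confidence on the non-target classes to the student.
    print(T, softmax_demo(logits / T))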
def sample_s(self, ps):
    """ps = (updated) prior for s"""
    intercept = self.clf_vera.intercept_
    w = self.clf_vera.coef_.T
    res = np.ones((self.n, 3))
    for claim, stance, source in self.d:
        self.f[claim, source] -= (self.s[stance] - 1)
        x = intercept + self.f[claim].dot(w)  # if sparse matrix, need to take first row
        #x -= (self.s[stance] - 1) * w[source]
        #
        # res = prob of V given each value of S
        res[stance, 0] = util.softmax(x - w[source])[int(self.v[claim])]
        res[stance, 1] = util.softmax(x)[int(self.v[claim])]
        res[stance, 2] = util.softmax(x + w[source])[int(self.v[claim])]
        prob = res[stance] * ps[stance]
        prob = prob / np.sum(prob)
        # sample new S
        self.s[stance] = self.rs.choice(range(3), p=prob)
        # replace S
        self.f[claim, source] += (self.s[stance] - 1)
    return res
def process(results, truth): K = len(results['struct'][0]) probs1 = results['count'] / np.sum(results['count']) probs2 = util.softmax(results['llh']) probs3 = util.softmax(results['llh'] + np.log(results['count'])) # Verify that the means by which we compute posterior probabilities in the # results files hasn't changed. (Even if it has, we don't use # `results['prob']` in this file, so it should be fine.) assert np.allclose(probs3, results['prob']) pard = compute_parent_dist(results['struct'], probs3) parentropy, _, _ = compute_parentropy(pard) truth_num = len(truth['struct']) truth_llh = np.zeros(truth_num) truth_probs = np.ones(truth_num) / truth_num truth_pard = compute_parent_dist(truth['struct'], truth_probs) truth_parentropy, _, _ = compute_parentropy(truth_pard) assert np.allclose(truth_probs, truth['prob']) top_probs = 10 good_thresh = 1e-3 jsd_parents = compute_parents_jsd(pard, truth_pard) jsd_parents_phi_mean = jsd_parents * np.mean(truth['phi'][0, 1:], axis=1) jsd_parents_phi_max = jsd_parents * np.max(truth['phi'][0, 1:], axis=1) stats = {} stats['true_trees'] = truth_num stats['sampled_unique_trees'] = len(probs3) stats['num_good'] = np.sum(probs3 >= good_thresh) stats['prop_good'] = '%.3f' % (np.sum(probs3 >= good_thresh) / len(probs3)) stats['H_trees_truth'] = calc_entropy(truth_probs) stats['H_trees_pairtree_1'] = calc_entropy(probs1) stats['H_trees_pairtree_2'] = calc_entropy(probs2) stats['H_trees_pairtree_3'] = calc_entropy(probs3) stats['H_parents_truth'] = truth_parentropy stats['H_parents_pairtree'] = parentropy stats['prop_truth_recovered'], stats['jaccard'] = compute_indices( results['struct'], truth['struct']) stats['jsd_trees'] = compute_tree_jsd(results['struct'], probs3, truth['struct'], truth_probs) stats['jsd_parents_sum'] = np.sum(jsd_parents) stats['jsd_parents_mean'] = np.sum(jsd_parents) / K stats['jsd_parents_max'] = np.max(jsd_parents) stats['jsd_parents'] = jsd_parents stats['jsd_parents_phi_mean'] = np.max(jsd_parents_phi_mean) stats['jsd_parents_phi_max'] = np.max(jsd_parents_phi_max) stats['jsd_parents_phi_mean_top10'] = make_sorted(jsd_parents_phi_mean) stats['jsd_parents_phi_max_top10'] = make_sorted(jsd_parents_phi_max) stats['top_probs_1_top10'] = make_sorted(probs1) stats['top_probs_2_top10'] = make_sorted(probs2) stats['top_probs_3_top10'] = make_sorted(probs3) keys = list(stats.keys()) vals = [stats[key] for key in keys] for A in (keys, vals): print(*A, sep=',')
def inference(documents, doc_mask, query, query_mask): embedding = tf.get_variable('embedding', [FLAGS.vocab_size, FLAGS.embedding_size], initializer=tf.random_uniform_initializer(minval=-0.05, maxval=0.05)) regularizer = tf.nn.l2_loss(embedding) doc_emb = tf.nn.dropout(tf.nn.embedding_lookup( embedding, documents), FLAGS.dropout_keep_prob) doc_emb.set_shape([None, None, FLAGS.embedding_size]) query_emb = tf.nn.dropout(tf.nn.embedding_lookup( embedding, query), FLAGS.dropout_keep_prob) query_emb.set_shape([None, None, FLAGS.embedding_size]) with tf.variable_scope('document', initializer=orthogonal_initializer()): fwd_cell = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size) back_cell = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size) doc_len = tf.reduce_sum(doc_mask, reduction_indices=1) h, _ = tf.nn.bidirectional_dynamic_rnn( fwd_cell, back_cell, doc_emb, sequence_length=tf.to_int64(doc_len), dtype=tf.float32) h_doc = tf.concat(2, h) with tf.variable_scope('query', initializer=orthogonal_initializer()): fwd_cell = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size) back_cell = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size) query_len = tf.reduce_sum(query_mask, reduction_indices=1) h, _ = tf.nn.bidirectional_dynamic_rnn( fwd_cell, back_cell, query_emb, sequence_length=tf.to_int64(query_len), dtype=tf.float32) h_query = tf.concat(2, h) M = tf.batch_matmul(h_doc, h_query, adj_y=True) M_mask = tf.to_float(tf.batch_matmul(tf.expand_dims( doc_mask, -1), tf.expand_dims(query_mask, 1))) alpha = softmax(M, 1, M_mask) beta = softmax(M, 2, M_mask) query_importance = tf.expand_dims(tf.reduce_sum( beta, 1) / tf.to_float(tf.expand_dims(doc_len, -1)), -1) s = tf.squeeze(tf.batch_matmul(alpha, query_importance), [2]) unpacked_s = zip(tf.unpack(s, FLAGS.batch_size), tf.unpack(documents, FLAGS.batch_size)) y_hat = tf.pack([ tf.unsorted_segment_sum(attentions, sentence_ids, FLAGS.vocab_size) for (attentions, sentence_ids) in unpacked_s ]) return y_hat, regularizer
def main():
    weight = np.random.rand(28 * 28, 10)
    bias = np.random.rand(10)
    pos_cnt = 0
    total_cnt = 0
    batch_size = 2
    for _ in range(10):
        for feature, label in load_train_data_batch(batch_size):
            feature = feature / 256.0
            feature = feature.reshape((batch_size, 28 * 28))
            pred = np.matmul(feature, weight)
            pred = pred + bias
            pred = softmax(pred.T).T
            pred_y = np.argmax(pred, axis=1)
            pos_cnt += np.sum(np.equal(pred_y, label))
            total_cnt += label.shape[0]
            print(pos_cnt, total_cnt, 1.0 * pos_cnt / total_cnt)
def count_clusters(results):
    tidxs = np.array(sorted(results.tree_summary.keys()))
    llhs = np.array([results.tree_summary[tidx]['llh'] for tidx in tidxs])
    probs = util.softmax(llhs)
    clusters = np.array([len(results.tree_summary[tidx]['populations']) for tidx in tidxs]) - 1
    expected_clusters = np.sum(probs * clusters)
    return expected_clusters
def fit(self, sess, train, dev_data_np, dev_seqlen, dev_labels):
    losses_epochs = []
    dev_performances_epochs = []
    dev_predictions_epochs = []
    dev_predicted_classes_epochs = []

    for epoch in range(self.config.n_epochs):
        print("Epoch %d out of %d" % (epoch + 1, self.config.n_epochs))
        loss = self.run_epoch(sess, train)

        # Computing predictions
        dev_predictions = self.predict_on_batch(sess, dev_data_np, dev_seqlen)

        # Computing development performance
        dev_predictions = softmax(np.array(dev_predictions))
        dev_predicted_classes = np.argmax(dev_predictions, axis=1)
        dev_performance = get_performance(dev_predicted_classes, dev_labels, n_classes=4)

        # Adding to global outputs
        dev_predictions_epochs.append(dev_predictions)
        dev_predicted_classes_epochs.append(dev_predicted_classes)
        dev_performances_epochs.append(dev_performance)
        losses_epochs.append(loss)

    return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs
def __forward__(self, x):
    """
    Implements forward propagation for one sample at a time.

    Parameters:
        x : numpy array (contains one sample of features)

    Returns:
        zs          : list (numpy arrays, each corresponding to sum(xW + b) of the respective layer)
        activations : list (numpy arrays, each corresponding to the output of the respective layer)
    """
    # demo shapes
    l0 = x.T                                                    # [1, 784]
    z1 = np.dot(l0, self.weights['l1'].T) + self.biases['l1']   # [1, 300] = [1, 784] . [784, 300] + [1, 300]
    l1 = util.relu(z1)                                          # [1, 300]
    z2 = np.dot(l1, self.weights['l2'].T) + self.biases['l2']   # [1, 90]  = [1, 300] . [300, 90] + [1, 90]
    l2 = util.relu(z2)                                          # [1, 90]
    z3 = np.dot(l2, self.weights['l3'].T) + self.biases['l3']   # [1, 10]  = [1, 90] . [90, 10] + [1, 10]
    l3 = util.softmax(z3)                                       # [1, 10]
    zs = [z1, z2, z3]
    activations = [l0, l1, l2, l3]
    return zs, activations
def gradient(self, x, t):
    # fetch the network parameters from the parameter dictionary
    w1, b1 = self.params['W1'], self.params['b1']
    w2, b2 = self.params['W2'], self.params['b2']
    # store the gradients
    grads = {}

    # forward
    a1 = np.dot(x, w1) + b1
    h1 = sigmoid(a1)
    a2 = np.dot(h1, w2) + b2
    output = softmax(a2)

    # backward
    dy = (output - t) / x.shape[0]
    grads['W2'] = np.dot(h1.T, dy)
    grads['b2'] = np.sum(dy, axis=0)
    """
    Why is grads['b2'] = np.sum(dy, axis=0) a sum?
    - The bias has as many entries as the output layer and does not depend on the number of samples.
      In the forward pass the bias vector is added to every sample, so its gradient is simply the sum
      of the per-sample errors:
          sample 1 contributes error dy1, sample 2 contributes dy2, ...
          total error attributable to b: dy1 + dy2 + ...
    """
    da1 = np.dot(dy, w2.T)
    ha1 = sigmoid(a1)
    dz1 = (1.0 - ha1) * ha1 * da1
    grads['W1'] = np.dot(x.T, dz1)
    grads['b1'] = np.sum(dz1, axis=0)

    return grads
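# Hedged numeric check (not from the original class) of the point made above: the bias
# is broadcast onto every row of the batch, so its gradient is the column sum of dy.
import numpy as np

rng = np.random.default_rng(0)
dy = rng.normal(size=(4, 3))                  # pretend upstream gradient, batch of 4
grad_b = np.sum(dy, axis=0)                   # what the code above computes
grad_b_loop = sum(dy[i] for i in range(4))    # explicit per-sample accumulation
assert np.allclose(grad_b, grad_b_loop)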
def _scaled_softmax(A, R=100):
    # Ensures `max(softmax(A)) / min(softmax(A)) <= R`.
    #
    # Typically, I use this as a "softer softmax", ensuring that the largest
    # element in the softmax is at most 100x the magnitude of the smallest.
    # Otherwise, given large differences between the minimum and maximum values,
    # the softmax becomes even more sharply peaked, with one element absorbing
    # effectively all mass.
    noninf = np.logical_not(np.isinf(A))
    if np.sum(noninf) == 0:
        return util.softmax(A)
    delta = np.max(A[noninf]) - np.min(A[noninf])
    if np.isclose(0, delta):
        return util.softmax(A)
    B = min(1, np.log(R) / delta)
    return util.softmax(B * A)
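# Hedged standalone check (numpy only, plain softmax) that scaling the logits by
# B = log(R) / (max - min) caps the ratio between the largest and smallest softmax
# entries at R, which is the property `_scaled_softmax` above relies on.
import numpy as np

def _plain_softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

A = np.array([0.0, 3.0, 40.0])
R = 100
B = min(1, np.log(R) / (A.max() - A.min()))
p = _plain_softmax(B * A)
assert p.max() / p.min() <= R + 1e-9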
def segment(self, pic, ohl, name): with tf.variable_scope('Segmentation'): out_seg = self.segmenter.net(pic) weight_non_nod = tf.constant([[0.05]]) class_weighting = tf.concat( [weight_non_nod, tf.ones(shape=[self.channels - 1, 1])], axis=0) location_weight = tf.tensordot(ohl, class_weighting, axes=[[3], [0]]) raw_ce = tf.nn.softmax_cross_entropy_with_logits(labels=ohl, logits=out_seg) weighted_ce = tf.multiply(tf.expand_dims(raw_ce, axis=3), location_weight) ce = tf.reduce_mean(weighted_ce) # visualization of segmentation seg = self.vis_seg(ohl) seg_net = self.vis_seg(ut.softmax(out_seg)) # the tensorboard logging with tf.name_scope(name): self.sum_seg.append(tf.summary.image('Image', pic, max_outputs=2)) self.sum_seg.append( tf.summary.image('Annotation', seg, max_outputs=2)) self.sum_seg.append( tf.summary.image('Segmentation', seg_net, max_outputs=2)) self.sum_seg.append( tf.summary.image('Weight_map', location_weight, max_outputs=2)) return ce
def call(self, inputs):
    """Following the routing algorithm from Hinton's paper, but replace
    b = b + <u,v> with b = <u,v>.

    This change can improve the feature representation of the Capsule.

    However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to realize a standard routing.
    """
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    for i in range(self.routings):
        c = util.softmax(b, 1)
        # Compute the capsule outputs before any backend-specific reduction,
        # so that `o` is defined on the first routing iteration.
        o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
        if K.backend() == 'theano':
            o = K.sum(o, axis=1)
        if i < self.routings - 1:
            b += K.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o
def fit(self, sess, h_np, b_np, h_len, b_len, y, dev_h, dev_b, dev_h_len, dev_b_len, dev_y): losses_epochs = [] dev_performances_epochs = [] dev_predictions_epochs = [] dev_predicted_classes_epochs = [] for epoch in range(self.config.n_epochs): print('-------new epoch---------') loss = self.run_epoch(sess, h_np, b_np, h_len, b_len, y) # Computing predictions dev_predictions = self.predict_on_batch(sess, dev_h, dev_b, dev_h_len, dev_b_len) # Computing development performance dev_predictions = softmax(np.array(dev_predictions)) dev_predicted_classes = np.argmax(dev_predictions, axis=1) dev_performance = get_performance(dev_predicted_classes, dev_y, n_classes=4) # Adding to global outputs dev_predictions_epochs.append(dev_predictions) dev_predicted_classes_epochs.append(dev_predicted_classes) dev_performances_epochs.append(dev_performance) losses_epochs.append(loss) print('EPOCH: ', epoch, ', LOSS: ', np.mean(loss)) return losses_epochs, dev_performances_epochs, dev_predicted_classes_epochs, dev_predictions_epochs
def make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings):
    '''
    Relative to `make_mutrel_from_trees_and_single_clustering`, this function is
    slower and more memory intensive, but also more flexible. It differs in two
    respects:

    1. It doesn't assume that the user has already computed counts for all unique
       samples -- i.e., it allows duplicate samples.

    2. It allows unique clusterings for every sample.
    '''
    assert len(structs) == len(llhs) == len(clusterings)
    weights = util.softmax(llhs)

    vids = None
    for struct, clustering, weight in zip(structs, clusterings, weights):
        adjm = util.convert_parents_to_adjmatrix(struct)
        mrel = make_mutrel_from_cluster_adj(adjm, clustering)
        if vids is None:
            vids = mrel.vids
            soft_mutrel = np.zeros(mrel.rels.shape)
        else:
            assert mrel.vids == vids
        soft_mutrel += weight * mrel.rels

    soft_mutrel = fix_rounding_errors(soft_mutrel)
    return mutrel.Mutrel(
        vids=vids,
        rels=soft_mutrel,
    )
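# Hedged sketch (names illustrative, not the project's API) of the weighting scheme
# above: tree log-likelihoods are turned into normalized posterior weights with a
# softmax, and the per-tree relation matrices are averaged under those weights.
import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

llhs = np.array([-10.0, -11.0, -13.0])
rels = np.stack([np.eye(2), np.ones((2, 2)), np.zeros((2, 2))])  # fake per-tree relations
weights = _softmax(llhs)                          # sums to 1
soft_rel = np.tensordot(weights, rels, axes=1)    # same result as the weight * rels loop
assert np.isclose(weights.sum(), 1.0)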
def hyper_parameter_learn(self, x, momentum=0.5): """ Learn the hyper parameters of the model """ opt_C_now = torch.zeros((self.n_weights, self.n_params), dtype=torch.float64) log_w_arr = torch.zeros(self.n_weights, dtype=torch.float64) # Split the parameters into `alpha` and `beta` alpha = x[:self.n_params - 1] beta = x[self.n_params - 1:-2] alpha2 = x[-2:-1] beta2 = x[-1] # Sample noise sample_size = (self.n_weights, self.n_params) eps_arr = self.posterior.sample_epsilon(size=sample_size) for i in range(self.n_weights): eps = eps_arr[i] # Compute the importance weights (and their gradients) log_w_arr[i] = self._log_importance_weight(eps, alpha, beta, alpha2, beta2) z_i, z_i2 = self.posterior.g(eps, alpha, beta, alpha2, beta2) z_i_temp, z_i_temp2 = self.prior.opt_hyper(z_i, z_i2) opt_C_now[i][:-1] = z_i_temp opt_C_now[i][-1] = z_i_temp2 #print('len:',len(opt_C_now[i])) # Temper the weights log_w_arr /= self.weight_temp w_tilde = softmax( log_w_arr).detach() # Detach `w_tilde` from backward computations # Compute the weighted average over all `n_weights` samples opt_C = torch.matmul(w_tilde.unsqueeze(0), opt_C_now).squeeze().to(self.device) self.prior.C = (1 - momentum) * opt_C + momentum * self.prior.C
def get_root(self, state): main_board = state.board[:self.env.DIAGONAL].sum(axis=0) key = hash(main_board.tostring()) if key in self.nodes.keys(): root = self.nodes[key] # This makes this node root root.parent = None else: # Create a node # At the root node, evaluation and backup occurs at creation actions = state.meta.actions[state.board[-1, 0, 0]] prior_raw, V = self.evaluate(state.board) prior = np.zeros_like(prior_raw) noise = np.random.dirichlet(np.ones(len(actions)) * self.alpha) i0, i1, i2 = np.array(actions).T if self.data_format == 'channels_last': i2, i0, i1 = i0, i1, i2 prior[np.zeros_like(i0), i0, i1, i2] = softmax( prior_raw[np.zeros_like(i0), i0, i1, i2]) + noise kwargs = { 'state': state, 'p': None, 'prior': prior, 'parent': None, 'action_in': None, 'actions': actions, } root = Node(**kwargs) self.nodes[key] = root self.backup(V, root) return root
def make_mutrel_from_trees_and_single_clustering(structs, llhs, counts, clustering):
    # Oftentimes, we will have many samples of the same adjacency matrix paired
    # with the same clustering. This will produce the same mutrel. As computing
    # the mutrel from adjm + clustering is expensive, we want to avoid repeating
    # this unnecessarily. Instead, we just modify the associated weight of the
    # pairing to reflect this.
    #
    # Observe that if we have `C` copies of the LLH `W`, we obtain equivalent
    # post-softmax linear-space weights under either of the following two methods:
    #
    # 1. (naive) Represent the associated samples `C` separate times in the softmax
    # 2. (smart) Set `W' = W + log(C)`, as `exp(W') = C*exp(W)`
    weights = util.softmax(llhs + np.log(counts))

    vids = None
    for struct, weight in zip(structs, weights):
        adjm = util.convert_parents_to_adjmatrix(struct)
        crel = make_clustrel_from_cluster_adj(adjm)
        if vids is None:
            vids = crel.vids
            soft_clustrel = np.zeros(crel.rels.shape)
        else:
            assert crel.vids == vids
        soft_clustrel += weight * crel.rels

    soft_clustrel = fix_rounding_errors(soft_clustrel)
    clustrel = mutrel.Mutrel(rels=soft_clustrel, vids=vids)
    mrel = make_mutrel_from_clustrel(clustrel, clustering)
    return mrel
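# Hedged numeric check of the comment above: adding log(count) to a log-likelihood
# before the softmax is equivalent to listing that sample `count` separate times.
# Standalone, numpy only; independent of the project's util module.
import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

llhs = np.array([-5.0, -6.0])
counts = np.array([3, 1])
smart = _softmax(llhs + np.log(counts))
naive = _softmax(np.repeat(llhs, counts))   # duplicate entries explicitly
# The three duplicated copies of the first sample share its total weight equally.
assert np.isclose(smart[0], naive[:3].sum()) and np.isclose(smart[1], naive[3])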
def __forward(self, X, weights, biases, activation_func): """ This is the feed-forward section of the neural network, where we actually feed the network data and ask it to predict it's class for us """ # use dictionary to store different activation functions instead of many ifs activate = { 'relu': util.relu, 'tanh': util.tanh, 'sigmoid': util.sigmoid } layers = len(weights.keys()) Z = {} # Z at first hidden layer Z[0] = activate[activation_func](X.dot(weights[0]) + biases[0]) # Z at other hidden layers for i in range(layers - 2): z_list = list(Z.keys()) Z[i + 1] = activate[activation_func](Z[z_list[-1]].dot(weights[i + 1]) + biases[i + 1]) # pY_given_x z_list = list(Z.keys()) pY = util.softmax(Z[z_list[-1]].dot(weights[layers - 1]) + biases[layers - 1]) return Z, pY
def model(self, state, player):
    '''Wrap the model to give the proper view and mask actions.'''
    valid = self._game.valid(state, player)
    view = self._game.view(state, player)
    logits, value = self._model.model(view)
    probs = softmax(logits, valid)
    return probs, value
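# Hedged sketch of a masked softmax like the `softmax(logits, valid)` call above:
# invalid actions receive zero probability and the remaining ones renormalize.
# This is an assumption about that util function's semantics, written standalone.
import numpy as np

def masked_softmax(logits, valid):
    z = np.where(valid, logits, -np.inf)   # knock out invalid actions
    e = np.exp(z - np.max(z))
    return e / e.sum()

probs = masked_softmax(np.array([1.0, 2.0, 3.0]), np.array([True, False, True]))
assert probs[1] == 0.0 and np.isclose(probs.sum(), 1.0)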
def main(): np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize, suppress=True) np.seterr(divide='raise', invalid='raise') V1, V2 = create_vars() estimators = ( # lh.calc_lh_quad will be slower than usual on its first invocation due to # Numba JIT compilation. Don't be alarmed by seemingly poor runtime from it # as a result. lh.calc_lh_quad, lh.calc_lh_mc_1D, lh.calc_lh_mc_2D, lh.calc_lh_mc_2D_dumb, lh.calc_lh_grid, ) max_estimator_len = max([len(M.__name__) for M in estimators]) for M in estimators: M_name = M.__name__ M = util.time_exec(M) evidence_per_sample = M(V1, V2) evidence_per_sample[:, common.Models.garbage] = lh.calc_garbage(V1, V2) evidence = np.sum(evidence_per_sample, axis=0) print( M_name.ljust(max_estimator_len), '%.3f ms' % util.time_exec._ms, evidence, util.softmax(evidence), sep='\t', )
def calc_mutdist(cluster_phis, llhs, clusterings, baseline, counts): assert len(cluster_phis) == len(llhs) == len(clusterings) == len(counts) weights = util.softmax(llhs + np.log(counts)) assert np.isclose(1, np.sum(weights)) baseline_phis = baseline.stats vids = None # TODO: make assays meaningful, rather than just always setting it to None. assays = None dists = None for (cluster_phi, clustering, weight) in zip(cluster_phis, clusterings, weights): cluster_phi = evalutil.fix_rounding_errors(cluster_phi) assert np.all(0 <= cluster_phi) and np.all(cluster_phi <= 1) V, membership = util.make_membership_mat(clustering) mphi = np.dot(membership, cluster_phi) if vids is None: vids = V assert V == vids if dists is None: dists = np.zeros(mphi.shape) weighted = weight * _calc_dist(mphi, baseline_phis) assert not np.any(np.isnan(weighted)) and not np.any( np.isinf(weighted)) dists += weighted assert list(vids) == list(baseline.vids) if assays is not None: assert list(assays) == list(baseline.assays) return mutstat.Mutstat(vids=vids, assays=assays, stats=dists)
def sample(self, first=0, stop=-1, nchars=100): first = self.vocab[first].i stop = self.vocab[stop].i res = [first] dynet.renew_cg() state = self.rnn.initial_state() R = dynet.parameter(self.R) bias = dynet.parameter(self.bias) cw = first while True: #if cw.s in self.pron_dict.pdict: # pron_vector = self.pron_dict.pdict[cw.s] # pron_vector = dynet.inputVector(pron_vector) #else: spelling = [ self.s2s.src_vocab[letter] for letter in self.vocab[cw].s.upper() ] embedded_spelling = self.s2s.embed_seq(spelling) pron_vector = self.s2s.encode_seq(embedded_spelling)[-1] x_t = pron_vector state = state.add_input(x_t) y_t = state.output() r_t = bias + (R * y_t) scores = r_t.vec_value() if self.vocab.unk is not None: ydist = util.softmax( scores[:self.vocab.unk.i] + scores[self.vocab.unk.i + 1:]) # remove UNK dist = ydist[:self.vocab.unk.i].tolist() + [ 0 ] + ydist[self.vocab.unk.i:].tolist() else: ydist = util.softmax(scores) dist = ydist rnd = random.random() for i, p in enumerate(dist): rnd -= p if rnd <= 0: break res.append(i) cw = i if cw == stop: break if nchars and len(res) > nchars: break return res
def forward(self, Xtop):
    """Perform the forward pass: given the top-layer output of the net, go
    through the output layer and compute the output."""
    self.Xtop = Xtop
    if self.act_type == util.OutputSpec.TYPE_LINEAR:
        self.Y = Xtop.dot(self.W)
    else:  # self.act_type == util.OutputSpec.TYPE_SOFTMAX
        self.Y = util.softmax(Xtop, self.W)
def selection_by_probs(self):
    """During training, select a move with probability softmax(N ** (1/t))."""
    N = [e.n ** (1 / self.tree.t) for e in self.sub_edge]
    probs = util.softmax(N)
    return util.select_by_prob(self.sub_edge, probs)
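# Hedged standalone illustration (numpy only) of the visit-count selection above:
# with temperature t = 1 the softmax of N**(1/t) strongly favours the most-visited
# edge, while a large t flattens the choice toward uniform exploration.
import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

N = np.array([30.0, 10.0, 5.0])   # hypothetical visit counts
for t in (1.0, 100.0):
    print(t, _softmax(N ** (1.0 / t)))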
def gradients(W, x, y):
    """Gradient of the cost function over all examples."""
    vec = np.dot(x, W)
    softmax_activation = softmax(vec)
    # zip replaces Python 2's itertools.izip
    e = [compute_gradients(a, c, b) for a, c, b in zip(softmax_activation, y, x)]
    mean1 = np.sum(e, axis=0)
    return mean1
def main(architecture, folds, tta): test_dataset = InternValidDataset(transform=test_augm()) labels = None for fold in folds: model = get_model(num_classes=test_dataset.num_classes, architecture=architecture) state = torch.load('../results/{}/best-model_{}.pt'.format( architecture, fold)) model.load_state_dict(state['model']) model.eval() labels = [] with open('../results/{}/{}_valid_prob.csv'.format(architecture, fold), "w") as f: for idx in tqdm.tqdm(range(len(test_dataset))): best_conf = 0 best_pred = None for rot in range(4): test_dataset.rot = rot in1 = [] in2 = [] for _ in range(tta): x = test_dataset[idx][0] in1.append(x[0]) in2.append(x[1]) in1 = variable(torch.stack(in1)) in2 = variable(torch.stack(in2)) pred = model(in1, in2).data.cpu().numpy() pred = np.array([softmax(x) for x in pred]) pred = np.sum(pred, axis=0) / len(pred) if np.max(pred) > best_conf: best_conf = np.max(pred) best_pred = pred labels.append(test_dataset[idx][1]) probas = ','.join([str(x) for x in best_pred]) f.write('{}\n'.format(probas)) dfs = [ pd.read_csv('../results/{}/{}_valid_prob.csv'.format(architecture, i), header=None) for i in folds ] classes = [ 'HTC-1-M7', 'LG-Nexus-5x', 'Motorola-Droid-Maxx', 'Motorola-Nexus-6', 'Motorola-X', 'Samsung-Galaxy-Note3', 'Samsung-Galaxy-S4', 'Sony-NEX-7', 'iPhone-4s', 'iPhone-6' ] for df in dfs: df.columns = classes df = dfs[0].copy() for i in np.arange(1, len(folds)): df[classes] += dfs[i][classes] df[classes] /= len(folds) matched = 0 for i in np.arange(len(test_dataset)): pred = df[classes].iloc[i].values.argmax() real = labels[i] if pred == real: matched += 1 print('accuracy = {}'.format(matched / len(test_dataset)))
def gendata(n=FEATURES, m=EXAMPLES, c=CLASSES, seed=0):
    np.random.seed(seed)
    X = np.random.rand(m, n)
    X = np.hstack((X, np.ones((m, 1))))  # Add a column of 1's for the bias terms
    factors = np.random.rand(n + 1, c) * 20 - 10
    factors = normalize_cols(factors)
    predict = np.dot(X, factors)
    Y = np.apply_along_axis(np.argmax, axis=1, arr=softmax(predict))
    return X, Y, factors
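# Hedged note on the label construction above: because softmax is monotone per row,
# argmax of softmax(predict) yields the same labels as argmax of the raw scores.
# Small standalone check with numpy.
import numpy as np

def _row_softmax(x):
    e = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

scores = np.random.rand(5, 3)
assert np.array_equal(np.argmax(_row_softmax(scores), axis=1), np.argmax(scores, axis=1))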
def forward(self, input_layer, W, b):
    self.Z = np.dot(W, input_layer) + b
    self.A = util.softmax(self.Z)
    return self.A
def forward_pass(self, inputs): # decleare variables used forward pass self.inputs = inputs self.n_inp = len(inputs) self.vr = [] self.vz = [] self.v_h = [] self.vo = [] self.r = [] self.z = [] self._h = [] self.h = {} self.o = [] self.h[-1] = np.zeros((self.h_size, 1)) # performing recurrsion for i in range(self.n_inp): # calculating reset gate value # self.vr.append(np.dot(self.w['ur'],inputs[i]) + np.dot(self.w['wr'], self.h[i-1]) + self.b['r']) # self.r.append(sigmoid(self.vr[i])) self.r.append( sigmoid( np.dot(self.w['ur'], inputs[i]) + np.dot(self.w['wr'], self.h[i - 1]) + self.b['r'])) # calculation update gate value # self.vz.append(np.dot(self.w['uz'],inputs[i]) + np.dot(self.w['wz'], self.h[i-1]) + self.b['z']) # self.z.append(sigmoid(self.vz[i])) self.z.append( sigmoid( np.dot(self.w['uz'], inputs[i]) + np.dot(self.w['wz'], self.h[i - 1]) + self.b['z'])) # applying reset gate value # self.v_h.append(np.dot(self.w['u_h'], inputs[i]) + np.dot(self.w['w_h'], np.multiply(self.h[i - 1], self.r[i])) + + self.b['_h']) # self._h.append(tanh(self.v_h[i])) self._h.append( tanh( np.dot(self.w['u_h'], inputs[i]) + np.dot(self.w['w_h'], np.multiply(self.h[i - 1], self.r[i])) + +self.b['_h'])) # applying update gate value self.h[i] = np.multiply(self.z[i], self.h[i - 1]) + np.multiply( 1 - self.z[i], self._h[i]) # calculating output # self.vo.append(np.dot(self.w['wo'], self.h[i]) + self.b['o']) # self.o.append(softmax(self.vo[i])) self.o.append( softmax(np.dot(self.w['wo'], self.h[i]) + self.b['o'])) return self.o
def _calc_llh(var_reads, ref_reads, omega, A, Z, psi):
    K = len(psi)
    assert Z.shape == (K, K)
    assert var_reads.shape == ref_reads.shape == omega.shape

    eta = util.softmax(psi)
    phi = np.dot(Z, eta)  # Kx1
    var_phis = np.dot(A, phi)
    logp = binom.logpmf(var_reads, ref_reads + var_reads, var_phis * omega)
    return np.sum(logp)
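# Hedged sketch of the reparameterisation above: `psi` is unconstrained, the softmax
# maps it onto the probability simplex (eta), and a 0/1 matrix Z (hypothetical
# ancestry-style example below) accumulates eta into frequencies phi within [0, 1].
import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

psi = np.array([0.3, -1.2, 2.0])
eta = _softmax(psi)
Z = np.array([[1, 1, 1],
              [0, 1, 0],
              [0, 0, 1]])   # hypothetical ancestry matrix
phi = Z @ eta
assert np.isclose(eta.sum(), 1.0) and np.all((0 <= phi) & (phi <= 1))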
def sample(parameters, char_to_ix):
    """
    Sample a sequence of characters according to a sequence of probability
    distributions output by the RNN.

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and b.
    char_to_ix -- python dictionary mapping each character to an index.

    Returns:
    indices -- a list of length n containing the indices of the sampled characters.
    """
    # Retrieve parameters and relevant shapes from the "parameters" dictionary
    Waa = parameters['Waa']
    Wax = parameters['Wax']
    Wya = parameters['Wya']
    by = parameters['by']
    b = parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]

    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((n_a, 1))
    indices = []
    idx = -1

    # Loop over time-steps t. At each time-step, sample a character from a
    # probability distribution and append its index to "indices". We'll stop if
    # we reach 50 characters (which should be very unlikely with a well-trained
    # model), which helps debugging and prevents entering an infinite loop.
    newline_character = char_to_ix['\n']
    while (idx != newline_character):
        # Forward propagate
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b)
        z = np.dot(Wya, a) + by
        y = softmax(z)

        # Sample the index of a character within the vocabulary from the
        # probability distribution y
        idx = np.random.choice(list(range(0, vocab_size)), p=np.ndarray.flatten(y))

        # Append the index to "indices"
        indices.append(idx)

        # Overwrite the input character with the one corresponding to the
        # sampled index.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        a_prev = a

    return indices
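# Hedged standalone example of the sampling step above: draw an index from the
# categorical distribution defined by softmax(z). Assumes only numpy; the RNN
# forward pass itself is not reproduced here.
import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

z = np.array([[0.2], [1.5], [-0.3]])                  # fake logits, shape (vocab, 1)
y = _softmax(z)
idx = np.random.choice(range(y.size), p=y.ravel())    # mirrors the np.random.choice call
print(idx)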
def expand_and_eval(self, leaf): action = random.choice(leaf.untried) leaf.untried.remove(action) time_s = time() state, reward, done, _ = self.env.step(leaf.state, action, leaf.actions) self.log_['exp/step']['time'] += time() - time_s self.log_['exp/step']['num'] += 1 if done: actions = [] prior = None V = reward reward = reward else: actions = state.meta.actions[state.board[-1, 0, 0]] time_e = time() prior_raw, V = self.evaluate(state.board) prior = np.zeros_like(prior_raw) noise = np.random.dirichlet(np.ones(len(actions)) * self.alpha) i0, i1, i2 = np.array(actions).T if self.data_format == 'channels_last': i0, i1, i2 = i1, i2, i0 prior[np.zeros_like(i0), i0, i1, i2] = softmax( prior_raw[np.zeros_like(i0), i0, i1, i2]) + noise self.log_['exp/eval']['time'] += time() - time_e self.log_['exp/eval']['num'] += 1 reward = None if self.data_format == 'channels_last': p = leaf.prior[(0, action[1], action[2], action[0])] else: p = leaf.prior[(0, *action)] kwargs = { 'state': state, 'p': p, 'prior': prior, 'parent': leaf, 'action_in': action, 'actions': actions, 'reward': reward, 'terminal': done, } child = Node(**kwargs) # Register node main_board = state.board[:self.env.DIAGONAL].sum(axis=0) key = hash(main_board.tostring()) self.nodes[key] = child # Declar child of its parent leaf.children[action] = child return V, child
def forward(self, x):
    # fetch the network parameters from the parameter dictionary
    w1, b1 = self.params['W1'], self.params['b1']
    w2, b2 = self.params['W2'], self.params['b2']

    # first-layer computation
    z1 = np.dot(x, w1) + b1
    h1 = sigmoid(z1)

    # second-layer computation
    z2 = np.dot(h1, w2) + b2
    return softmax(z2)
def gradients2(W, x, y, reg_lambda=1.0):
    """Gradient of the cost function over all examples."""
    C = W.shape[1]
    M, _ = x.shape  # fixed: was `X.shape`, but the argument is lowercase `x`
    vec = np.dot(x, W)
    probas = softmax(vec)                                # M x C
    indicator_y = indicator_matrix(y, C)                 # M x C
    gradients = -np.dot((indicator_y - probas).T, x).T   # same shape as W
    gradients += reg_lambda * W
    return gradients
def forward_propagation(self, x):
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because we need them later.
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Note that we are indexing U by x[t]. This is the same as multiplying U with a one-hot vector.
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]
def get_pred(self, img): '''img: face image, typically the cropped face based on bbx''' img = cv2.resize(img, (224, 224)) self.img = copy.deepcopy(img) img = img.transpose(2, 0, 1) img = img[(2, 1, 0), :, :] # TODO img *= 255 img -= self.mean.reshape(3, 224, 224) img *= 0.01 self.net.blobs['data'].reshape(1, 3, 224, 224) self.net.blobs['data'].data[:] = img out = self.net.forward() pred = np.reshape(out[self.layername], (68, 56 * 56)) pred = softmax(pred) return np.reshape(pred, (1, 68, 56, 56))
def supAnalyser(self,X,freq,vocabulary,top=20): result_score=[] result_word=[] for i in range(self.cat): result_score.append([0.0]*top) result_word.append(['']*top) num_sent=np.size(X,0) allKids=[[]]*num_sent for i in range(num_sent): x=X[i] sl=len(x) words_embedded=self.WL[:,x] unsup_tree = self.forwardProp([],words_embedded,False,None,self.theta,freq) allKids[i]=unsup_tree.kids sup_tree=rnntree.rnntree(self.d,sl,words_embedded) nodeUnder = np.ones([2*sl-1,1]) for j in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sensentree and store in nodeUnder kids = allKids[i][j] n1 = nodeUnder[kids[0]] n2 = nodeUnder[kids[1]] nodeUnder[j] = n1+n2 #sentree.catDelta = np.zeros([cat_size, 2*sl-1]) #sentree.catDelta_out = np.zeros([self.d,2*sl-1]) for j in range(2*sl-1): kids = allKids[i][j] c1 = sup_tree.nodeFeatures[:,kids[0]] c2 = sup_tree.nodeFeatures[:,kids[1]] # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1]) p = tanh(np.dot(self.W1,c1) + np.dot(self.W2,c2) + self.b1) # See last paragraph in Section 2.3 p_norm1 = p/norm(p) # Eq. (7) in the paper (for special case of 1d label) #sm = sigmoid(np.dot(Wlab,p_norm1) + blab) sm=softmax(np.dot(self.Wlab,p_norm1) + self.blab) #max_score=max(sm) for ind in range(self.cat): max_score=sm[ind] #ind=list(sm).index(max_score) min_score=min(result_score[ind]) if max_score>min_score: min_ind=result_score[ind].index(min_score) result_score[ind][min_ind]=max_score if j<sl: result_word[ind][min_ind]=vocabulary[x[j]] else: stk=[] stk.extend(list(kids)) stk.reverse() words=[] while len(stk)!=0: current=stk.pop() if current<sl: words.append(vocabulary[x[current]]) else: toExtend=[] toExtend.extend(list(allKids[i][current])) toExtend.reverse() stk.extend(toExtend) result_word[ind][min_ind]=' '.join(words) return (result_score,result_word)
def predict(W, x):
    """Predict class labels for the input x.

    The softmax output is an M x 1 vector of class probabilities (M is the
    number of classes); the index of the largest entry per row is returned.
    """
    values = softmax(np.dot(x, W))
    return np.argmax(values, axis=1)
def forward(self, X):
    # Z = relu(X.dot(self.W1) + self.b1)
    Z = np.tanh(X.dot(self.W1) + self.b1)
    return softmax(Z.dot(self.W2) + self.b2), Z
def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq): #allkids存的是所有节点,第i行存第i个节点,列表示第i行节点所包含的子节点 (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta) #s1可能是词汇表的大小 sl=np.size(words_embedded,1) sentree=rnntree.rnntree(self.d,sl,words_embedded) collapsed_sentence = range(sl) #计算情感误差 if updateWlab: temp_label=np.zeros(self.cat) #label表示当前标签,label-1主要是因为list从0开始,即当前标签的位置为1 temp_label[label-1]=1.0 nodeUnder = np.ones([2*sl-1,1]) #n1,n2是kids的子节点数 for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sensentree and store in nodeUnder kids = allKids[i] n1 = nodeUnder[kids[0]] #左节点 n2 = nodeUnder[kids[1]] #右节点 nodeUnder[i] = n1+n2 #第i个节点的子节点数目 cat_size=self.cat sentree.catDelta = np.zeros([cat_size, 2*sl-1]) sentree.catDelta_out = np.zeros([self.d,2*sl-1]) # classifier on single words for i in range(sl): sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab) #这里代码部分计算情感误差和论文不太一样,这里直接用yi-h(x)来表示情感误差 lbl_sm = (1-self.alpha)*(temp_label - sm) #这里貌似是在计算J sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm))) #sentree.nodeScores分为2个部分,这里计算0-s1,下面计算2*s1-1 sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) # sm = sigmoid(self.Wlab*words_embedded + self.blab) #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm) #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm)) #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm) for i in range(sl,2*sl-1): #kids,c1,c2 是什么 kids = allKids[i] c1 = sentree.nodeFeatures[:,kids[0]] #左孩子的词向量 c2 = sentree.nodeFeatures[:,kids[1]] #右孩子的词向量 # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1]) p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1) # See last paragraph in Section 2.3 p_norm1 = p/norm(p) # Eq. (7) in the paper (for special case of 1d label) #sm = sigmoid(np.dot(Wlab,p_norm1) + blab) sm=softmax(np.dot(Wlab,p_norm1) + blab) beta=0.5 #论文里面本来是没有beta这个值的 #lbl_sm = beta * (1.0-self.alpha)*(label - sm) lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm) #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm) #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1] #J=-(1.0-self.alpha)*np.log(sm[label-1]) #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm)) sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm))) J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm))) sentree.nodeFeatures[:,i] = p_norm1 sentree.nodeFeatures_unnormalized[:,i] = p sentree.nodeScores[i] = J sentree.numkids = nodeUnder sentree.kids = allKids #计算重构误差 else: # Reconstruction Error for j in range(sl-1): size2=np.size(words_embedded,1) c1 = words_embedded[:,0:-1] c2 = words_embedded[:,1:] freq1 = freq[0:-1] freq2 = freq[1:] p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1))) p_norm1 =p/np.sqrt(sum(p**2)) #下方y1,y2实际上就是论文的c1,c2,由p分解而来。 y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1))) y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1))) y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2)) y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2)) y1c1 = self.alpha*(y1-c1) y2c2 = self.alpha*(y2-c2) # Eq. 
(4) in the paper: reconstruction error:重构误差 #(y1-c1)*(y1-c1)的结果是一个数值 J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2)) #这个for循环的下面部分没看懂 # finding the pair with smallest reconstruction error for constructing sentree #min(J)是什么意思,J是一个值 J_min= min(J) J_minpos=np.argmin(J) #重构误差最小的重构向量存入树中(c1',c2') sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos] sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos] #可能是更新值 sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos]) sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos]) words_embedded=np.delete(words_embedded,J_minpos+1,1) words_embedded[:,J_minpos]=p_norm1[:,J_minpos] sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos] sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos] sentree.nodeScores[sl+j] = J_min sentree.pp[collapsed_sentence[J_minpos]] = sl+j sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]] sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]] freq=np.delete(freq,J_minpos+1) freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]]) collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1) collapsed_sentence[J_minpos]=sl+j return sentree
def rbmFit(X, numHid, y, isSaveModel=False, name=None, **kwargs) : """ X ... data. should be binary, or in [0,1] interpreted as ... probabilities numhid ... number of hidden units y ... List of discrete labels nClass number of classes method CD or SML eta learning rate momentum momentum for smoothness amd to prevent overfitting NOTE: momentum is not recommended with SML maxepoch # of epochs: each is a full pass through train data avglast how many epochs before maxepoch to start averaging before. Procedure suggested for faster convergence by Kevin Swersky in his MSc thesis batchsize The number of training instances per batch verbose For printing progress model.weight The weights of the connections model.biasH The biases of the hidden layer model.biasV The biases of the visible layer model.weightlabel ... The weights on labels layer model.biasLabel ... The biases on labels layer errors The errors in reconstruction at each epoch """ arg = util.processOptions(kwargs, \ nClass = np.unique(y).size, \ method = "CD", \ eta = 0.1, \ momentum = 0.5,\ maxEpoch = 500, \ avgLast = 0, \ penalty = 0, \ batchSize = 100, \ verbose = True) [nClass, method, eta, momentum, maxEpoch, avgLast, penalty, batchSize, verbose] = [\ arg["nClass"],\ arg["method"],\ arg["eta"],\ arg["momentum"],\ arg["maxEpoch"],\ arg["avgLast"],\ arg["penalty"],\ arg["batchSize"],\ arg["verbose"] ] if verbose : print "Processing data ..." # from which step, we start to compute the average # avgStart = maxEpoch - avgLast # for weight decay use # oldPenalty = penalty # numCases : number of example # numDims : the length of each example # each row is an example [numCases, numDims] = list(X.shape) numVis = numDims uniqueLabel = np.unique(y) numBatch = util.ceil(numCases, batchSize) y = util.matrixLabel(y) # shuffle data and label data = copy.deepcopy(X) [data, label] = util.shuffle(data, y) # init CUDA cm.cublas_init() cm.CUDAMatrix.init_random(100) deviceData = cm.CUDAMatrix(cm.reformat(data)) deviceLabel = cm.CUDAMatrix(cm.reformat(label)) # init weights weight = cm.CUDAMatrix(0.1*np.random.randn(numVis,numHid)) biasV = cm.CUDAMatrix(np.zeros((1, numVis))) biasH = cm.CUDAMatrix(np.zeros((1, numHid))) weightLabel = cm.CUDAMatrix(0.1*np.random.randn(nClass, numHid)) biasLabel = cm.CUDAMatrix(np.zeros((1,nClass))) # init weight update weightInc = cm.CUDAMatrix(np.zeros((numVis,numHid))) biasVInc = cm.CUDAMatrix(np.zeros((1,numVis))) biasHInc = cm.CUDAMatrix(np.zeros((1,numHid))) weightLabelInc = cm.CUDAMatrix(np.zeros((nClass, numHid))) biasLabelInc = cm.CUDAMatrix(np.zeros((1,nClass))) #init temporary storage visActP = cm.empty((batchSize, numVis)) hidActP = cm.empty((batchSize, numHid)) hidState = cm.empty((batchSize, numHid)) for epoch in range(maxEpoch) : error = [] for batch in range(numBatch) : # train each data batch if batchSize*(batch+1) > numCases : visTrue = deviceData.get_row_slice(batchSize*batch, numCases) labelTrue = deviceLabel.get_row_slice(batchSize*batch, numCases) batchSize = visTrue.shape[0] visActP = cm.empty((batchSize, numVis)) hidActP = cm.empty((batchSize, numHid)) hidState = cm.empty((batchSize, numHid)) else : visTrue = deviceData.get_row_slice(batchSize*batch, batchSize*(batch+1)) labelTrue = deviceLabel.get_row_slice(batchSize*batch, batchSize*(batch+1)) batchSize = visTrue.shape[0] visActP.assign(visTrue) #apply momentum weightInc.mult(momentum) biasVInc.mult(momentum) biasHInc.mult(momentum) weightLabel.mult(momentum) biasLabel.mult(momentum) # positive phase cm.dot(visActP, weight, target = hidActP) 
hidActP.add_dot(labelTrue, weightLabel) hidActP.add_row_vec(biasH) hidActP.apply_sigmoid() weightInc.add_dot(visActP.T, hidActP) biasVInc.add_sums(visActP, axis=0) biasHInc.add_sums(hidActP, axis=0) weightLabelInc.add_dot(labelTrue.T, hidActP) biasLabelInc.add_sums(labelTrue, axis=0) hidState.fill_with_rand() hidState.less_than(hidActP, target=hidActP) if cmp(method, "SML") == 0 : if np.logical_and(np.equal(epoch,1), np.equal(batch,1)) : pass # here does not need in practical use elif cmp(method, "CD") == 0 : pass # negative phase cm.dot(hidActP, weight.T, target = visActP) visActP.add_row_vec(biasV) visActP.apply_sigmoid() cm.dot(hidActP, weightLabel.T, target = labelTrue) labelTrue.add_row_vec(biasLabel) labelTrue = util.softmax(labelTrue) # another positive phase cm.dot(visActP, weight, target = hidActP) hidActP.add_dot(labelTrue, weightLabel) hidActP.add_row_vec(biasH) hidActP.apply_sigmoid() weightInc.subtract_dot(visActP.T, hidActP) biasVInc.add_sums(visActP, axis=0, mult=-1) biasHInc.add_sums(hidActP, axis=0, mult=-1) weightLabelInc.subtract_dot(labelTrue.T, hidActP) biasLabelInc.add_sums(labelTrue, axis=0, mult=-1) # update weights and bias weight.add_mult(weightInc, eta/batchSize) biasV.add_mult(biasVInc, eta/batchSize) biasH.add_mult(biasHInc, eta/batchSize) weightLabel.add_mult(weightLabelInc, eta/batchSize) biasLabel.add_mult(biasLabelInc, eta/batchSize) # calculate reconstruction error visTrue.subtract(visActP) error.append(visTrue.euclid_norm()**2) # free memory visTrue.free_device_memory() labelTrue.free_device_memory() if verbose : print "Epoch %d/%d, reconstruction error is %f " % (epoch+1, maxEpoch, sum(error)) # save rbm model weight.copy_to_host() biasV.copy_to_host() biasH.copy_to_host() weightLabel.copy_to_host() biasLabel.copy_to_host() model_ = m.rbmModel(weight.numpy_array, biasV.numpy_array, biasH.numpy_array, \ weightLabel = weightLabel.numpy_array,\ biasLabel = biasLabel.numpy_array, labels = uniqueLabel) # free device memory deviceData.free_device_memory() deviceLabel.free_device_memory() weight.free_device_memory() biasV.free_device_memory() biasH.free_device_memory() weightLabel.free_device_memory() biasLabel.free_device_memory() weightInc.free_device_memory() biasVInc.free_device_memory() biasHInc.free_device_memory() weightLabelInc.free_device_memory() biasLabelInc.free_device_memory() hidActP.free_device_memory() visActP.free_device_memory() hidState.free_device_memory() cm.shutdown() if isSaveModel : modelList = [] modelList.append(model_) model = np.array(modelList) np.save(name,model) return model_
outRoot = sys.argv[3] nets = [] for i in range(len(protos)): nets.append(caffe.Net(protos[i], models[i], caffe.TEST)) filenames = get_filenames(filelists) filenames = filenames[555:] random.shuffle(filenames) # save f = 'ibug/image_051_1.jpg' img = caffe.io.load_image(root + f) for i in range(len(nets)): pred = get_preds_single(nets[i], layernames[i], img) response_map = pred[0, 0] shape = response_map.shape response_map = softmax(response_map.reshape((1, shape[0]*shape[1]))) response_map = np.reshape(response_map, shape) plt.imsave(outRoot + str(i) + "_" + f[5:], response_map, cmap='gray', vmin=response_map.min(), vmax=response_map.max()) exit(0) for f in filenames: print f img = caffe.io.load_image(root + f) for i in range(len(nets)): pred = get_preds_single(nets[i], layernames[i], img) response_map = pred[0, (0, 36, 30, 57)] shape = response_map.shape response_map = softmax(response_map.reshape((shape[0], shape[1]*shape[2]))) response_map = response_map.reshape(shape) for j in range(shape[0]): plt.subplot(4, 4, i*4 + j + 1) plt.imshow(response_map[j], cmap='gray', vmin=0, vmax=0.2)
if __name__ == '__main__': # usage: python shift_exp.py prototxt model layername root # filelists outRoot prototxt = sys.argv[1] model = sys.argv[2] layername = sys.argv[3] root = sys.argv[4] filelists = sys.argv[5] outRoot = sys.argv[6] net = caffe.Net(prototxt, model, caffe.TEST) (filenames, bbxs) = get_filenames_bbx(filelists) index = range(len(filenames)) random.shuffle(index) for i in index: print i, filenames[i], bbxs[i][0], bbxs[i][1], bbxs[i][2], bbxs[i][3] img_crops = shift_exp(root, filenames[i], bbxs[i], outRoot) preds = get_preds_multiple(net, layername, img_crops) preds_shape = preds.shape preds = softmax(np.reshape(preds, (preds_shape[0]*preds_shape[1], \ preds_shape[2]*preds_shape[3]))) preds = np.reshape(preds, preds_shape) (hp, wp) = get_index(preds) hp = hp * 4 wp = wp * 4 for i in range(9): plt.subplot(3, 3, i+1) plt.imshow(img_crops[i]) plt.plot(wp[i], hp[i],'.g', hold=True) plt.show()
def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq): (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta) sl=np.size(words_embedded,1) sentree=rnntree.rnntree(self.d,sl,words_embedded) collapsed_sentence = range(sl) if updateWlab: temp_label=np.zeros(self.cat) temp_label[label-1]=1.0 nodeUnder = np.ones([2*sl-1,1]) for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sensentree and store in nodeUnder kids = allKids[i] n1 = nodeUnder[kids[0]] n2 = nodeUnder[kids[1]] nodeUnder[i] = n1+n2 cat_size=self.cat sentree.catDelta = np.zeros([cat_size, 2*sl-1]) sentree.catDelta_out = np.zeros([self.d,2*sl-1]) # classifier on single words for i in range(sl): sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab) lbl_sm = (1-self.alpha)*(temp_label - sm) sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm))) sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) # sm = sigmoid(self.Wlab*words_embedded + self.blab) #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm) #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm)) #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm) for i in range(sl,2*sl-1): kids = allKids[i] c1 = sentree.nodeFeatures[:,kids[0]] c2 = sentree.nodeFeatures[:,kids[1]] # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1]) p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1) # See last paragraph in Section 2.3 p_norm1 = p/norm(p) # Eq. (7) in the paper (for special case of 1d label) #sm = sigmoid(np.dot(Wlab,p_norm1) + blab) sm=softmax(np.dot(Wlab,p_norm1) + blab) beta=0.5 #lbl_sm = beta * (1.0-self.alpha)*(label - sm) lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm) #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm) #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1] #J=-(1.0-self.alpha)*np.log(sm[label-1]) #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm)) sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm))) J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm))) sentree.nodeFeatures[:,i] = p_norm1 sentree.nodeFeatures_unnormalized[:,i] = p sentree.nodeScores[i] = J sentree.numkids = nodeUnder sentree.kids = allKids else: # Reconstruction Error for j in range(sl-1): size2=np.size(words_embedded,1) c1 = words_embedded[:,0:-1] c2 = words_embedded[:,1:] freq1 = freq[0:-1] freq2 = freq[1:] p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1))) p_norm1 =p/np.sqrt(sum(p**2)) y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1))) y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1))) y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2)) y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2)) y1c1 = self.alpha*(y1-c1) y2c2 = self.alpha*(y2-c2) # Eq. 
(4) in the paper: reconstruction error J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2)) # finding the pair with smallest reconstruction error for constructing sentree J_min= min(J) J_minpos=np.argmin(J) sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos] sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos] sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos]) sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos]) words_embedded=np.delete(words_embedded,J_minpos+1,1) words_embedded[:,J_minpos]=p_norm1[:,J_minpos] sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos] sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos] sentree.nodeScores[sl+j] = J_min sentree.pp[collapsed_sentence[J_minpos]] = sl+j sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]] sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]] freq=np.delete(freq,J_minpos+1) freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]]) collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1) collapsed_sentence[J_minpos]=sl+j return sentree
def activate(self, input):
    a = np.dot(input, self.W) + self.b
    ret = softmax(a)
    return ret
def forwardProp(self,allKids,words_embedded,updateWlab,label,theta,freq): (W1,W2,W3,W4,Wlab,b1,b2,b3,blab,WL)=self.getParams(theta) #sl是words_embedded的个数,一句话单词的个数 # allKids一开始没有值,是因为训练之前,语法树本来就没有构建完,树结构是训练完了以后才出现的。但是,allkids内容应该会随着算法的进行而变化 sl=np.size(words_embedded,1) sentree=rnntree.rnntree(self.d,sl,words_embedded) collapsed_sentence = range(sl) # updateWlab主要是获得情感误差,修正情感的权值 # 情感误差也是需要p作为输入的,因此也需要计算出p if updateWlab: temp_label=np.zeros(self.cat) #假设cat = 4, temp_label就是(0,0,0,0)。下面这句话的意思是label对应的位置为1 temp_label[label-1]=1.0 nodeUnder = np.ones([2*sl-1,1]) # 这个for循环是计算出,某个节点底下一共有多少个子节点 # kids存了两个值,分别代表左右孩子。 # 可以推测出,allkids存的东西,allkids[i]代表第i个非叶子节点,allkids[i][0]是左孩子,allkids[i][1]是右孩子 for i in range(sl,2*sl-1): # calculate n1, n2 and n1+n2 for each node in the sensentree and store in nodeUnder kids = allKids[i] n1 = nodeUnder[kids[0]] n2 = nodeUnder[kids[1]] nodeUnder[i] = n1+n2 cat_size=self.cat sentree.catDelta = np.zeros([cat_size, 2*sl-1]) sentree.catDelta_out = np.zeros([self.d,2*sl-1]) # classifier on single words # 处理所有单词,即叶子节点 # 这里有个问题就是,为什么叶子节点也要计算情感误差 for i in range(sl): sm = softmax(np.dot(Wlab,words_embedded[:,i]) + blab) #这里不管情感误差是如何计算的,sentree.nodeScores存的是情感误差没错了。 #sentree.catDelta存的什么不清楚,但是和情感误差有关 lbl_sm = (1-self.alpha)*(temp_label - sm) sentree.nodeScores[i] = 1.0/2.0*(np.dot(lbl_sm,(temp_label- sm))) sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) # sm = sigmoid(self.Wlab*words_embedded + self.blab) #lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm) #sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm)) #sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm) #超过sl的部分是单词的父亲节点 for i in range(sl,2*sl-1): kids = allKids[i] #c1,c2,是左右孩子的向量 c1 = sentree.nodeFeatures[:,kids[0]] c2 = sentree.nodeFeatures[:,kids[1]] # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1]) #计算p,显然p是个数值,即得分,用于判断哪两个节点合并 p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + b1) # See last paragraph in Section 2.3 p_norm1 = p/norm(p) # Eq. (7) in the paper (for special case of 1d label) #sm = sigmoid(np.dot(Wlab,p_norm1) + blab) #这里是计算节点的情感标签,sm sm = softmax(np.dot(Wlab,p_norm1) + blab) beta=0.5 #lbl_sm = beta * (1.0-self.alpha)*(label - sm) lbl_sm = beta * (1.0-self.alpha)*(temp_label - sm) #lbl_sm = beta * (1.0-self.alpha) * (temp_label-sm) #sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1] #J=-(1.0-self.alpha)*np.log(sm[label-1]) #sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm)) sentree.catDelta[:, i] = -np.dot(lbl_sm,softmax_prime(sm)) #J = 1.0/2.0*(np.dot(lbl_sm,(label - sm))) J = 1.0/2.0*(np.dot(lbl_sm,(temp_label - sm))) sentree.nodeFeatures[:,i] = p_norm1 sentree.nodeFeatures_unnormalized[:,i] = p sentree.nodeScores[i] = J sentree.numkids = nodeUnder sentree.kids = allKids else: # 这里主要是计算重构误差 # Reconstruction Error for j in range(sl-1): size2=np.size(words_embedded,1) """ 经过测试,p有多个值 也就不难怪这里c1,c2里面分别存了多个单词的向量 因此,这个算法并不是一个个依次算p的,而是一次性一起算出来p 也因此J的值应该也是有多个值。代表两两单词计算的不同结果。 """ c1 = words_embedded[:,0:-1] # 去掉最后一个单词 c2 = words_embedded[:,1:] # 去掉第一个单词 freq1 = freq[0:-1] freq2 = freq[1:] p = tanh(np.dot(W1,c1) + np.dot(W2,c2) + np.reshape(b1,[self.d,1])*([1]*(size2-1))) p_norm1 =p/np.sqrt(sum(p**2)) y1_unnormalized = tanh(np.dot(W3,p_norm1) + np.reshape(b2,[self.d,1])*([1]*(size2-1))) y2_unnormalized = tanh(np.dot(W4,p_norm1) + np.reshape(b3,[self.d,1])*([1]*(size2-1))) y1 = y1_unnormalized/np.sqrt(sum(y1_unnormalized**2)) y2 = y2_unnormalized/np.sqrt(sum(y2_unnormalized**2)) y1c1 = self.alpha*(y1-c1) y2c2 = self.alpha*(y2-c2) # Eq. 
(4) in the paper: reconstruction error J = 1.0/2.0*sum((y1c1)*(y1-c1) + (y2c2)*(y2-c2)) # finding the pair with smallest reconstruction error for constructing sentree J_min= min(J) J_minpos=np.argmin(J) """ 只有非叶子节点才会有重构节点,因此,sentree.node_y1c1需要从sl+j开始存y1c1. """ sentree.node_y1c1[:,sl+j] = y1c1[:,J_minpos] sentree.node_y2c2[:,sl+j] = y2c2[:,J_minpos] sentree.nodeDelta_out1[:,sl+j] = np.dot(norm1tanh_prime(y1_unnormalized[:,J_minpos]) , y1c1[:,J_minpos]) sentree.nodeDelta_out2[:,sl+j] = np.dot(norm1tanh_prime(y2_unnormalized[:,J_minpos]) , y2c2[:,J_minpos]) #一对节点被选中以后,需要删除words_embedded对应的向量 #还要把合成的节点加入words_embedded words_embedded=np.delete(words_embedded,J_minpos+1,1) words_embedded[:,J_minpos]=p_norm1[:,J_minpos] sentree.nodeFeatures[:, sl+j] = p_norm1[:,J_minpos] sentree.nodeFeatures_unnormalized[:, sl+j]= p[:,J_minpos] sentree.nodeScores[sl+j] = J_min # pp存的可能是父节点信息,因为两个孩子拥有同一个父亲 sentree.pp[collapsed_sentence[J_minpos]] = sl+j sentree.pp[collapsed_sentence[J_minpos+1]] = sl+j sentree.kids[sl+j,:] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos+1]] sentree.numkids[sl+j] = sentree.numkids[sentree.kids[sl+j,0]] + sentree.numkids[sentree.kids[sl+j,1]] freq=np.delete(freq,J_minpos+1) freq[J_minpos] = (sentree.numkids[sentree.kids[sl+j,0]]*freq1[J_minpos] + sentree.numkids[sentree.kids[sl+j,1]]*freq2[J_minpos])/(sentree.numkids[sentree.kids[sl+j,0]]+sentree.numkids[sentree.kids[sl+j,1]]) collapsed_sentence=np.delete(collapsed_sentence,J_minpos+1) collapsed_sentence[J_minpos]=sl+j print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@") print(sentree.pp) print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^") print(sentree.kids) return sentree
def forward(self, X):
    return softmax(X.dot(self.W) + self.b)
def forward(self, x, t):
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss
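# Hedged companion sketch for the softmax + cross-entropy layer above: with one-hot
# targets, the backward pass of this combined layer is simply (y - t) / batch_size,
# which matches the `dy` used in the two-layer-net gradient code earlier in this file.
import numpy as np

def _row_softmax(x):
    e = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

x = np.array([[2.0, 1.0, 0.1]])
t = np.array([[0.0, 1.0, 0.0]])
y = _row_softmax(x)
dx = (y - t) / x.shape[0]
print(dx)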