def errors(self, y, mean = False):
    """Compare the stored predictions ``self.y_pred`` with the targets ``y``.

    :param y: targets, compared element-wise against ``self.y_pred``
    :param mean: when true, return the mean of the mismatch indicator
        instead of the indicator itself
    :return: ``T.neq(self.y_pred, y)`` or its mean
    :raises RuntimeError: if ``self.CONNECTED`` is falsy
    """
    if not self.CONNECTED:
        raise RuntimeError("Asked to compute errors, but I'm not connected atm")

    mismatches = T.neq(self.y_pred, y)
    return T.mean(mismatches) if mean else mismatches
def step(self, y_m, yb_m, hf, cf, hb, cb):
    """Advance the forward and backward LSTMs by one step, honouring the mask.

    ``y_m`` / ``yb_m`` are the forward / backward label inputs; the original
    author expected them to be of shape ``batch_size (x 1)`` (unconfirmed).
    A label value of 0 is treated as "masked": for those rows the previous
    hidden/cell state is carried over unchanged.

    :return: the tuple ``(hf, cf, hb, cb)`` of updated states
    """
    # Debug output kept from the original implementation.
    print(y_m.ndim)

    # One-hot encode the labels; the raw values are still needed for masking.
    fwd_onehot = to_one_hot(y_m, self.bs, self.K)
    bwd_onehot = to_one_hot(yb_m, self.bs, self.K)

    # Project the one-hot labels into the LSTM input space.
    fwd_input = self.forward_in.run(fwd_onehot)
    bwd_input = self.backward_in.run(bwd_onehot)

    # Candidate next states for both directions.
    hf_next, cf_next = self.forward_lstm.run(fwd_input, hf, cf)
    hb_next, cb_next = self.backward_lstm.run(bwd_input, hb, cb)

    # Column-shaped indicators so they broadcast across the state width.
    fwd_active = T.neq(y_m.reshape((self.bs, 1)), 0)
    bwd_active = T.neq(yb_m.reshape((self.bs, 1)), 0)

    # Only take the new state where the corresponding label is non-zero.
    return (
        T.switch(fwd_active, hf_next, hf),
        T.switch(fwd_active, cf_next, cf),
        T.switch(bwd_active, hb_next, hb),
        T.switch(bwd_active, cb_next, cb),
    )
def ber(y, pred):
    """Balanced error rate of ``pred`` against ``y`` with label 1 as positive.

    Returns the average of the false-positive rate (among entries where
    ``y != 1``) and the false-negative rate (among entries where ``y == 1``).
    No guard is applied when one of the two groups is empty.
    """
    y_is_pos = tensor.eq(y, 1)
    y_is_neg = tensor.neq(y, 1)
    pred_is_pos = tensor.eq(pred, 1)
    pred_is_neg = tensor.neq(pred, 1)

    def _count(indicator):
        # Sum the indicator and move it to floatX so the ratios are floats.
        return tensor.cast(indicator.sum(), dtype=theano.config.floatX)

    true_neg = _count(y_is_neg * pred_is_neg)
    false_pos = _count(y_is_neg * pred_is_pos)
    false_neg = _count(y_is_pos * pred_is_neg)
    true_pos = _count(y_is_pos * pred_is_pos)

    false_pos_rate = false_pos / (true_neg + false_pos)
    false_neg_rate = false_neg / (false_neg + true_pos)
    return (false_pos_rate + false_neg_rate) / numpy.float32(2)
def getRpRnTpTnForTrain0OrVal1(self, y, training0OrValidation1):
    """Build per-class counts of real/true-predicted positives and negatives.

    For every class (in natural order, background included) four symbolic
    sums are appended to the returned flat list: real positives, real
    negatives, true predicted positives, true predicted negatives.

    :param y: target labels; the original note describes it as
        ``T.itensor4('y')`` with dimensions [batchSize, r, c, z]
    :param training0OrValidation1: 0 selects ``self.y_pred_train``, any other
        value selects ``self.y_pred_val``
    :return: list of ``4 * self._numberOfOutputClasses`` symbolic sums
    """
    is_training = training0OrValidation1 == 0
    predictions = self.y_pred_train if is_training else self.y_pred_val
    checkDimsOfYpredAndYEqual(y, predictions, "training" if is_training else "validation")

    counts = []
    for label in range(0, self._numberOfOutputClasses):
        real_pos = T.eq(y, label)
        real_neg = T.neq(y, label)
        pred_pos = T.eq(predictions, label)
        pred_neg = T.neq(predictions, label)
        counts.extend([
            T.sum(real_pos),
            T.sum(real_neg),
            T.sum(T.and_(real_pos, pred_pos)),
            T.sum(T.and_(real_neg, pred_neg)),
        ])
    return counts
def __init__(self, rng, batchsize, epochs=100, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-08, l1_weight=0.0, l2_weight=0.1, cost='mse'):
    """Store the hyper-parameters and select the prediction/cost/error callables.

    :param rng: random generator; ``rng.randint`` seeds the Theano stream
    :param batchsize: minibatch size
    :param epochs: number of training epochs
    :param alpha, beta1, beta2, eps: optimiser hyper-parameters (stored only)
    :param l1_weight, l2_weight: regularisation weights (stored only)
    :param cost: ``'mse'``, ``'binary_cross_entropy'``, ``'cross_entropy'``,
        or any other object, which is stored as ``self.cost`` unchanged

    Convention: unlabelled examples are marked with an all-zero target vector
    instead of a one-hot vector; indexing with ``T.nonzero(y)`` restricts the
    cost to the non-zero target entries.
    """
    self.alpha = alpha
    self.beta1 = beta1
    self.beta2 = beta2
    self.eps = eps
    self.l1_weight = l1_weight
    self.l2_weight = l2_weight
    self.rng = rng
    self.theano_rng = RandomStreams(rng.randint(2 ** 30))
    self.epochs = epochs
    self.batchsize = batchsize

    # The prediction is the raw network output for every cost type.
    self.y_pred = lambda network, x: network(x)

    if cost == 'mse':
        self.error = lambda network, y_pred, y: T.zeros((1,))
        # Square the residual, not just the target: the original expression
        # was ``a - y**2`` because of a misplaced parenthesis.
        # NOTE(review): unlike the other branches, this cost receives the
        # network input ``x`` rather than ``y_pred`` - confirm with callers.
        self.cost = lambda network, x, y: T.mean(
            (network(x)[T.nonzero(y)] - y[T.nonzero(y)]) ** 2)
    elif cost == 'binary_cross_entropy':
        self.cost = lambda network, y_pred, y: T.nnet.binary_crossentropy(
            y_pred[T.nonzero(y)], y[T.nonzero(y)]).mean()
        # classification error
        self.error = lambda network, y_pred, y: T.mean(
            T.neq(T.argmax(y_pred, axis=1), T.argmax(y, axis=1)))
    elif cost == 'cross_entropy':
        self.cost = lambda network, y_pred, y: T.nnet.categorical_crossentropy(
            y_pred[T.nonzero(y)], y[T.nonzero(y)]).mean()
        # classification error
        self.error = lambda network, y_pred, y: T.mean(
            T.neq(T.argmax(y_pred, axis=1), T.argmax(y, axis=1)))
    else:
        # Anything else is taken to be a user-supplied cost callable.
        self.error = lambda network, y_pred, y: T.zeros((1,))
        self.cost = cost
def errors(self, y):
    """Return the zero-one loss of the minibatch as a symbolic mean.

    The result is the number of mismatches between ``self.y_pred`` and ``y``
    divided by the number of examples in the minibatch.

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    :raises TypeError: if ``y`` and ``self.y_pred`` differ in ``ndim``
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError(
            'y should have the same shape as self.y_pred',
            ('y', y.type, 'y_pred', self.y_pred.type)
        )
    # The original branched on ``y.dtype.startswith('int')`` but returned the
    # same expression in both branches, so the branch was dead weight.
    # T.neq yields 0s and 1s, where 1 marks a wrong prediction.
    return T.mean(T.neq(self.y_pred, y))
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    Build the symbolic weight-gradient estimate for one step and register the
    updates of the internal shared state via ``add_update``.

    :param xs: input signal; entries different from 0 count as "spikes".
        Presumably of shape (n_samples, n_in) to match the ``xr`` state - TODO confirm.
    :param es: error signal; entries different from 0 count as "spikes".
        Presumably of shape (n_samples, n_out) to match the ``er`` state - TODO confirm.
    :param kp_x: coefficient for the x signal, converted with ``as_floatx``
    :param kd_x: coefficient for the x signal, converted with ``as_floatx``
    :param kp_e: coefficient for the e signal, converted with ``as_floatx``
    :param kd_e: coefficient for the e signal, converted with ``as_floatx``
    :param shapes: the triple (n_samples, n_in, n_out) used to size the state
    :param epsilon: not used anywhere in this function body
    :return: ``dws`` summed over axis 0
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    # v1 holds the outer product of the traces as of each pair's last spike.
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    # Per-step decay factors of the two traces.
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    # Running traces of the x and e signals.
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr*rx
    er_decayed = er*re
    # A (sample, in, out) entry is active when either of its two ends spiked.
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None]*er_decayed[:, None, :]
    # NOTE(review): the denominator is zero when rx*re == 1 - confirm callers exclude that.
    dws = (spikes*(v2-v1))/(rx*re-1)
    new_xr = xr_decayed + xs/(kp_x+kd_x)
    new_er = er_decayed + es/(kp_e+kd_e)
    # v1 is refreshed only at the entries that spiked on this step.
    add_update(v1, tt.switch(spikes, new_xr[:, :, None]*new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)
    return dws.sum(axis=0)
def get_monitoring_channels(self, model, X, Y = None):
    """Return the monitoring channels for ``model`` on the batch ``X``.

    When ``self.supervised`` is set, ``Y`` is required and the misclassification
    rate of the last mean-field state is reported as ``'misclass'``. If the
    model has more than one hidden layer, the misclassification rate of the
    label reconstruction is reported as ``'recons_misclass'``.

    :return: an ``OrderedDict`` of channel name -> symbolic value (empty when
        not supervised)
    """
    channels = OrderedDict()

    # The mean-field pass runs regardless of the supervised flag.
    final_state = model.mf(X, return_history = True)[-1]

    if not self.supervised:
        return channels

    assert Y is not None
    true = T.argmax(Y, axis=1)

    def _misclass_rate(label_scores):
        # Fraction of rows whose arg-max differs from the true label.
        wrong = T.neq(true, T.argmax(label_scores, axis=1))
        return T.cast(wrong.mean(), X.dtype)

    channels['misclass'] = _misclass_rate(final_state[-1])

    if len(model.hidden_layers) > 1:
        clamped_state = model.mf(X, Y = Y)
        penultimate = model.hidden_layers[-2].upward_state(clamped_state[-2])
        reconstruction = model.hidden_layers[-1].mf_update(state_below = penultimate)
        channels['recons_misclass'] = _misclass_rate(reconstruction)

    return channels
def theano_metrics(y_pred, y_true, n_classes, void_labels):
    """
    Returns the intersection I and union U (to compute the jaccard I/U) and the accuracy.

    :param y_pred: tensor of predictions. shape (b*0*1, c) with c = n_classes
    :param y_true: groundtruth, shape (b,0,1) or (b,c,0,1) with c=1
    :param n_classes: int
    :param void_labels: list of indexes of void labels (may be empty)
    :return: return tensors I and U of size (n_classes), and scalar acc
    """
    # Put y_pred and y_true under the same shape
    y_true = T.flatten(y_true)
    y_pred = T.argmax(y_pred, axis=1)

    # We use not_void in case the prediction falls in the void class of the
    # groundtruth: it is 1 wherever the target is not a void label.
    not_void = None
    for void_label in void_labels:
        keep = T.neq(y_true, void_label)
        not_void = keep if not_void is None else not_void * keep
    if not_void is None:
        # No void labels: every position counts. Previously ``not_void`` was
        # never bound in this case and the function failed with a NameError.
        not_void = T.ones_like(y_true)

    I = T.zeros(n_classes)
    U = T.zeros(n_classes)

    for i in range(n_classes):
        y_true_i = T.eq(y_true, i)
        y_pred_i = T.eq(y_pred, i)
        I = T.set_subtensor(I[i], T.sum(y_true_i * y_pred_i))
        U = T.set_subtensor(U[i], T.sum(T.or_(y_true_i, y_pred_i) * not_void))

    accuracy = T.sum(I) / T.sum(not_void)

    return I, U, accuracy
def errors(self, y):
    """Masked error rate of ``self.y_pred`` against integer targets ``y``.

    Targets equal to -1 are excluded from both the mismatch count and the
    normaliser.

    :raises NotImplementedError: unless ``y`` has an integer dtype and 3 dims
    """
    if not (y.dtype.startswith('int') and y.ndim == 3):
        raise NotImplementedError()

    valid = T.neq(y, -1)
    n_valid = T.sum(valid, dtype='float32')
    n_wrong = T.sum(T.neq(self.y_pred, y) * valid)
    return n_wrong / n_valid
def __call__(self, model, X, Y, ** kwargs):
    """Return the symbolic training cost of ``model`` on the batch (X, Y).

    The forward pass uses dropout when ``self.use_dropout`` is set. When
    ``self.missing_target_value`` is not None, targets equal to that value are
    excluded: their entries of the cost matrix are zeroed and the sum is
    divided by the number of remaining targets (requires the 'default' cost
    type). Otherwise the cost is selected by ``self.cost_type``
    ('default', 'nll' or 'crossentropy').

    :raises NotImplementedError: for an unknown ``self.cost_type``
    """
    if self.use_dropout:
        Y_hat = model.dropout_fprop(
            X,
            default_input_include_prob=self.default_input_include_prob,
            input_include_probs=self.input_include_probs,
            default_input_scale=self.default_input_scale,
            input_scales=self.input_scales
        )
    else:
        Y_hat = model.fprop(X)

    if self.missing_target_value is not None:
        assert (self.cost_type == 'default')
        costMatrix = model.layers[-1].cost_matrix(Y, Y_hat)
        # 1 where a target is present, 0 where it equals the missing marker.
        # The same mask now drives both the zeroing and the normaliser; the
        # normaliser used to hard-code -1 and so disagreed with the mask for
        # any other marker value.
        present = T.neq(Y, self.missing_target_value)
        costMatrix = costMatrix * present
        cost = costMatrix.sum() / present.sum()
        cost = T.cast(cost, 'float32')
    else:
        if self.cost_type == 'default':
            cost = model.cost(Y, Y_hat)
        elif self.cost_type == 'nll':
            cost = (-Y * T.log(Y_hat)).sum(axis=1).mean()
        elif self.cost_type == 'crossentropy':
            cost = (-Y * T.log(Y_hat) - (1 - Y)
                    * T.log(1 - Y_hat)).sum(axis=1).mean()
        else:
            raise NotImplementedError()
    return cost
def get_tagging_channels_from_state(self, state, target):
    """Build the tagging monitoring channels for ``state`` against ``target``.

    Outputs and targets are binarised at 0.5. Targets equal to -1 are treated
    as missing and excluded from every count. Reported channels: mistagging
    rate, global precision/recall/F1, max/mean/min of the per-output
    precision/recall/F1, and the mean-average-precision channels computed by
    ``meanAvgPrec.meanAveragePrecisionTheano``.

    :return: ``OrderedDict`` of channel name -> symbolic value
    """
    def _f1(precision, recall):
        # F1 = 2PR / (P + R), defined as 0 when P + R == 0. The previous
        # denominator ``T.maximum(1, P + R)`` under-reported F1 whenever
        # P + R < 1 (e.g. P = R = 0.3 gave 0.18 instead of 0.3).
        total = precision + recall
        return 2. * precision * recall / T.switch(T.eq(total, 0), 1., total)

    missingValuesFilter = T.neq(target, -1)
    rval = OrderedDict()

    y_hat = state > 0.5
    y = target > 0.5
    wrong_bit = T.cast(T.neq(y, y_hat), state.dtype) * missingValuesFilter
    rval['mistagging'] = T.cast(wrong_bit.sum() / missingValuesFilter.sum(),
                                state.dtype)

    y = T.cast(y, state.dtype)
    y_hat = T.cast(y_hat, state.dtype)

    # Global counts; T.maximum(1., count) only guards the empty case.
    tp = (y * y_hat * missingValuesFilter).sum()
    fp = ((1-y) * y_hat * missingValuesFilter).sum()
    precision = tp / T.maximum(1., tp + fp)
    recall = tp / T.maximum(1., (y * missingValuesFilter).sum())
    rval['precision'] = precision
    rval['recall'] = recall
    rval['f1'] = _f1(precision, recall)

    # Per-output counts (summed over axis 0).
    tp = (y * y_hat * missingValuesFilter).sum(axis=0)
    fp = ((1-y) * y_hat * missingValuesFilter).sum(axis=0)
    precision = tp / T.maximum(1., tp + fp)

    rval['per_output_precision.max'] = precision.max()
    rval['per_output_precision.mean'] = precision.mean()
    rval['per_output_precision.min'] = precision.min()

    recall = tp / T.maximum(1., (y * missingValuesFilter).sum(axis=0))

    rval['per_output_recall.max'] = recall.max()
    rval['per_output_recall.mean'] = recall.mean()
    rval['per_output_recall.min'] = recall.min()

    f1 = _f1(precision, recall)

    rval['per_output_f1.max'] = f1.max()
    rval['per_output_f1.mean'] = f1.mean()
    rval['per_output_f1.min'] = f1.min()

    # Add computation of the mean average precision
    from pylearn2_ECCV2014 import meanAvgPrec
    (rval['min_avg_prec'],
     rval['mean_avg_prec'],
     rval['max_avg_prec'],
     rval['mean_avg_prec_AnswerPhone'],
     rval['mean_avg_prec_DriveCar'],
     rval['mean_avg_prec_Eat'],
     rval['mean_avg_prec_FightPerson'],
     rval['mean_avg_prec_GetOutCar'],
     rval['mean_avg_prec_HandShake'],
     rval['mean_avg_prec_HugPerson'],
     rval['mean_avg_prec_Kiss'],
     rval['mean_avg_prec_Run'],
     rval['mean_avg_prec_SitDown'],
     rval['mean_avg_prec_SitUp'],
     rval['mean_avg_prec_StandUp']) = meanAvgPrec.meanAveragePrecisionTheano(target, state)

    return rval
def multiclassRealPosAndNegAndTruePredPosNegTraining0OrValidation1(self, y, training0OrValidation1):
    """
    The returned list has (numberOfClasses)x4 integers: >numberOfRealPositives, numberOfRealNegatives, numberOfTruePredictedPositives, numberOfTruePredictedNegatives< for each class (incl background).
    Order in the list is the natural order of the classes (ie class-0 RP,RN,TPP,TPN, class-1 RP,RN,TPP,TPN, class-2 RP,RN,TPP,TPN ...)

    :param y: target labels
    :param training0OrValidation1: 0 uses ``self.y_pred`` (training), any
        other value uses ``self.y_pred_inference`` (validation)
    """
    # The prediction tensor does not depend on the class, so pick it once
    # instead of on every loop iteration.
    if training0OrValidation1 == 0:
        yPredToUse = self.y_pred
    else:
        yPredToUse = self.y_pred_inference

    returnedListWithNumberOfRpRnPpPnForEachClass = []
    # ``range`` instead of ``xrange`` so the loop also runs on Python 3.
    for class_i in range(0, self.numberOfOutputClasses):
        # Counts are reported PER CLASS (first for WHOLE).
        vectorOneAtRealPositives = T.eq(y, class_i)
        vectorOneAtRealNegatives = T.neq(y, class_i)
        vectorOneAtPredictedPositives = T.eq(yPredToUse, class_i)
        vectorOneAtPredictedNegatives = T.neq(yPredToUse, class_i)
        vectorOneAtTruePredictedPositives = T.and_(vectorOneAtRealPositives, vectorOneAtPredictedPositives)
        vectorOneAtTruePredictedNegatives = T.and_(vectorOneAtRealNegatives, vectorOneAtPredictedNegatives)

        returnedListWithNumberOfRpRnPpPnForEachClass.extend([
            T.sum(vectorOneAtRealPositives),
            T.sum(vectorOneAtRealNegatives),
            T.sum(vectorOneAtTruePredictedPositives),
            T.sum(vectorOneAtTruePredictedNegatives),
        ])

    return returnedListWithNumberOfRpRnPpPnForEachClass
def trainer(X,Y,alpha,lr,predictions,updates,data,labels):
    """Compile the training and testing functions over a slice of the data.

    ``data`` and ``labels`` are wrapped with ``U.create_shared`` (as int8) and
    substituted for ``X`` / ``Y`` on the slice ``[start:end]``. Both compiled
    functions return the mean misclassification rate of ``predictions``; the
    training function additionally takes ``alpha`` and ``lr`` as inputs and
    applies ``updates``.

    :return: ``(train_model, test_model)``
    """
    shared_data = U.create_shared(data, dtype=np.int8)
    shared_labels = U.create_shared(labels,dtype=np.int8)
    index_start = T.lscalar('start')
    index_end = T.lscalar('end')

    def error_rate():
        # Share of rows whose arg-max prediction differs from the label.
        return T.mean(T.neq(T.argmax(predictions, axis=1), Y))

    def batch_givens():
        # Replace the symbolic inputs by the requested slice of the shared data.
        return {
            X: shared_data[index_start:index_end],
            Y: shared_labels[index_start:index_end]
        }

    print("Compiling function...")
    train_model = theano.function(
        inputs = [index_start,index_end,alpha,lr],
        outputs = error_rate(),
        updates = updates,
        givens = batch_givens()
    )
    test_model = theano.function(
        inputs = [index_start,index_end],
        outputs = error_rate(),
        givens = batch_givens()
    )
    print("Done.")
    return train_model,test_model
def each_loss(outpt, inpt):
    """Loss of one sample, built from a CTC-style forward recursion.

    :param outpt: per-step network outputs, indexed as ``outpt[:, labels]``
        (presumably steps x classes - TODO confirm)
    :param inpt: the label sequence after it has been padded with blanks
        (blank index 26)
    :return: symbolic loss value
    """
    # inpt is the answer after it has been padded with blanks
    blank = 26
    y_nblank = T.neq(inpt, blank)
    n = T.dot(y_nblank, y_nblank)  # actual number of (non-blank) characters
    N = 2 * n + 1  # length after blank padding, dropping the surplus trailing padding
    labels = inpt[:N]
    labels2 = T.concatenate((labels, [blank, blank]))
    # 1 where a label differs from the label two positions later and the
    # position in between is a blank.
    sec_diag = T.neq(labels2[:-2], labels2[2:]) * T.eq(labels2[1:-1], blank)
    # Transition matrix: stay, advance by one, or (where sec_diag allows) by two.
    recurrence_relation = \
        T.eye(N) + \
        T.eye(N, k=1) + \
        T.eye(N, k=2) * sec_diag.dimshuffle((0, 'x'))
    pred_y = outpt[:, labels]
    # Forward pass. Where the weighted step would be exactly 0, the unweighted
    # propagated value is kept instead.
    fwd_pbblts, _ = theano.scan(
        lambda curr, accum: T.switch(T.eq(curr*T.dot(accum, recurrence_relation), 0.0), T.dot(accum, recurrence_relation) , curr*T.dot(accum, recurrence_relation)),
        sequences=[pred_y],
        outputs_info=[T.eye(N)[0]]
    )
    # Sum of the last two entries of the final step.
    liklihood = fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]
    # Piecewise loss: a quadratic in the value where it is <= 1.0, and the
    # negative log of the value where it is > 1.0.
    # NOTE(review): earlier variants (plain -log, clipping at 1e-35) were
    # commented out by the author - confirm this piecewise form is intended.
    loss = (T.le(liklihood, 1.0)*(10*(liklihood-1)*(liklihood-100)))+(T.gt(liklihood, 1.0)*(-T.log(T.cast(liklihood, "float32"))))
    return loss
def f1_score(self, y, labels=(0, 2)):
    """
    Mean F1 score between two classes (positive and negative as specified
    by the labels array).

    :param y: true labels, compared against ``self.y_pred``
    :param labels: two-element sequence ``(negative_label, positive_label)``
    :return: the average of the two per-class F1 scores, scaled by 100
    """
    y_tr = y
    y_pr = self.y_pred

    correct = T.eq(y_tr, y_pr)
    wrong = T.neq(y_tr, y_pr)

    def _class_f1(label):
        # F1 of a single class, treating ``label`` as the positive class.
        is_true_label = T.eq(y_tr, label)
        tp = T.cast(T.sum(correct * is_true_label), theano.config.floatX)
        fp = T.sum(wrong * T.eq(y_pr, label))
        fn = T.sum(is_true_label * T.neq(y_pr, label))
        # Counts are integers, so maximum(1, count) only guards the 0 case.
        precision = tp / T.maximum(1, tp + fp)
        recall = tp / T.maximum(1, tp + fn)
        # F1 = 2PR / (P + R), defined as 0 when P + R == 0. The previous
        # ``T.maximum(1, P + R)`` denominator distorted F1 whenever P + R < 1.
        total = precision + recall
        return 2. * precision * recall / T.switch(T.eq(total, 0), 1., total)

    f1_neg = _class_f1(labels[0])
    f1_pos = _class_f1(labels[1])

    return 0.5 * (f1_pos + f1_neg) * 100
def get_train(self, batchsize=None, testsize=None):
    """Compile and return the training and testing functions.

    The training function maps ``(images, labels)`` to ``[cost, error]`` and
    applies every layer's updates; the cost is the mean negative log of the
    softmax output at the true label. The testing function maps
    ``(images, labels)`` to the mean misclassification rate.

    :param batchsize: forwarded to ``self._propup`` for the training graph
    :param testsize: forwarded to ``self._propup`` for the testing graph
    :return: ``(train, test)``
    """
    sx = tt.tensor4()
    sy = tt.ivector()

    # --- make train function
    yc = self._propup(sx, batchsize, noise=False)
    # The alternative hinge-loss cost sat behind an ``if 1: ... else:`` toggle
    # and could never run; the unreachable branch has been removed.
    cost = -tt.log(tt.nnet.softmax(yc))[tt.arange(sy.shape[0]), sy].mean()
    error = tt.neq(tt.argmax(yc, axis=1), sy).mean()

    # get updates
    params = self.params
    grads = dict(zip(params, theano.grad(cost, params)))
    updates = collections.OrderedDict()
    for layer in self.layers:
        updates.update(layer.updates(grads))

    train = theano.function(
        [sx, sy], [cost, error], updates=updates)

    # --- make test function
    y_pred = tt.argmax(self._propup(sx, testsize, noise=False), axis=1)
    error = tt.mean(tt.neq(y_pred, sy))
    test = theano.function([sx, sy], error)

    return train, test
def get_output_for(self, input, **kwargs):
    '''
    Average the word embeddings of every sentence in the batch.

    The input is a batch of matrices of word vectors. The output is the sum
    of the word embeddings (over axis 2) divided by the number of words whose
    embedding has at least one non-zero component. The result is optionally
    passed through gradient clipping.
    '''
    # All-zero embeddings contribute nothing to the sum.
    sums = input.sum(axis=2)

    # Number of non-zero components per word embedding ...
    nonzero_components = T.neq(input, 0.0).sum(axis=3, dtype='int32')
    # ... turned into a 0/1 flag per word ...
    word_is_present = T.neq(nonzero_components, 0.0)
    # ... and counted per sentence. The count is produced as floatX so that
    # 'sums / normalisers' is not upcast to float64 by Theano.
    # NOTE(review): a sentence without any non-zero embedding yields a zero
    # normaliser here - confirm that cannot occur upstream.
    word_counts = word_is_present.sum(axis=2, dtype='floatX')
    normalisers = word_counts.reshape((-1, self.iNrOfSentences, 1))

    averages = sums / normalisers

    bound = self.fGradientClippingBound
    if bound is not None:
        averages = theano.gradient.grad_clip(averages, - bound, bound)

    return averages
def nll_simple(Y, Y_hat, cost_mask=None, cost_ent_mask=None, cost_ent_desc_mask=None):
    """Negative log-likelihood and error rates of ``Y_hat`` against ``Y``.

    :param Y: target indices
    :param Y_hat: predicted probabilities (arg-max is taken over axis 1)
    :param cost_mask: optional mask applied to the log-likelihood and errors
    :param cost_ent_mask: optional mask over the output axis applied before
        the arg-max (broadcast with ``dimshuffle('x', 0)``)
    :param cost_ent_desc_mask: optional mask multiplied with the
        probabilities before the arg-max
    :return: ``(ave, mean_errors, ent_errors, ent_desc_errors)``; the last two
        are None when the corresponding mask is not given
    """
    probs = Y_hat

    def _mismatch(scores):
        # 1 where the arg-max of ``scores`` differs from the target.
        return TT.neq(TT.argmax(scores, axis=1).reshape(Y.shape), Y)

    errors = _mismatch(probs)

    ent_errors = None
    if cost_ent_mask is not None:
        ent_errors = _mismatch(probs * cost_ent_mask.dimshuffle('x', 0)).mean()

    ent_desc_errors = None
    if cost_ent_desc_mask is not None:
        ent_desc_errors = _mismatch(probs * cost_ent_desc_mask).mean()

    LL = TT.log(_grab_probs(probs, Y) + 1e-8).reshape(Y.shape)

    if cost_mask is None:
        mean_errors = TT.mean(errors)
        ave = -TT.sum(LL) / Y.shape[0]
        return ave, mean_errors, ent_errors, ent_desc_errors

    # Masked variant: errors are averaged over the mask total, while the
    # log-likelihood sum is divided by the size of axis 1 of Y.
    mean_errors = TT.sum(cost_mask * errors) / (TT.sum(cost_mask))
    ave = -TT.sum(cost_mask * LL) / Y.shape[1]
    return ave, mean_errors, ent_errors, ent_desc_errors
def apply_log_domain(self, l, probs, l_len=None, probs_mask=None):
    """Negative log probability of the labellings, computed in the log domain.

    Does the same computation as ``apply``, but alpha is kept in the log
    domain, which avoids numerical underflow.

    :param l: label sequences; ``l.shape[0]`` is the label length L
    :param probs: class probabilities; ``probs.shape[1]`` is the batch size B
        and ``probs.shape[2] - 1`` is used as the blank index C
    :param l_len: per-example label lengths (required despite the default)
    :param probs_mask: per-step mask scanned alongside ``probs`` (required
        despite the default)
    :return: ``-log p(l | probs)`` per batch element
    :raises ValueError: if ``l_len`` or ``probs_mask`` is None
    """
    # Both arguments are dereferenced unconditionally below, so a None default
    # could only ever fail deep inside the graph construction.
    if l_len is None or probs_mask is None:
        raise ValueError("apply_log_domain requires both l_len and probs_mask")

    def _log(a):
        # Clipped so that log(0) becomes a large negative number, not -inf.
        return tensor.log(tensor.clip(a, 1e-12, 1e12))

    def _log_add(a, b):
        # log(exp(a) + exp(b)); note a + b - 2*max == min - max <= 0.
        maximum = tensor.maximum(a, b)
        return (maximum + tensor.log1p(tensor.exp(a + b - 2 * maximum)))

    def _log_mul(a, b):
        return a + b

    B = probs.shape[1]
    C = probs.shape[2]-1
    L = l.shape[0]
    S = 2*L+1

    # Interleave the labels with blanks: blank, l0, blank, l1, ..., blank.
    l_blk = C * tensor.ones((S, B), dtype='int32')
    l_blk = tensor.set_subtensor(l_blk[1::2,:], l)
    l_blk = l_blk.T     # now l_blk is B x S

    # All initial mass on the first position.
    alpha0 = tensor.concatenate([tensor.ones((B, 1)),
                                 tensor.zeros((B, S-1))],
                                axis=1)
    alpha0 = _log(alpha0)

    # A skip of two positions is allowed only onto a non-blank symbol that
    # differs from the symbol two positions earlier.
    l_blk_2 = tensor.concatenate([-tensor.ones((B,2)), l_blk[:,:-2]], axis=1)
    l_case2 = tensor.neq(l_blk, C) * tensor.neq(l_blk, l_blk_2)

    def recursion(p, p_mask, prev_alpha):
        # (Two shifted copies of prev_alpha used to be built here but were
        # never read; they have been removed.)
        alpha_bar1 = tensor.set_subtensor(prev_alpha[:,1:],
                _log_add(prev_alpha[:,1:],prev_alpha[:,:-1]))
        alpha_bar2 = tensor.set_subtensor(alpha_bar1[:,2:],
                _log_add(alpha_bar1[:,2:],prev_alpha[:,:-2]))
        alpha_bar = tensor.switch(l_case2, alpha_bar2, alpha_bar1)

        # Log-probability of each blank-augmented symbol at this step.
        step_log_probs = _log(p[tensor.arange(B)[:,None].repeat(S,axis=1).flatten(),
                                l_blk.flatten()].reshape((B,S)))
        next_alpha = _log_mul(alpha_bar, step_log_probs)
        # Where the mask is off, carry the previous alpha forward.
        next_alpha = tensor.switch(p_mask[:,None], next_alpha, prev_alpha)

        return next_alpha

    alpha, _ = scan(fn=recursion,
                    sequences=[probs, probs_mask],
                    outputs_info=[alpha0])

    last_alpha = alpha[-1]

    # A valid path ends on the last label or on the blank after it.
    prob = _log_add(last_alpha[tensor.arange(B), 2*l_len.astype('int32')-1],
                    last_alpha[tensor.arange(B), 2*l_len.astype('int32')])

    # return the negative log probability of the labellings
    return -prob
def make_report(pars, trainer, data):
    """Evaluate the best parameters on the test set and write a short report.

    The test set is read from a fixed HDF5 file (the ``data`` argument is not
    used, as in the original implementation). The model is temporarily
    switched to ``trainer.best_pars``; classification error, false positives,
    false negatives and the share of "neighbour" mispredictions are printed
    and written to ``./eval_result.txt``; then the previous parameters are
    restored.

    :return: dict with ``'train_loss'``, ``'val_loss'`` (both scored after the
        previous parameters were restored) and ``'best_emp_test_loss'``
    """
    model = trainer.model
    parameters = model.parameters

    # Take a real copy: ``parameters.data`` is overwritten in place below, so
    # a plain reference would alias the best parameters and the final restore
    # would silently do nothing.
    current_pars = parameters.data.copy()

    predicted_class = T.argmax(model.exprs['output'], axis=1)
    target_class = T.argmax(model.exprs['target'], axis=1)

    # The context manager closes the HDF5 file once the datasets are no
    # longer needed (it used to be left open).
    with h5.File('/nthome/maugust/thesis/train_val_test_crafted_real_int.hdf5','r') as test_file:
        TX = test_file['test_set/test_set']
        TZ = test_file['test_labels/real_test_labels']
        TZ = one_hot(TZ,13)

        parameters.data[...] = trainer.best_pars
        try:
            n_wrong = 1 - T.eq(predicted_class, target_class).mean()
            f_n_wrong = model.function(['inpt', 'target'], n_wrong)

            # Class 0 is the "negative" class: a false positive predicts a
            # non-zero class for a class-0 target, and vice versa.
            f_pos = T.mean(T.neq(predicted_class, 0) * T.eq(target_class, 0))
            f_f_pos = model.function(['inpt', 'target'], f_pos)

            f_neg = T.mean(T.eq(predicted_class, 0) * T.neq(target_class, 0))
            f_f_neg = model.function(['inpt', 'target'], f_neg)

            emp_loss = f_n_wrong(TX,TZ)
            f_p = f_f_pos(TX,TZ)
            f_n = f_f_neg(TX,TZ)

            P_pos = np.argmax(model.predict(TX),axis=1)
            Z_pos = np.argmax(TZ, axis=1)

            # Among confusions between two non-zero classes, count how many
            # land on a neighbouring class.
            neighbour_fails = .0
            relevant_fails = 0
            for predicted, actual in zip(P_pos, Z_pos):
                if predicted > 0 and actual > 0 and predicted != actual:
                    relevant_fails += 1
                    if is_neighbour(predicted, actual):
                        neighbour_fails += 1

            if relevant_fails > 0:
                neighbour_fails /= relevant_fails

            # NOTE(review): the values are fractions in [0, 1] but the
            # messages label them with '%' - confirm the intended scale.
            emp_loss_s = 'model achieved %f%% classification error on the test set' %emp_loss
            f_p_s = '\nmodel achieved %f%% false positives on the test set' %f_p
            f_n_s = '\nmodel achieved %f%% false negatives on the test set' %f_n
            neigh_s = '\nmodel achieved %f%% neighbour misspredictions on the test set' %neighbour_fails

            print(emp_loss_s)
            print(f_p_s)
            print(f_n_s)
            print(neigh_s)
            with open(os.path.join('.','eval_result.txt'),'w') as f:
                f.write(emp_loss_s)
                f.write(f_p_s)
                f.write(f_n_s)
                f.write(neigh_s)
        finally:
            # Always put the pre-report parameters back, even on failure.
            parameters.data[...] = current_pars

    return {'train_loss': trainer.score(*trainer.eval_data['train']),
            'val_loss': trainer.score(*trainer.eval_data['val']),
            'best_emp_test_loss': emp_loss}
def errors(self, y):
    """Return the mean zero-one loss of ``self.y_pred`` against ``y``.

    Non-integer targets are reported with a printed warning and cast to
    integers before the comparison.

    :param y: symbolic target labels
    :raises TypeError: if ``y`` and ``self.y_pred`` differ in ``ndim``
    """
    if y.ndim != self.y_pred.ndim:
        raise TypeError("y should have the same shape as self.y_pred",
                        ("y", y.type, "y_pred", self.y_pred.type))

    if not y.dtype.startswith("int"):
        print ("!!! y should be of int type")
        # ``y`` is symbolic here (its dtype is a string), so it must be cast
        # inside the graph; ``numpy.asarray`` cannot convert a symbolic
        # variable to an integer array.
        y = T.cast(y, "int64")

    return T.mean(T.neq(self.y_pred, y))
def errors(self):
    """
    Scaled count of wrong predictions at the positions ``self.i``.

    The prediction is the arg-max of ``self.y_m[self.i]`` over the last axis.
    If ``self.y_data_flat`` has the type of ``T.ivector()`` it is compared
    directly; otherwise its arg-max over the last axis is used as the target.

    :rtype: theano.Variable
    """
    targets_are_indices = self.y_data_flat.type == T.ivector().type

    predicted = T.argmax(self.y_m[self.i], axis=-1)
    targets = self.y_data_flat[self.i]
    if not targets_are_indices:
        targets = T.argmax(targets, axis=-1)

    return self.norm * T.sum(T.neq(predicted, targets))
def __init__(self, layer_sizes, n_samples, alpha, learning_rate, v_prior, batch_size, X_train, y_train, N_train, X_val, y_val, N_val):
    """Build the network and compile the minibatch Theano functions.

    Compiled functions (each takes a minibatch index):
      * ``process_minibatch``: returns the energy and applies the ``adam`` updates
      * ``error_minibatch_train`` / ``error_minibatch_val``: misclassification rate
      * ``ll_minibatch_val``: mean predictive log-likelihood on validation data

    ``self.network.update_randomness()`` is called last.
    """
    self.batch_size = batch_size
    self.N_train, self.X_train, self.y_train = N_train, X_train, y_train
    self.N_val, self.X_val, self.y_val = N_val, X_val, y_val

    # We create the network
    self.network = network.Network(layer_sizes, n_samples, v_prior, N_train)

    # index to a batch
    index = T.lscalar()

    # Input and output variables. The input will be a minibatch replicated
    # n_samples times.
    self.x = T.matrix('x')
    self.y = T.vector('y', dtype = 'int32')

    def minibatch(inputs, targets):
        # Substitute x / y with rows [index * batch_size, (index + 1) * batch_size).
        return {
            self.x: inputs[ index * batch_size: (index + 1) * batch_size ],
            self.y: targets[ index * batch_size: (index + 1) * batch_size ]
        }

    def misclassification_rate():
        # Arg-max of the sample-averaged log-output, compared with the labels.
        log_predictive = LogSumExp(self.network.output(self.x), 0) + T.log(1.0 / n_samples)
        predicted = T.argmax(log_predictive[ 0, :, : ], axis = 1)
        return T.mean(T.neq(predicted, self.y))

    # The logarithm of the values for the likelihood factors
    ll = self.network.log_likelihood_values(self.x, self.y)

    # The energy function for black-box alpha
    scale = -1.0 * N_train / (self.x.shape[ 0 ] * alpha)
    tempered = LogSumExp(alpha * (ll - self.network.log_f_hat()), 0) + T.log(1.0 / n_samples)
    self.estimate_marginal_ll = scale * T.sum(tempered) - self.network.log_normalizer_q() + \
        self.network.log_Z_prior()

    # Theano function for updating q
    self.process_minibatch = theano.function(
        [ index ], self.estimate_marginal_ll,
        updates = adam(self.estimate_marginal_ll, self.network.params, learning_rate),
        givens = minibatch(self.X_train, self.y_train))

    # Theano functions for making predictions
    self.error_minibatch_train = theano.function(
        [ index ], misclassification_rate(),
        givens = minibatch(self.X_train, self.y_train))

    self.error_minibatch_val = theano.function(
        [ index ], misclassification_rate(),
        givens = minibatch(self.X_val, self.y_val))

    self.ll_minibatch_val = theano.function(
        [ index ], T.mean(LogSumExp(ll, 0) + T.log(1.0 / n_samples)),
        givens = minibatch(self.X_val, self.y_val))

    self.network.update_randomness()
def errors(self):
    """
    Scaled count of wrong predictions at the positions ``self.i``.

    ``self.z`` is first flattened over its two leading axes and stored as
    ``self.y_m``. The prediction is the arg-max of ``self.y_m[self.i]`` over
    the last axis. If ``self.y_data_flat`` has the type of ``T.ivector()`` it
    is compared directly; otherwise its arg-max over the last axis is used.

    :rtype: theano.Variable
    """
    z_shape = self.z.shape
    self.y_m = self.z.reshape((z_shape[0]*z_shape[1],z_shape[2]))

    targets_are_indices = self.y_data_flat.type == T.ivector().type

    predicted = T.argmax(self.y_m[self.i], axis=-1)
    targets = self.y_data_flat[self.i]
    if not targets_are_indices:
        targets = T.argmax(targets, axis = -1)

    return self.norm * T.sum(T.neq(predicted, targets))
def get_tagging_channels_from_state(self, state, target):
    """Build the tagging monitoring channels for ``state`` against ``target``.

    Both tensors are first passed through the accumulators
    (``self.outputs_accumulator`` / ``self.targets_accumulator``). Outputs and
    targets are binarised at 0.5. Targets equal to -1 are treated as missing
    and excluded from every count. Reported channels: mistagging rate, global
    precision/recall/F1, max/mean/min of the per-output precision/recall/F1,
    plus one placeholder channel per name in ``self.dummy_channels``.

    :return: ``OrderedDict`` of channel name -> symbolic value
    """
    def _f1(precision, recall):
        # F1 = 2PR / (P + R), defined as 0 when P + R == 0. The previous
        # denominator ``T.maximum(1, P + R)`` under-reported F1 whenever
        # P + R < 1 (e.g. P = R = 0.3 gave 0.18 instead of 0.3).
        total = precision + recall
        return 2.0 * precision * recall / T.switch(T.eq(total, 0), 1.0, total)

    # Before using the state and targets, log them with the accumulator
    state = self.outputs_accumulator(state)
    target = self.targets_accumulator(target)

    missingValuesFilter = T.neq(target, -1)
    rval = OrderedDict()

    y_hat = state > 0.5
    y = target > 0.5
    wrong_bit = T.cast(T.neq(y, y_hat), state.dtype) * missingValuesFilter
    rval["mistagging"] = T.cast(wrong_bit.sum() / missingValuesFilter.sum(), state.dtype)

    y = T.cast(y, state.dtype)
    y_hat = T.cast(y_hat, state.dtype)

    # Global counts; T.maximum(1.0, count) only guards the empty case.
    tp = (y * y_hat * missingValuesFilter).sum()
    fp = ((1 - y) * y_hat * missingValuesFilter).sum()
    precision = tp / T.maximum(1.0, tp + fp)
    recall = tp / T.maximum(1.0, (y * missingValuesFilter).sum())
    rval["precision"] = precision
    rval["recall"] = recall
    rval["f1"] = _f1(precision, recall)

    # Per-output counts (summed over axis 0).
    tp = (y * y_hat * missingValuesFilter).sum(axis=0)
    fp = ((1 - y) * y_hat * missingValuesFilter).sum(axis=0)
    precision = tp / T.maximum(1.0, tp + fp)

    rval["per_output_precision.max"] = precision.max()
    rval["per_output_precision.mean"] = precision.mean()
    rval["per_output_precision.min"] = precision.min()

    recall = tp / T.maximum(1.0, (y * missingValuesFilter).sum(axis=0))

    rval["per_output_recall.max"] = recall.max()
    rval["per_output_recall.mean"] = recall.mean()
    rval["per_output_recall.min"] = recall.min()

    f1 = _f1(precision, recall)

    rval["per_output_f1.max"] = f1.max()
    rval["per_output_f1.mean"] = f1.mean()
    rval["per_output_f1.min"] = f1.min()

    # Define dummy channels with dummy values that will eventually receive
    # meanAvgPrec values from the TrainExtension that computes it.
    # f1.max() is used because it has already been computed, so it costs nothing.
    for dummy in self.dummy_channels:
        rval[dummy] = f1.max()

    return rval
def jaccard_similarity(y_true, y_predicted):
    """
    Jaccard similarity |intersection| / |union| along the last axis.

    y_true: tensor ({1, 0})
    y_predicted: tensor ({1, 0})
    note - we round predicted because float probabilities would not work

    The previous implementation counted the positions where the two tensors
    *differ* inside the union, which is the Jaccard distance (1 - similarity)
    rather than the similarity the function name promises.

    NOTE(review): when neither tensor has a non-zero entry along the last
    axis the union is empty and the result is 0/0; confirm how callers want
    that case handled.
    """
    y_predicted = T.round(y_predicted).astype(theano.config.floatX)
    true_nonzero = T.neq(y_true, 0)
    predicted_nonzero = T.neq(y_predicted, 0)
    intersection = T.and_(true_nonzero, predicted_nonzero)
    union = T.or_(true_nonzero, predicted_nonzero)
    return (intersection.sum(axis=-1, dtype=theano.config.floatX)
            / union.sum(axis=-1, dtype=theano.config.floatX))
def masked_categorical_accuracy(y_true, y_pred, mask):
    """Categorical accuracy over the positions that are not masked out.

    Both inputs are reduced to class indices with an arg-max over the last
    axis. Positions whose true class equals ``mask`` or 0 are ignored; the
    mean of the "prediction equals truth" indicator over the remaining
    positions is returned.
    """
    true_classes = K.argmax(y_true, axis=-1)
    predicted_classes = K.argmax(y_pred, axis=-1)
    hits = K.equal(true_classes, predicted_classes)

    counted = T.and_(T.neq(true_classes, mask), T.neq(true_classes, 0))
    return K.mean(hits[counted.nonzero()])
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike-updates.
    (The original author notes that this still runs very slowly.)

    Only rows of ``dws`` whose input spiked and columns whose error spiked
    are incremented. The last values and the number of steps since the last
    spike are kept in shared variables, updated via ``add_update``.

    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x: coefficient for the x signal, converted with ``as_floatx``
    :param kd_x: coefficient for the x signal, converted with ``as_floatx``
    :param kp_e: coefficient for the e signal, converted with ``as_floatx``
    :param kd_e: coefficient for the e signal, converted with ``as_floatx``
    :param shape: (n_in, n_out)
    :param dws: optional accumulator to increment; zeros of ``shape`` when None
    :return: the incremented ``dws``
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    # Per-step decay factors.
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    # Step counters since the last spike (initialised to 1) and last values.
    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    # First pass: rows whose input spiked.
    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    # Input state after this step: spiking entries are decayed and bumped,
    # and their counters reset to 0.
    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs]+ xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    # Second pass: columns whose error spiked, using the refreshed input state.
    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs]+ es[e_spike_ixs]/as_floatx(kd_e)))
    # Counters advance by one step; a spiking error entry restarts at 1.
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
def errors(self, y):
    """Return the mean zero-one losses of both predictors against ``y``.

    :param y: symbolic integer target labels
    :return: tuple ``(error of self.q_y_pred, error of self.p_y_pred)``
    :raises TypeError: if ``y`` and ``self.q_y_pred`` differ in ``ndim``
    :raises NotImplementedError: if ``y`` does not have an integer dtype
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.q_y_pred.ndim:
        # The message used to reference an undefined name ``target``, so this
        # path raised NameError instead of the intended TypeError.
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.q_y_pred.type))
    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        return T.mean(T.neq(self.q_y_pred, y)), T.mean(T.neq(self.p_y_pred, y))
    else:
        raise NotImplementedError()
def __init__(self, state, data):
    """Build a one-hidden-layer softmax model and compile its evaluation functions.

    :param state: configuration mapping; the keys read here are 'seed',
        'nhid', 'cbs', 'matrix' and 'mbs'
    :param data: dataset object providing ``xdim``, ``ydim`` and the
        ``_valid_x`` / ``_valid_y`` / ``_test_x`` / ``_test_y`` attributes

    ``self.Gvs`` is only defined when ``state['matrix']`` is 'KL' or 'cov'.
    """
    self.rng = numpy.random.RandomState(state['seed'])
    self.srng = RandomStreams(self.rng.randint(1e5))
    self.data = data
    self.nin = int(data.xdim)
    self.state = state
    self.nout = int(data.ydim)

    #######################
    # 0. Training functions
    #######################
    self.x = TT.matrix('X')
    self.y = TT.ivector('y')

    # NOTE(review): eval() runs whatever text the configuration contains;
    # only safe while ``state`` comes from a trusted source.
    n_hidden = eval(str(state['nhid']))
    self.layer0 = HiddenLayerStandard(self.rng, self.x, self.nin, n_hidden, name='layer0')
    self.layer1 = SoftmaxLayerStandard(self.rng, self.layer0.output, n_hidden, self.nout)

    self.params = []
    for layer in (self.layer0, self.layer1):
        self.params += layer.params
    self.best_params = [(param.name, param.get_value()) for param in self.params]
    self.params_shape = [param.get_value(borrow=True).shape for param in self.params]

    ##### PARAMS
    self.inputs = [self.x, self.y]

    # Per-example negative log-probability of the true class.
    inds = TT.constant(numpy.asarray(range(state['cbs']), dtype='int32'))
    cost = -TT.log(self.layer1.output)[inds, self.y]
    self.train_cost = TT.mean(cost)

    matrix_kind = state['matrix']
    if matrix_kind == 'KL':
        self.Gvs = lambda *args:\
            TT.Lop(self.layer1.output, self.params,
                   TT.Rop(self.layer1.output, self.params, args) /
                   (self.layer1.output * state['mbs']))
    elif matrix_kind == 'cov':
        self.Gvs = lambda *args:\
            TT.Lop(cost, self.params,
                   TT.Rop(cost, self.params, args) /
                   (numpy.float32(state['mbs'])))

    # Misclassification rate in percent.
    pred = TT.argmax(self.layer1.output, axis=1)
    self.error = TT.mean(TT.neq(pred, self.y)) * 100.

    #########################
    # 1. Validation functions
    #########################
    givens = {}
    givens[self.x] = self.data._valid_x
    givens[self.y] = self.data._valid_y

    # Debug output kept from the original implementation.
    print("IMPS", [type(x) for x in givens])
    print("IMPS", [x.shape for x in givens])

    self.valid_eval_func = theano.function(
        [], self.error, givens=givens, name='valid_eval_fn', profile=0)

    # Same graph, re-pointed at the test split.
    givens[self.x] = self.data._test_x
    givens[self.y] = self.data._test_y
    self.test_eval_func = theano.function(
        [], self.error, givens=givens, name='test_fn', profile=0)
def build_network():
    """Build the binary-weight CNN and compile its train/validation functions.

    The network takes ``(None, 3, 32, 32)`` inputs and stacks three pairs of
    3x3 convolutions (128, 256, 512 filters; the second of each pair is
    followed by 2x2 max-pooling) and three dense layers (1024, 1024, 10).
    Every conv/dense layer is followed by batch normalisation and a
    nonlinearity layer (rectifier everywhere except the last, which is
    identity).

    Returns
    -------
    tuple
        ``(cnn, train_fn, val_fn)``: the output layer, a compiled function
        ``train_fn(inputs, targets, lr) -> loss`` that applies the updates,
        and ``val_fn(inputs, targets) -> [test_loss, test_err]``.
    """
    import theano.tensor as t
    from collections import OrderedDict

    # Hyper-parameters; each one is echoed to stdout.
    alpha = .1  # exponential moving average factor
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryConnect settings.
    binary = True
    print("binary = " + str(binary))
    stochastic = True
    print("stochastic = " + str(stochastic))
    h = 1.  # (-h, +h) are the two binary values
    print("h = " + str(h))
    # "Glorot" means we are using the coefficients from Glorot's paper
    w_lr_scale = "Glorot"
    print("w_lr_scale = " + str(w_lr_scale))

    # Symbolic inputs, targets and learning rate.
    input_var = t.tensor4('inputs')
    target = t.matrix('targets')
    lr = t.scalar('lr', dtype=theano.config.floatX)

    identity = lasagne.nonlinearities.identity
    rectify = lasagne.nonlinearities.rectify

    def conv3x3(incoming, n_filters):
        # Binary 3x3 convolution, padding 1, no nonlinearity of its own.
        return binary_connect.Conv2DLayer(
            incoming,
            binary=binary,
            stochastic=stochastic,
            H=h,
            W_LR_scale=w_lr_scale,
            num_filters=n_filters,
            filter_size=(3, 3),
            pad=1,
            nonlinearity=identity)

    def dense(incoming, n_units):
        # Binary fully-connected layer, no nonlinearity of its own.
        return binary_connect.DenseLayer(
            incoming,
            binary=binary,
            stochastic=stochastic,
            H=h,
            W_LR_scale=w_lr_scale,
            nonlinearity=identity,
            num_units=n_units)

    def batch_norm_then(incoming, nonlinearity):
        # Batch normalisation followed by a separate nonlinearity layer.
        normed = lasagne.layers.BatchNormLayer(incoming,
                                               epsilon=epsilon,
                                               alpha=alpha)
        return lasagne.layers.NonlinearityLayer(normed,
                                                nonlinearity=nonlinearity)

    def squared_hinge(output):
        return t.mean(t.sqr(t.maximum(0., 1. - target * output)))

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32),
                                    input_var=input_var)

    # 128C3-128C3-P2, 256C3-256C3-P2, 512C3-512C3-P2
    for n_filters in (128, 256, 512):
        cnn = conv3x3(cnn, n_filters)
        cnn = batch_norm_then(cnn, rectify)
        cnn = conv3x3(cnn, n_filters)
        cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
        cnn = batch_norm_then(cnn, rectify)

    # 1024FP-1024FP-10FP
    for n_units, nonlinearity in ((1024, rectify),
                                  (1024, rectify),
                                  (10, identity)):
        cnn = dense(cnn, n_units)
        cnn = batch_norm_then(cnn, nonlinearity)

    # Training graph: squared hinge loss on the stochastic forward pass.
    train_output = lasagne.layers.get_output(cnn, deterministic=False)
    loss = squared_hinge(train_output)

    if binary:
        from itertools import chain
        # Binary weights: dedicated gradients, then clipping/scaling.
        w = lasagne.layers.get_all_params(cnn, binary=True)
        w_grads = binary_connect.compute_grads(loss, cnn)
        binary_updates = lasagne.updates.adam(loss_or_grads=w_grads,
                                              params=w,
                                              learning_rate=lr)
        binary_updates = binary_connect.clipping_scaling(binary_updates, cnn)

        # Remaining trainable parameters get plain Adam updates.
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
        other_updates = lasagne.updates.adam(loss_or_grads=loss,
                                             params=params,
                                             learning_rate=lr)
        updates = OrderedDict(
            chain(binary_updates.items(), other_updates.items()))
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=params,
                                       learning_rate=lr)

    # Evaluation graph: deterministic forward pass.
    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = squared_hinge(test_output)
    mismatches = t.neq(t.argmax(test_output, axis=1),
                       t.argmax(target, axis=1))
    test_err = t.mean(mismatches, dtype=theano.config.floatX)

    # Training step on a mini-batch: applies the updates, returns the loss.
    train_fn = theano.function([input_var, target, lr],
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Validation: returns loss and error rate without updating anything.
    val_fn = theano.function([input_var, target], [test_loss, test_err])

    return cnn, train_fn, val_fn
# !/usr/bin/env python3
######################### # BUILD FINE-TUNE MODEL # ######################### print "\n\n... building fine-tune model -- contraction 1" for imodel in model.models_stack: imodel.threshold = 0. model_ft = model + LogisticRegression( hid_layer_sizes[-1], 10, npy_rng=npy_rng ) model_ft.print_layer() train_set_error_rate = theano.function( [], T.mean(T.neq(model_ft.models_stack[-1].predict(), train_y)), givens = {model_ft.varin : train_x}, ) test_set_error_rate = theano.function( [], T.mean(T.neq(model_ft.models_stack[-1].predict(), test_y)), givens = {model_ft.varin : test_x}, ) print "Done." print "... training with conjugate gradient: minimize.py" fun_cost = theano.function( [model_ft.varin, model_ft.models_stack[-1].vartruth], model_ft.models_stack[-1].cost() + model_ft.models_stack[-1].weightdecay(weightdecay) ) def return_cost(test_params, input_x, truth_y):
def build_decoder(self,
                  hs,
                  x,
                  xmask=None,
                  y=None,
                  y_neg=None,
                  mode=EVALUATION,
                  prev_hd=None,
                  step_num=None):
    """Build the decoder graph for full-sequence evaluation or one step.

    :param hs: states passed to the step function and to the output layer.
    :param x: symbolic input tokens.
    :param xmask: optional mask; when ``None`` it is derived as
        ``T.neq(x, self.eoq_sym)``.
    :param y: targets; required in ``Decoder.EVALUATION`` mode and must be
        ``None`` otherwise.
    :param y_neg: unused in this method.
    :param mode: ``Decoder.EVALUATION`` scans the whole sequence; any other
        mode runs a single step starting from ``prev_hd``.
    :param prev_hd: previous decoder state; required outside evaluation mode
        and must be ``None`` in evaluation mode.
    :param step_num: unused in this method.
    :returns: ``(target_probs, hd, _res)`` in evaluation mode,
        ``(softmax_output, hd)`` in ``Decoder.BEAM_SEARCH`` mode, and
        ``None`` for any other mode.
    """
    # Check parameter consistency.  Use identity tests rather than
    # truthiness: symbolic variables and arrays do not have a reliable
    # boolean value (comparison results raise when used in boolean context).
    if mode == Decoder.EVALUATION:
        assert prev_hd is None
        assert y is not None
    else:
        assert y is None
        assert prev_hd is not None

    # if mode == EVALUATION
    #   xd = (timesteps, batch_size, qdim)
    #
    # if mode != EVALUATION
    #   xd = (n_samples, dim)
    xd = self.approx_embedder(x)
    if xmask is None:
        xmask = T.neq(x, self.eoq_sym)

    # we must zero out the </s> embedding
    # i.e. the embedding x_{-1} is the 0 vector
    # as well as hd_{-1} which will be reset in the scan functions
    if xd.ndim != 3:
        assert mode != Decoder.EVALUATION  # single-step (e.g. beam search)
        xd = (xd.dimshuffle((1, 0)) * xmask).dimshuffle((1, 0))
    else:
        assert mode == Decoder.EVALUATION  # full-sequence evaluation only
        xd = (xd.dimshuffle((2, 0, 1)) * xmask).dimshuffle((1, 2, 0))

    # Initial decoder state: zeros for evaluation, caller-supplied otherwise.
    if mode == Decoder.EVALUATION:
        hd_init = T.alloc(np.float32(0), x.shape[1], self.qdim)
    else:
        hd_init = prev_hd

    if self.query_step_type == "gated":
        f_dec = self.gated_step
        o_dec_info = [hd_init, None, None, None]
    else:
        f_dec = self.plain_step
        o_dec_info = [hd_init]

    # If the mode of the decoder is EVALUATION
    # then we evaluate by default all the sentence
    # xd - i.e. xd.ndim == 3, xd = (timesteps, batch_size, qdim)
    if mode == Decoder.EVALUATION:
        _res, _ = theano.scan(f_dec,
                              sequences=[xd, xmask, hs],
                              outputs_info=o_dec_info)
    # else we evaluate only one step of the recurrence using the
    # previous hidden states and the previous computed hierarchical
    # states.
    else:
        _res = f_dec(xd, xmask, hs, prev_hd)

    # The step may return several outputs; the hidden state comes first.
    if isinstance(_res, (list, tuple)):
        hd = _res[0]
    else:
        hd = _res

    pre_activ = self.build_output_layer(hs, xd, hd)

    # EVALUATION : Return target_probs + all the predicted ranks
    # target_probs.ndim == 3
    if mode == Decoder.EVALUATION:
        target_probs = GrabProbs(self.output_softmax(pre_activ), y)
        return target_probs, hd, _res
    # BEAM_SEARCH : Return output (the softmax layer) + the new hidden states
    elif mode == Decoder.BEAM_SEARCH:
        return self.output_softmax(pre_activ), hd
print 'no ndm' print min_informative_str(ipt) if found > 0: print type(node.op), found try: print '\t', type(node.op.scalar_op) except: pass print count test = CIFAR10(which_set='test', one_hot=True, gcn=55.) yl = T.argmax(yb, axis=1) mf1acc = 1. - T.neq(yl, T.argmax(ymf1, axis=1)).mean() #mfnacc = 1.-T.neq(yl , T.argmax(mfny,axis=1)).mean() batch_acc = function([Xb, yb], [mf1acc]) def accs(): mf1_accs = [] for i in xrange(10000 / batch_size): mf1_accs.append( batch_acc( test.get_topological_view(test.X[i * batch_size:(i + 1) * batch_size, :]), test.y[i * batch_size:(i + 1) * batch_size, :])[0]) return sum(mf1_accs) / float(len(mf1_accs))
def fn(cycles, rng, sm, cum_dmg, sn_0, sn_c, sn_cutoff, fat, n_fat, n_c, m_1, m_2, fat_fact, n_0, m_0, n_cutoff, r_y, r_m, m_s_th):  # y is previous result
    """Step function for the loop over all bins: damage of a single bin.

    Builds a symbolic expression for the damage contributed by one bin of
    ``cycles`` cycles at stress range ``rng`` against a piecewise S-N curve.

    Notes grounded in the code below:

    * The incoming ``sn_0``, ``sn_c`` and ``sn_cutoff`` arguments are
      overwritten before they are read; only the recomputed values are used
      and returned.
    * ``cum_dmg`` is never read.
    * ``rng`` is replaced by the result of ``apply_mean_stress_theory`` when
      both ``m_s_th`` and ``sm`` are non-zero.

    NOTE(review): presumably this is passed to ``theano.scan`` with the
    per-bin values as sequences and the curve parameters as non-sequences --
    confirm against the caller.

    Returns:
        tuple: ``(dmg_per_bin, sn_0, sn_c, sn_cutoff)``.
    """
    cum_damage = np.float64(0)  # never read afterwards

    # Curve break points: stress levels at n_0, n_fat-based knee, n_c and
    # n_cutoff cycles, derived from `fat * fat_fact` and the slopes.
    log10_sn_1 = (tt.log10(fat * fat_fact) + (tt.log10(n_fat) - tt.log10(n_0)) / m_1).astype("float64")
    sn_1 = 10**log10_sn_1
    sn_0 = (10**(log10_sn_1 + tt.log10(n_0) / m_0)).astype("float64")
    sn_c = (10**(tt.log10(fat * fat_fact) - (tt.log10(n_c) - tt.log10(n_fat)) / m_1)).astype("float64")
    sn_cutoff = ( 10**(tt.log10(sn_c) - (tt.log10(n_cutoff) - tt.log10(n_c)) / m_2)).astype("float64")

    # Placeholders; all but `s_factor_stress_per_bin` are reassigned below.
    life = 0
    log10_life = 0
    dmg_per_bin = 0
    s_factor_life_per_bin = 0
    s_factor_stress_per_bin = 0

    # Stress on the curve at `cycles` cycles (0 when the bin is empty); the
    # slope used depends on whether cycles <= n_0, <= n_c, or beyond.
    s_nb = ifelse(
        tt.neq(cycles, 0),
        ifelse(
            tt.le(cycles, n_0),
            (10**(log10_sn_1 + (tt.log10(n_0) - tt.log10(cycles)) / m_0)),
            ifelse(
                tt.le(cycles, n_c),
                (10**(tt.log10(sn_c) + (tt.log10(n_c) - tt.log10(cycles)) / m_1)),
                10**(tt.log10(sn_c) - (tt.log10(cycles) - tt.log10(n_c)) / m_2))),
        0 * cycles)

    # Mean-stress correction of the range, only when a theory is selected
    # and the mean stress is non-zero (the inner test repeats `sm != 0`).
    rng = ifelse(
        tt.neq(m_s_th, 0) & tt.neq(sm, 0),
        ifelse(tt.neq(sm, 0), apply_mean_stress_theory(m_s_th, sm, rng, sn_0, r_m, r_y), rng),
        rng)  # double check if 0 = False

    # Life for the (corrected) range, selected by which curve segment `rng`
    # falls into. Each branch yields [log10_life, life, dmg, s_factor_life].
    log10_life, life, dmg_per_bin, s_factor_life_per_bin = ifelse(
        tt.lt(sn_0, rng),
        # range above sn_0: life 0, damage placeholder 100
        [np.float64(-1), np.float64(0), np.float64(100), np.float64(0)],
        ifelse(
            tt.lt(sn_1, rng),
            [(tt.log10(n_0) - m_0 * (tt.log10(rng) - log10_sn_1)),
             (10**(tt.log10(n_0) - m_0 * (tt.log10(rng) - log10_sn_1))),
             np.float64(0), np.float64(0)],
            ifelse(
                tt.lt(sn_c, rng),
                [(tt.log10(n_c) - m_1 * (tt.log10(rng) - tt.log10(sn_c))),
                 (10**(tt.log10(n_c) - m_1 * (tt.log10(rng) - tt.log10(sn_c)))),
                 np.float64(0), np.float64(0)],
                ifelse(tt.lt(0, rng),
                       # below sn_c: life is capped at n_cutoff
                       [(tt.min([ tt.log10(n_cutoff), tt.log10(n_c) + m_2 * (tt.log10(sn_c) - tt.log10(rng)) ])),
                        (10**tt.min([ tt.log10(n_cutoff), tt.log10(n_c) + m_2 * (tt.log10(sn_c) - tt.log10(rng)) ])).astype("float64"),
                        np.float64(0), np.float64(0)],
                       # non-positive range: life is n_cutoff
                       [ tt.log10(n_cutoff).astype("float64"), n_cutoff.astype("float64"), np.float64(0), np.float64(0) ]))))

    # Damage is cycles / life when life is positive, else zero. This
    # overwrites the `dmg_per_bin` selected above (including the 100).
    dmg_per_bin = ifelse(tt.lt(0, life), cycles / life, 0 * life)  #_rst_dic_per_bin['life'] > 0:
    s_factor_life_per_bin = ifelse(tt.neq(dmg_per_bin, 0.), 1 / dmg_per_bin, np.float64(1))
    # Computed but not returned.
    s_factor_stress_per_bin = tt.min([100.0, s_nb / tt.max([1, rng])])
    return dmg_per_bin, sn_0, sn_c, sn_cutoff
def errors(self, visibles, validate_terms):
    """Build one symbolic validation error per term in ``validate_terms``.

    The visibles whose weight name is listed in ``validate_terms`` are
    replaced by zero-filled shared variables, one ``gibbs_vhv`` pass is run,
    and the sampled values are substituted for those visibles. Each term is
    then scored by feature type: mismatch rate for ``'category'`` and
    ``'binary'``, RMSE for ``'scale'``.

    Side effects: ``visibles`` is modified in place, and
    ``self.output_prediction`` is reset and filled with ``[target,
    prediction]`` pairs in term order.

    :param visibles: list of visible-unit variables, aligned with
        ``self.W_params``.
    :param validate_terms: names of the weights/features to validate.
    :returns: list of symbolic errors, one per validated term.
    :raises NotImplementedError: for an unknown feature type.
    """
    self.output_prediction = []
    targets_by_name = {}

    # Hide the inputs being validated, remembering the true values.
    for idx, (weight, visible) in enumerate(zip(self.W_params, visibles)):
        if weight.name not in validate_terms:
            continue
        targets_by_name[weight.name] = visible
        blank = np.zeros(visible.shape.eval(), dtype=theano.config.floatX)
        visibles[idx] = shared(blank, name=weight.name, borrow=True)

    # One Gibbs pass; the last len(visibles) outputs are the samples.
    sampled = self.gibbs_vhv(visibles)
    sampled = sampled[-len(visibles):]
    for idx, (weight, _, sample) in enumerate(
            zip(self.W_params, visibles, sampled)):
        if weight.name in validate_terms:
            visibles[idx] = sample

    term_errors = []
    for term in validate_terms:
        energy, feature = self.conditional_energy(
            visibles, validate_terms, term)
        kind = feature['type']

        if kind == 'category':
            # for categorical features (n, feature, category)
            probabilities = T.nnet.softmax(energy)
            truth = T.argmax(targets_by_name[term], axis=-1).flatten()
            predicted = T.argmax(probabilities, axis=-1)
            term_error = T.mean(T.neq(truth, predicted))  # mismatch rate
        elif kind == 'scale':
            # for scale features (n, feature)
            norm = self.norms[feature['name']]
            truth = targets_by_name[term].flatten() * norm
            predicted = T.nnet.softplus(energy).flatten() * norm
            term_error = T.sqrt(T.mean(T.sqr(predicted - truth)))  # RMSE
        elif kind == 'binary':
            # for binary features (n, feature)
            truth = targets_by_name[term].flatten()
            prob = T.nnet.sigmoid(energy)
            predicted = T.ceil(prob * 3) - 2.
            term_error = T.mean(T.neq(predicted, truth))  # mismatch rate
        else:
            raise NotImplementedError()

        self.output_prediction.extend([truth, predicted])
        term_errors.append(term_error)

    return term_errors
def __init__(self, options, channel, data): self.rng = numpy.random.RandomState(options['seed']) self.srng = RandomStreams(self.rng.randint(1e5)) self.nin = data['train_x'].shape[1] self.options = options if isinstance(options['hids'], list): self.hids = options['hids'] else: self.hids = eval(str(options['hids'])) self.nout = numpy.int32(numpy.max(data['train_y']) + 1) def gen_mat(nin, nout, name): # NOTE : assumes sigmoid self.rng = numpy.random.RandomState(123) if options['init'] == 'small': lim = numpy.sqrt(1. / nin) vals = self.rng.uniform(size=(nin, nout), low=-lim, high=lim).astype('float32') else: lim = numpy.sqrt(6. / (nin + nout)) print 'Lim used to generate random numbers', lim vals = self.rng.uniform(size=(nin, nout), low=-lim, high=lim).astype('float32') * 4. try: print 'Rank (',nin, ',', nout, '):', \ numpy.linalg.matrix_rank(vals) except: pass var = theano.shared(vals, name=name) print_mem(name) return var def gen_vec(n, name): self.rng = numpy.random.RandomState(123) vals = self.rng.uniform(size=(n, ), low=-.0005, high=.0005).astype('float32') * 0. 
var = theano.shared(vals, name=name) print_mem(name) return var ##### PARAMS all_hids = [self.nin] + self.hids + [self.nout] activs = [TT.nnet.sigmoid] * len(self.hids) + [softmax] #activs = [TT.tanh] * len(self.hids) + [softmax] self.params = [] self.cpu_params = [] self.params_shape = [] for idx, (in_dim, out_dim) in\ enumerate(zip(all_hids[:-1], all_hids[1:])): gpu_W = gen_mat(in_dim, out_dim, name='W%d' % idx) gpu_b = gen_vec(out_dim, name='b%d' % idx) self.params += [gpu_W, gpu_b] self.params_shape.append((in_dim, out_dim)) self.params_shape.append((out_dim, )) self.x = TT.matrix('X') self.y = TT.ivector('y') self.inputs = [self.x, self.y] hid = self.x for idx, activ in zip(range(len(self.params) // 2), activs): W = self.params[idx * 2] b = self.params[idx * 2 + 1] preactiv = TT.dot(hid, W) + b hid = activ(preactiv) self.preactiv_out = preactiv batch_train_cost = -TT.log(hid)[ TT.constant(numpy.asarray(range(options['cbs'])).astype('int32')), self.y] if options['type'] == 'gradCov': self.outs = [batch_train_cost] self.outs_operator = ['linear'] elif options['type'] == 'leroux': self.outs = [batch_train_cost - batch_train_cost.mean()] self.outs_operator = ['linear'] else: self.outs = [hid] self.outs_operator = ['softmax'] self.gf_outs = [hid] self.gf_outs_operator = ['softmax'] self.gc_outs = [batch_train_cost] self.gc_outs_operator = ['linear'] self.train_cost = TT.mean(batch_train_cost) pred = TT.argmax(hid, axis=1) self.err = TT.mean(TT.neq(pred, self.y)) self.valid_xdata = theano.shared(data['valid_x'], name='valid_xdata', borrow=True) self.test_xdata = theano.shared(data['test_x'], name='test_xdata', borrow=True) mode = gpu_mode self.valid_ydata = TT.cast( theano.shared(data['valid_y'], name='valid_ydata', borrow=True), 'int32') self.test_ydata = TT.cast( theano.shared(data['test_y'], name='test_xdata', borrow=True), 'int32') givens = {} givens[self.x] = self.valid_xdata givens[self.y] = self.valid_ydata self.valid_eval_func = theano.function([], self.err, 
givens=givens, name='valid_eval_fn', profile=options['profile'], mode=mode) givens[self.x] = self.test_xdata givens[self.y] = self.test_ydata self.test_eval_func = theano.function([], self.err, givens=givens, name='test_fn', profile=options['profile'], mode=mode)
W = lasagne.layers.get_all_params(mlp, binary=True) W_grads = binary_net.compute_grads(loss,mlp) updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = binary_net.clipping_scaling(updates,mlp) # other parameters updates params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False) updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items()) else: params = lasagne.layers.get_all_params(mlp, trainable=True) updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(mlp, deterministic=True) test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output))) test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) # and returning the corresponding training loss: train_fn = theano.function([input, target, LR], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input, target], [test_loss, test_err]) print('Training...') binary_net.train( train_fn,val_fn, mlp, batch_size, LR_start,LR_decay,
def __init__(self, rng, input, is_train, n_in, n_hidden, n_out, p=0.5, dropout=False, input_p=0.1):  #, batch_size=20):
    """Stack hidden layers on an optional input-dropout layer, then a
    linear-regression output layer.

    :param rng: random number generator forwarded to every layer.
    :param input: symbolic network input.
    :param is_train: symbolic pseudo-boolean; non-zero selects the dropped
        input, zero selects the raw input.
    :param n_in: input dimensionality.
    :param n_hidden: sequence of hidden-layer sizes (at least one entry).
    :param n_out: output dimensionality.
    :param p: forwarded to each ``HiddenLayer`` as ``p``.
    :param dropout: forwarded to each ``HiddenLayer`` as ``dropout``.
    :param input_p: ``p`` passed to ``drop`` for the input; ``None``
        disables input dropout.

    Hidden layers are exposed as ``self.layer_0``, ``self.layer_1``, ...
    """
    # Optional dropout on the input. Identity test instead of `!= None`.
    if input_p is not None:
        self.input_layer = drop(input, rng=rng, p=input_p)
        self.input_layer = T.switch(T.neq(is_train, 0), self.input_layer, input)
    else:
        self.input_layer = input

    self.layer_0 = HiddenLayer(
        rng=rng,
        input=self.input_layer,
        n_in=n_in,
        n_out=n_hidden[0],
        activation=prelu,
        is_train=is_train,
        p=p,
        dropout=dropout
    )
    # Build new lists with `+` so the layers' own param lists are never
    # mutated through aliasing.
    self.params = self.layer_0.params
    # First parameter of every hidden layer, collected for weight scaling.
    param_to_scale = [self.layer_0.params[0]]

    # Add the remaining hidden layers, each fed by the previous one.
    previous_layer = self.layer_0
    for layer_number in range(1, len(n_hidden)):
        current_hidden_layer = HiddenLayer(
            rng=rng,
            input=previous_layer.output,
            n_in=n_hidden[layer_number - 1],
            n_out=n_hidden[layer_number],
            activation=prelu,
            is_train=is_train,
            p=p,
            dropout=dropout
        )
        setattr(self, "layer_" + str(layer_number), current_hidden_layer)
        self.params = self.params + current_hidden_layer.params
        param_to_scale = param_to_scale + [current_hidden_layer.params[0]]
        previous_layer = current_hidden_layer

    # The linear regression layer gets as input the hidden units
    # of the last hidden layer.
    self.linearRegressionLayer = LinearRegression(
        input=previous_layer.output,
        n_in=n_hidden[-1],
        n_out=n_out,
        rng=rng
    )
    self.params = self.params + self.linearRegressionLayer.params

    # L1 and L2 regularization.
    # NOTE(review): only the first hidden layer and the output layer are
    # included; intermediate hidden layers are not -- confirm intended.
    self.L1 = (
        abs(self.layer_0.W).sum()
        + abs(self.linearRegressionLayer.W).sum()
    )
    self.L2_sqr = (
        (self.layer_0.W ** 2).sum()
        + (self.linearRegressionLayer.W ** 2).sum()
    )

    self.param_to_scale = param_to_scale
    # Re-export the output layer's interface on the network itself.
    self.errors = self.linearRegressionLayer.errors
    self.loss = self.linearRegressionLayer.loss
    self.NRMSE = self.linearRegressionLayer.NRMSE
    self.pred = self.linearRegressionLayer.pred
    self.input = input  # KEEP IN MIND THIS IS DIFFERENT THAN self.input_layer!!!
def ready(self):
    """Build the encoder's symbolic graph on top of ``self.generator``.

    Creates ``args.depth`` recurrent layers (``ExtRCNN`` or ``ExtLSTM``
    depending on ``args.layer``), runs them over the generator's word
    embeddings with the generator's selection ``z_pred``, feeds the pooled
    or last state through a sigmoid output layer, and defines the losses.

    Attributes set: ``y``, ``layers``, ``output_layer``, ``preds``,
    ``loss_mat``, ``pred_diff``, ``loss_vec``, ``loss``, ``sparsity_cost``,
    ``obj``, ``params``, ``l2_cost``, ``cost_g`` and ``cost_e``.

    NOTE(review): if ``args.layer`` is neither "rcnn" nor "lstm", ``l`` is
    never bound in the first loop and ``layers.append(l)`` will fail.
    """
    global total_encode_time  # declared but not assigned in this method
    #say("in encoder ready: \n")
    #start_encode_time = time.time()
    generator = self.generator
    embedding_layer = self.embedding_layer
    args = self.args
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = generator.dropout

    # len*batch
    x = generator.x
    z = generator.z_pred
    # append a broadcastable trailing axis so z can scale 3-d tensors
    z = z.dimshuffle((0,1,"x"))

    # batch*nclasses
    y = self.y = T.fmatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = [ ]
    depth = args.depth
    layer_type = args.layer.lower()
    for i in xrange(depth):
        # the first layer consumes embeddings (n_e), later ones hidden (n_d)
        if layer_type == "rcnn":
            l = ExtRCNN(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation,
                    order = args.order
                )
        elif layer_type == "lstm":
            l = ExtLSTM(
                    n_in = n_e if i == 0 else n_d,
                    n_out = n_d,
                    activation = activation
                )
        layers.append(l)

    # len * batch * 1
    # positions that are not padding, weighted by the selection z
    masks = T.cast(T.neq(x, padding_id).dimshuffle((0,1,"x")) * z, theano.config.floatX)
    # batch * 1
    # the small constant avoids division by zero in the mean pooling below
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8

    # len*batch*n_e
    embs = generator.word_embs

    pooling = args.pooling
    lst_states = [ ]
    h_prev = embs
    for l in layers:
        # len*batch*n_d
        h_next = l.forward_all(h_prev, z)
        if pooling:
            # batch * n_d
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum/cnt_non_padding) # mean pooling
        else:
            lst_states.append(h_next[-1]) # last state
        h_prev = apply_dropout(h_next, dropout)

    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(
            n_in = size,
            n_out = self.nclasses,
            activation = sigmoid
        )

    # batch * nclasses
    preds = self.preds = output_layer.forward(h_final)

    # element-wise squared error between predictions and targets
    loss_mat = self.loss_mat = (preds-y)**2

    # mean spread between the largest and smallest prediction per row
    pred_diff = self.pred_diff = T.mean(T.max(preds, axis=1) - T.min(preds, axis=1))

    # a negative aspect averages over all columns; otherwise one is selected
    if args.aspect < 0:
        loss_vec = T.mean(loss_mat, axis=1)
    else:
        assert args.aspect < self.nclasses
        loss_vec = loss_mat[:,args.aspect]
    self.loss_vec = loss_vec

    zsum = generator.zsum
    zdiff = generator.zdiff
    logpz = generator.logpz

    coherent_factor = args.sparsity * args.coherent
    loss = self.loss = T.mean(loss_vec)
    sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                         T.mean(zdiff) * coherent_factor
    # per-example cost: prediction loss plus the two penalties on z
    cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
    # per-example cost weighted by the summed log-probability of z
    cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
    self.obj = T.mean(cost_vec)

    # collect the parameters of every recurrent layer and the output layer
    params = self.params = [ ]
    for l in layers + [ output_layer ]:
        for p in l.params:
            params.append(p)
    nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                    for x in params)
    say("total # parameters: {}\n".format(nparams))

    # sum of squared parameters, scaled by args.l2_reg
    l2_cost = None
    for p in params:
        if l2_cost is None:
            l2_cost = T.sum(p**2)
        else:
            l2_cost = l2_cost + T.sum(p**2)
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    # generator cost and encoder cost
    self.cost_g = cost_logpz * 10 + generator.l2_cost
    self.cost_e = loss * 10 + l2_cost
def not_equal(self, x, y):
    """Return the element-wise ``x != y`` comparison built with ``T.neq``."""
    mismatch = T.neq(x, y)
    return mismatch
def __init__(self, rng, is_train, input, n_in, n_out, W=None, b=None, activation=T.tanh, p=0.7):
    """
    Fully-connected layer with dropout: activation(dot(input, W) + b).

    The output is the dropped activation when ``is_train`` is non-zero and
    ``p * activation`` otherwise.

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type is_train: theano.iscalar
    :param is_train: indicator pseudo-boolean (int) for switching between
        training and prediction

    :type input: theano.tensor.dmatrix
    :param input: a symbolic tensor of shape (n_examples, n_in)

    :type n_in: int
    :param n_in: dimensionality of input

    :type n_out: int
    :param n_out: number of hidden units

    :param W: optional shared weight matrix; initialised uniformly in
        +/- sqrt(6 / (n_in + n_out)) (times 4 for sigmoid) when ``None``

    :param b: optional shared bias vector; zeros when ``None``

    :type activation: theano.Op or function
    :param activation: Non linearity to be applied in the hidden layer

    :type p: float or double
    :param p: probability of NOT dropping out a unit
    """
    self.input = input

    if W is None:
        bound = numpy.sqrt(6. / (n_in + n_out))
        initial_W = numpy.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
            dtype=theano.config.floatX)
        if activation == theano.tensor.nnet.sigmoid:
            initial_W *= 4
        W = theano.shared(value=initial_W, name='W', borrow=True)

    if b is None:
        initial_b = numpy.zeros((n_out,), dtype=theano.config.floatX)
        b = theano.shared(value=initial_b, name='b', borrow=True)

    self.W = W
    self.b = b

    activated = activation(T.dot(input, self.W) + self.b)

    # Training path: dropped activations. Prediction path: activations
    # scaled by p, so that the scaling effects approximately cancel out.
    dropped = drop(activated, p)
    training = T.neq(is_train, 0)
    self.output = T.switch(training, dropped, p * activated)

    # parameters of the model
    self.params = [self.W, self.b]
def __init__(self, rng, params, cost_function='mse', optimizer=RMSprop):
    """Build a conv-net + three-layer LSTM model and compile its functions.

    Three conv/pool stages (dropout after the first) feed a 1024-unit
    hidden layer; its output is reshaped to
    ``(batch_size, seq_length, 1024)`` and scanned by three stacked LSTM
    layers with dropout after the first, followed by a linear readout.

    :param rng: random generator forwarded to every layer and used for the
        dropout mask.
    :param params: mapping with keys ``"lr"``, ``'n_hidden'``,
        ``'n_output'``, ``"batch_size"`` and ``"seq_length"``.
    :param cost_function: forwarded to ``get_err_fn``.
    :param optimizer: class called as ``optimizer(cost, params, lr=lr)``;
        its ``getUpdates()`` supplies the training updates.

    Sets ``self.train``, ``self.predictions``, ``self.output``,
    ``self.params`` and ``self.n_param``.
    """
    learning_rate = params["lr"]
    hidden_units = params['n_hidden']
    output_units = params['n_output']
    batch_size = params["batch_size"]
    sequence_length = params["seq_length"]

    # Symbolic minibatch inputs/targets and the train/predict switch.
    X = T.tensor3()  # batch of sequence of vector
    Y = T.tensor3()  # batch of sequence of vector
    is_train = T.iscalar('is_train')

    # CNN global parameters.
    subsample = (1, 1)
    border_mode = "valid"
    pool_size = (2, 2)
    drop_prob = 0.5
    cnn_batch_size = batch_size * sequence_length

    # Layer1: conv + pool + dropout
    # input_shape = (samples, channels, rows, cols)
    input_shape = (cnn_batch_size, 1, 120, 60)
    frames = X.reshape(input_shape)
    c1 = ConvLayer(rng, frames, (64, 1, 9, 9), input_shape, border_mode,
                   subsample, activation=nn.relu)
    p1 = PoolLayer(c1.output, pool_size=pool_size,
                   input_shape=c1.output_shape)
    conv_dropout = DropoutLayer(rng, input=p1.output, prob=drop_prob,
                                is_train=is_train)
    # At prediction time the activations are scaled by the retain
    # probability instead of being dropped.
    scaled_output = p1.output * (1. - drop_prob)
    d1_output = T.switch(T.neq(is_train, 0), conv_dropout.output,
                         scaled_output)

    # Layer2: conv + pool
    c2 = ConvLayer(rng, d1_output, (128, p1.output_shape[1], 3, 3),
                   p1.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p2 = PoolLayer(c2.output, pool_size=pool_size,
                   input_shape=c2.output_shape)

    # Layer3: conv + pool
    c3 = ConvLayer(rng, p2.output, (128, p2.output_shape[1], 3, 3),
                   p2.output_shape, border_mode, subsample,
                   activation=nn.relu)
    p3 = PoolLayer(c3.output, pool_size=pool_size,
                   input_shape=c3.output_shape)

    # Layer4: hidden layer over the flattened feature maps
    flat_size = reduce(lambda x, y: x * y, p3.output_shape[1:])
    h1 = HiddenLayer(rng, p3.output.flatten(2), flat_size, 1024,
                     activation=nn.relu)
    n_in = 1024
    rnn_input = h1.output.reshape((batch_size, sequence_length, n_in))

    # Layer5: stacked LSTMs and linear readout
    self.n_in = n_in
    self.n_lstm = hidden_units
    self.n_out = output_units
    self.W_hy = init_weight((self.n_lstm, self.n_out), rng=rng,
                            name='W_hy', sample='glorot')
    self.b_y = init_bias(self.n_out, rng=rng, sample='zero')

    layer1 = LSTMLayer(rng, 0, self.n_in, self.n_lstm)
    layer2 = LSTMLayer(rng, 1, self.n_lstm, self.n_lstm)
    layer3 = LSTMLayer(rng, 2, self.n_lstm, self.n_lstm)

    self.params = layer1.params + layer2.params + layer3.params
    self.params.append(self.W_hy)
    self.params.append(self.b_y)

    def lstm_step(frame, drop_mask, h1_prev, c1_prev, h2_prev, c2_prev,
                  h3_prev, c3_prev):
        # One time step through the three LSTM layers and the readout.
        h1_new, c1_new, out1 = layer1.run(frame, h1_prev, c1_prev)
        dropped = DropoutLayer(rng, input=out1, prob=0.5,
                               is_train=is_train, mask=drop_mask)
        h2_new, c2_new, out2 = layer2.run(dropped.output, h2_prev, c2_prev)
        h3_new, c3_new, out3 = layer3.run(out2, h3_prev, c3_prev)
        readout = T.dot(out3, self.W_hy) + self.b_y
        return [h1_new, c1_new, h2_new, c2_new, h3_new, c3_new, readout]

    # Zero initial hidden/cell state for each of the three layers, in the
    # order (h, c) x 3 expected by lstm_step.
    initial_states = [
        shared(np.zeros(shape=(batch_size, self.n_lstm), dtype=dtype))
        for _ in range(6)
    ]

    mask_shape = (sequence_length, batch_size, self.n_lstm)
    mask = rng.binomial(size=mask_shape, p=0.5, dtype=X.dtype)

    # (batch_size, sequence_length, n_in) -> (sequence_length, batch_size,
    # n_in) so that scan iterates over time.
    scan_outputs, _ = theano.scan(
        fn=lstm_step,
        sequences=[rnn_input.dimshuffle(1, 0, 2), mask],
        outputs_info=initial_states + [None])
    y_vals = scan_outputs[-1]

    self.output = y_vals.dimshuffle(1, 0, 2)

    self.params = (c1.params + c2.params + c3.params + h1.params
                   + self.params)

    cost = get_err_fn(self, cost_function, Y)
    _optimizer = optimizer(cost, self.params, lr=learning_rate)

    self.train = theano.function(inputs=[X, Y, is_train],
                                 outputs=cost,
                                 updates=_optimizer.getUpdates(),
                                 allow_input_downcast=True)
    self.predictions = theano.function(inputs=[X, is_train],
                                       outputs=self.output,
                                       allow_input_downcast=True)
    self.n_param = count_params(self.params)
def build_network():
    """Build the inception-style network and compile a validation function.

    The network takes ``(None, 3, 299, 299)`` inputs and ends in a
    1008-unit softmax layer.

    Returns
    -------
    dict
        ``{'model': <softmax output layer>, 'val_fn': <compiled function>}``
        where ``val_fn(inputs, targets)`` returns ``[test_loss, test_err]``.
    """
    input_var = t.tensor4('inputs')
    target = t.matrix('targets')

    net = {}
    net['input'] = InputLayer((None, 3, 299, 299), input_var=input_var)

    # Stem: plain convolutions and max-pooling.
    net['conv'] = bn_conv(net['input'], num_filters=32, filter_size=3,
                          stride=2)
    net['conv_1'] = bn_conv(net['conv'], num_filters=32, filter_size=3)
    net['conv_2'] = bn_conv(net['conv_1'], num_filters=64, filter_size=3,
                            pad=1)
    net['pool'] = Pool2DLayer(net['conv_2'], pool_size=3, stride=2,
                              mode='max')
    net['conv_3'] = bn_conv(net['pool'], num_filters=80, filter_size=1)
    net['conv_4'] = bn_conv(net['conv_3'], num_filters=192, filter_size=3)
    net['pool_1'] = Pool2DLayer(net['conv_4'], pool_size=3, stride=2,
                                mode='max')

    # Three "a" modules; only the width of the last branch differs.
    previous = 'pool_1'
    for name, last_branch in (('mixed/join', 32),
                              ('mixed_1/join', 64),
                              ('mixed_2/join', 64)):
        net[name] = inception_a(
            net[previous],
            nfilt=((64, ), (48, 64), (64, 96, 96), (last_branch, )))
        previous = name

    net['mixed_3/join'] = inception_b(net['mixed_2/join'],
                                      nfilt=((384, ), (64, 96, 96)))
    previous = 'mixed_3/join'

    # Four "c" modules; only the inner width of the middle branches differs.
    for name, width in (('mixed_4/join', 128),
                        ('mixed_5/join', 160),
                        ('mixed_6/join', 160),
                        ('mixed_7/join', 192)):
        net[name] = inception_c(
            net[previous],
            nfilt=((192, ),
                   (width, width, 192),
                   (width, width, width, width, 192),
                   (192, )))
        previous = name

    net['mixed_8/join'] = inception_d(net['mixed_7/join'],
                                      nfilt=((192, 320),
                                             (192, 192, 192, 192)))
    previous = 'mixed_8/join'

    # Two "e" modules that differ only in pooling mode.
    for name, pool_mode in (('mixed_9/join', 'average_exc_pad'),
                            ('mixed_10/join', 'max')):
        net[name] = inception_e(
            net[previous],
            nfilt=((320, ), (384, 384, 384), (448, 384, 384, 384),
                   (192, )),
            pool_mode=pool_mode)
        previous = name

    net['pool3'] = GlobalPoolLayer(net['mixed_10/join'])
    net['softmax'] = DenseLayer(net['pool3'], num_units=1008,
                                nonlinearity=softmax)

    # Deterministic forward pass: squared hinge loss and error rate.
    test_output = lasagne.layers.get_output(net['softmax'],
                                            deterministic=True)
    hinge = t.maximum(0., 1. - target * test_output)
    test_loss = t.mean(t.sqr(hinge))
    mismatches = t.neq(t.argmax(test_output, axis=1),
                       t.argmax(target, axis=1))
    test_err = t.mean(mismatches, dtype=theano.config.floatX)

    # Compile a function computing the validation loss and error:
    val_fn = theano.function([input_var, target], [test_loss, test_err])

    return {'model': net['softmax'], 'val_fn': val_fn}
def nlls(o, y):
    """Return the masked mean negative log-likelihood of ``o`` given ``y``.

    For each row the term is ``log(o)`` at the target column plus the sum of
    ``log(1 - o)`` over all other columns. Rows whose target equals ``-1``
    contribute zero to the numerator but still count in the mean.
    """
    rows = T.arange(y.shape[0])
    log_on = T.log(o)
    log_off = T.log(1 - o)
    # sum over all columns of log(1 - o), then remove the target column
    row_terms = (log_on[rows, y]
                 + T.sum(log_off, axis=1)
                 - log_off[rows, y])
    labelled = T.neq(y, -1)
    return -T.mean(row_terms * labelled)
def train(self, train_sets, valid_sets, test_sets, n_epochs=200,
          learning_rate=0.1):
    """Train with minibatch gradient descent, validating periodically.

    Parameters
    ----------
    train_sets, valid_sets, test_sets : (x, y) pairs
        Each ``x`` must support ``get_value(borrow=True)`` and slicing
        inside a Theano ``givens`` mapping.
    n_epochs : int
        Number of passes over the training minibatches.
    learning_rate : float
        Step size of the plain gradient-descent update.

    Side effects
    ------------
    On every validation step the result of ``self.dump()`` is pickled to
    ``model_<iteration>.mod``. When validation error improves, the test
    error is computed and predictions are written to
    ``test_<iteration>.res``. Progress is printed to stdout, and the final
    timing line goes to stderr.
    """
    train_set_x, train_set_y = train_sets
    valid_set_x, valid_set_y = valid_sets
    test_set_x, test_set_y = test_sets

    # Number of whole minibatches available in each split.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= self.batch_size
    n_valid_batches //= self.batch_size
    n_test_batches //= self.batch_size

    # Negative log-likelihood of the target class, and the zero-one error.
    cost = -T.mean(
        T.log(self.final_output[T.arange(self.y.shape[0]), self.y]))
    error = T.mean(T.neq(T.argmax(self.final_output, axis=1), self.y))

    # find all the parameters and update them using gradient descent
    params = self.params
    grads = T.grad(cost, params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    index = self.index
    batch_size = self.batch_size
    x = self.x
    y = self.y

    test_model = theano.function(
        [index],
        error,
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    validate_model = theano.function(
        [index],
        error,
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print('... training')

    # early-stopping parameters
    # NOTE(review): `patience` is still updated below but the loop never
    # consults it, so training always runs for the full `n_epochs`.
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995
    # Validate after this many minibatches; in practice once per epoch.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # Global minibatch counter across epochs (0-based).
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter = ', iteration, flush=True)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency != 0:
                continue

            # compute zero-one loss on validation set
            validation_losses = [
                validate_model(i) for i in range(n_valid_batches)
            ]
            this_validation_loss = numpy.mean(validation_losses)
            print('epoch {}, minibatch {}/{}, validation error {}%'.
                  format(epoch, minibatch_index + 1, n_train_batches,
                         this_validation_loss * 100.))

            # Checkpoint the model at every validation step.
            with open('model_{}.mod'.format(iteration), 'wb') as f:
                pickle.dump(self.dump(), f)

            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:
                if this_validation_loss < best_validation_loss * \
                        improvement_threshold:
                    patience = max(patience, iteration * patience_increase)

                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iteration

                test_losses = [
                    test_model(i) for i in range(n_test_batches)
                ]
                test_score = numpy.mean(test_losses)
                print(('     epoch {}, minibatch {}/{}, test error of '
                       'best model {}%').format(epoch, minibatch_index + 1,
                                                n_train_batches,
                                                test_score * 100.))
                # NOTE(review): `network` is not defined in this method and
                # must exist at module level -- confirm it is this model.
                with open('test_{}.res'.format(iteration), 'w') as f:
                    print(network.predict(test_set_x), file=f)

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)),
          file=sys.stderr)
def train(self, offline=False, data=None, mean=None, std=None ): print 'mlp.train' def gradient_updates_momentum(cost, params, learning_rate, momentum): updates = [] for param in params: param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) updates.append((param, param - learning_rate*param_update)) updates.append((param_update, momentum*param_update + (1. - momentum)*T.grad(cost, param))) return updates patchSize = self.patchSize batchSize = self.batchSize learning_rate = self.learning_rate momentum = self.momentum rng = numpy.random.RandomState(1234) tx, ty, vx, vy, reset = data.sample() train_samples = len(ty) val_samples = len(vy) train_set_x, train_set_y = shared_dataset((tx, ty), doCastLabels=True) if val_samples > 0: valid_set_x, valid_set_y = shared_dataset((vx, vy), doCastLabels=True) if reset: self.best_validation_loss = numpy.inf # compute number of minibatches for training, validation and testing n_train_batches = train_samples / batchSize n_valid_batches = val_samples / 1000 #batchSize # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = self.x #T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels cost = self.cost(y) lr = T.scalar('learning_rate') m = T.scalar('momentum') learning_rate_shared = theano.shared(np.float32(learning_rate)) momentum_shared = theano.shared(np.float32(momentum)) print 'training data....' 
print 'n_train_batches:',n_train_batches print 'n_valid_batches:',n_valid_batches print 'train_samples:', train_samples print 'val_samples:', val_samples print 'best_validation:', self.best_validation_loss if val_samples > 0: validate_model = theano.function( [index], self.errors(y), givens={ x: valid_set_x[index * batchSize: (index + 1) * batchSize], y: valid_set_y[index * batchSize: (index + 1) * batchSize] } ) predict_samples = theano.function( [], outputs=T.neq(self.y_pred, y), givens={ x: train_set_x, y: train_set_y, } ) gparams = [] for param in self.params: gparam = T.grad(cost, param) gparams.append(gparam) updates = gradient_updates_momentum(cost, self.params, lr, m) train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batchSize:(index + 1) * batchSize], y: train_set_y[index * batchSize:(index + 1) * batchSize], lr: learning_rate_shared, m: momentum_shared}) ############### # TRAIN MODEL # ############### print '... training' validation_frequency = 1 start_time = time.clock() minibatch_avg_costs = [] iter = 0 epoch = 0 self.best_train_error = np.inf last_train_error = numpy.inf for minibatch_index in xrange(n_train_batches): if self.done: break train_cost = train_model(minibatch_index) minibatch_avg_costs.append( train_cost ) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if n_valid_batches == 0: train_error = minibatch_avg_costs[-1].item(0) print minibatch_index, '-', train_error if train_error < self.best_train_error: self.best_train_error = train_error self.save() if n_valid_batches > 0 and (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = np.array([validate_model(i) for i in xrange(n_valid_batches)]) #this_validation_loss = numpy.sum(validation_losses) * 100.0 / val_samples this_validation_loss = numpy.mean(validation_losses*100.0) elapsed_time = time.clock() - start_time data.report_stats( self.id, elapsed_time, 
minibatch_index, this_validation_loss, minibatch_avg_costs[-1].item(0)) # if we got the best validation score until now if this_validation_loss < self.best_validation_loss: self.best_validation_loss = this_validation_loss self.save() print "New best score!" #if n_valid_batches == 0: # self.save() if not self.offline: probs = predict_samples() data.p[ data.i_train ] = probs data.save_stats()
def build_encoder(self, x, xmask=None, **kwargs):
    """Build the query-level and session-level encoder expressions.

    Two modes are selected by ``kwargs``:

    * no keyword arguments: encode the whole sequence with ``theano.scan``
      starting from zero states;
    * any keyword arguments: perform a single step (sampling mode). This
      requires ``prev_h``, ``prev_hr`` and ``prev_hs`` in ``kwargs`` and
      ``x.ndim != 2``.

    Parameters
    ----------
    x : Theano variable of token indices. When ``x.ndim == 2`` it is
        treated as ``(n_steps, batch_size)``; otherwise batch size is 1.
    xmask : optional mask; defaults to ``T.neq(x, self.eoq_sym)``.

    Returns
    -------
    ``((h, hr), hs, (_res[2], _res[3]))`` where ``_res`` is the result of
    the session-level step.
    """
    # Any keyword argument switches to single-step mode.
    one_step = bool(kwargs)

    # if x.ndim == 2 then x = (n_steps, batch_size),
    # else x = (word_1, word_2, word_3, ...) and the batch size is 1
    if x.ndim == 2:
        batch_size = x.shape[1]
    else:
        batch_size = 1

    if not one_step:
        # Full-sequence mode: every state starts at zero.
        h_0 = T.alloc(np.float32(0), batch_size, self.qdim)
        hr_0 = T.alloc(np.float32(0), batch_size, self.qdim)
        hs_0 = T.alloc(np.float32(0), batch_size, self.sdim)
    else:
        # Sampling mode: the caller supplies the previous states.
        assert x.ndim != 2
        assert 'prev_h' in kwargs
        assert 'prev_hr' in kwargs
        assert 'prev_hs' in kwargs
        h_0 = kwargs['prev_h']
        hr_0 = kwargs['prev_hr']
        hs_0 = kwargs['prev_hs']

    xe = self.approx_embedder(x)
    # Identity test: `== None` is unsafe for array-like masks.
    if xmask is None:
        xmask = T.neq(x, self.eoq_sym)

    # Query-level step function and its scan outputs_info.
    if self.query_step_type == "gated":
        f_enc = self.gated_query_step
        o_enc_info = [h_0, hr_0, None, None, None]
    else:
        f_enc = self.plain_query_step
        o_enc_info = [h_0, hr_0]

    # Session-level step function and its scan outputs_info.
    if self.session_step_type == "gated":
        f_hier = self.gated_session_step
        o_hier_info = [hs_0, None, None, None]
    else:
        f_hier = self.plain_session_step
        o_hier_info = [hs_0]

    # Run through all the sentence (encode everything)
    if not one_step:
        _res, _ = theano.scan(f_enc,
                              sequences=[xe, xmask],
                              outputs_info=o_enc_info)
    # Make just one step further
    else:
        _res = f_enc(xe, xmask, h_0, hr_0)

    # Get the hidden state sequence
    h = _res[0]
    hr = _res[1]

    # The hs sequence is based on the original mask
    if not one_step:
        _res, _ = theano.scan(f_hier,
                              sequences=[h, xmask],
                              outputs_info=o_hier_info)
    # Just one step further
    else:
        _res = f_hier(h, xmask, hs_0)

    if isinstance(_res, (list, tuple)):
        hs = _res[0]
    else:
        hs = _res

    # NOTE(review): when the session step yields a single variable,
    # `_res[2]` and `_res[3]` index into that tensor instead of selecting
    # separate outputs -- confirm this is only used with the gated step.
    return (h, hr), hs, (_res[2], _res[3])
def train_online(self, data): print 'train online...' def gradient_updates_momentum(cost, params, learning_rate, momentum): updates = [] for param in params: param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) updates.append((param, param - learning_rate*param_update)) updates.append((param_update, momentum*param_update + (1. - momentum)*T.grad(cost, param))) return updates # DATA INITIALIZATION d = data.sample() train_x = d[0] train_y = d[1] valid_x = d[2] valid_y = d[3] reset = d[4] if reset: self.best_validation_loss = numpy.inf train_samples = len(train_y) valid_samples = len(valid_y) print 'valid_samples:',valid_samples print 'train_samples:', train_samples if self.resample: self.lr_shared.set_value( np.float32(self.learning_rate) ) self.m_shared.set_value( np.float32(self.momentum) ) else: self.resample = True self.y = T.ivector('y') # the labels are presented as 1D vector of [int] labels self.lr = T.scalar('learning_rate') self.m = T.scalar('momentum') self.lr_shared = theano.shared(np.float32(self.learning_rate)) self.m_shared = theano.shared(np.float32(self.momentum)) index = T.lscalar() # index to a [mini]batch x = self.x y = self.y lr = self.lr m = self.m lr_shared = self.lr_shared m_shared = self.m_shared patchSize = self.patchSize batchSize = self.batchSize train_set_x, train_set_y = shared_dataset((train_x, train_y), doCastLabels=True) if valid_samples > 0: valid_set_x, valid_set_y = shared_dataset((valid_x, valid_y), doCastLabels=True) # compute number of minibatches for training, validation n_train_batches = train_samples / batchSize n_valid_batches = valid_samples / batchSize #BUILD THE MODEL cost = self.cost(y) if valid_samples > 0: validate_model = theano.function( [index], self.errors(y), givens={ x: valid_set_x[index * batchSize: (index + 1) * batchSize], y: valid_set_y[index * batchSize: (index + 1) * batchSize] } ) ''' predict_samples = theano.function( inputs=[index], outputs=T.neq(self.y_pred, self.y), 
givens={ x: train_set_x[index * batchSize: (index + 1) * batchSize], y: train_set_y[index * batchSize: (index + 1) * batchSize] } ) ''' predict_samples = theano.function( [], outputs=T.neq(self.y_pred, self.y), givens={ x: train_set_x, y: train_set_y, } ) gparams = [] for param in self.params: gparam = T.grad(cost, param) gparams.append(gparam) updates = gradient_updates_momentum(cost, self.params, lr, m) train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batchSize:(index + 1) * batchSize], y: train_set_y[index * batchSize:(index + 1) * batchSize], lr: lr_shared, m: m_shared}) # TRAIN THE MODEL print '... training' print 'self.best_validation_loss:', self.best_validation_loss best_iter = 0 validation_frequency = 1 start_time = time.clock() elapsed_time = 0 iter = 0 minibatch_avg_costs = [] minibatch_index = 0 #while (elapsed_time < self.trainTime)\ # and (minibatch_index<n_train_batches)\ # and (not self.done): while (minibatch_index<n_train_batches) and (not self.done): if (elapsed_time >= self.trainTime): break train_cost = train_model(minibatch_index) # test the trained samples against the target # values to measure the training performance i = minibatch_index ''' probs = predict_samples(minibatch_index) #print 'probs:', probs.shape i_batch = data.i_train[ i * batchSize:(i+1)*batchSize ] data.p[ i_batch ] = probs ''' ''' good = np.where( probs == 0)[0] bad = np.where( probs == 1)[0] print 'bad:', len(bad) print 'good:', len(good) #print probs ''' #print '----->traincost:', type(train_cost), train_cost minibatch_avg_costs.append(train_cost) iter += 1 #iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0 and valid_samples > 0: validation_losses = np.array([validate_model(i) for i in xrange(n_valid_batches)]) this_validation_loss = numpy.sum(validation_losses) * 100.0 / valid_samples elapsed_time = time.clock() - start_time ''' 
self.reportTrainingStats(elapsed_time, minibatch_index, this_validation_loss, minibatch_avg_costs[-1].item(0)) ''' print this_validation_loss, '/', self.best_validation_loss data.add_validation_loss( this_validation_loss ) # if we got the best validation score until now if this_validation_loss < self.best_validation_loss: self.best_validation_loss = this_validation_loss best_iter = iter self.save() print "New best score!" # advance to next mini batch minibatch_index += 1 # update elapsed time elapsed_time = time.clock() - start_time if valid_samples == 0: self.save() probs = predict_samples() data.p[ data.i_train ] = probs elapsed_time = time.clock() - start_time msg = 'The code an for' status = '%f seconds' % (elapsed_time) Utility.report_status( msg, status ) print 'done...'
def _error_func(self, y):
    """Return 100 times the mean mismatch between ``argmax(y, axis=1)`` and ``self.k``."""
    predicted = T.argmax(y, axis=1)
    mismatch = T.neq(predicted, self.k)
    return 100 * T.mean(mismatch)
def evaluate(self, train_set, test_set, shuffle_batch=True, epochs=25, lr_decay=0.95, sqr_norm_lim=9,labels=None,model=None):
    """
    Train the network with Adadelta and report test performance.

    ``train_set`` is padded to a multiple of the batch size, shuffled and
    split into training and validation minibatches. After every epoch the
    training and validation accuracies are printed. Training stops after
    ``epochs`` epochs or once ``drops`` reaches the local ``patience`` (5).

    :param train_set: 2-D array; the last column is read as the label and
        the second-to-last as the user id.
    :param test_set: 2-D array with the same column layout.
    :param shuffle_batch: visit training minibatches in random order.
    :param epochs: maximum number of epochs.
    :param lr_decay: adadelta decay parameter.
    :param sqr_norm_lim: s^2 in the paper.
    :param labels: not read by this method.
    :param model: when truthy, passed to ``self.save`` each time the best
        validation accuracy is matched or improved.
    :return: ``1 - test_loss`` from the last evaluation on the test set
        (0 if the test set was never evaluated).
    """
    cost = self.negative_log_likelihood(self.y)
    dropout_cost = self.dropout_negative_log_likelihood(self.y)
    # adadelta upgrades: dict of variable:delta
    grad_updates = self.sgd_updates_adadelta(dropout_cost, lr_decay, 1e-6, sqr_norm_lim)
    # shuffle dataset and assign to mini batches.
    # if dataset size is not a multiple of batch size, replicate
    # extra data (at random)
    np.random.seed(3435)
    batch_size = self.batch_size
    if train_set.shape[0] % batch_size > 0:
        extra_data_num = batch_size - train_set.shape[0] % batch_size
        #extra_data = train_set[np.random.choice(train_set.shape[0], extra_data_num)]
        perm_set = np.random.permutation(train_set)
        extra_data = perm_set[:extra_data_num]
        new_data = np.append(train_set, extra_data, axis=0)
    else:
        new_data = train_set

    shuffled_data = np.random.permutation(new_data) # Attardi
    n_batches = shuffled_data.shape[0]/batch_size
    # divide train set into 80% train, 20% validation sets (by batch count)
    n_train_batches = int(np.round(n_batches*0.8))
    n_val_batches = n_batches - n_train_batches
    train_set = shuffled_data[:n_train_batches*batch_size,:]
    val_set = shuffled_data[n_train_batches*batch_size:,:]
    # push data to gpu
    # the dataset has the format [word_indices,padding,user,label]
    train_set_x, train_set_y = shared_dataset(train_set[:,:-2], train_set[:,-1])
    train_set_u = theano.shared(np.asarray(train_set[:,-2],dtype='int32'))
    # val_set_x = val_set[:,:-2]
    # val_set_u = val_set[:,-2]
    # val_set_y = val_set[:,-1]
    val_set_x, val_set_y = shared_dataset(val_set[:,:-2], val_set[:,-1])
    val_set_u = theano.shared(np.asarray(val_set[:,-2],dtype='int32'))
    test_set_x = test_set[:,:-2]
    test_set_u = test_set[:,-2]
    test_set_y = test_set[:,-1]
    # symbolic bounds of the minibatch selected by self.index
    batch_start = self.index * batch_size
    batch_end = batch_start + batch_size

    # compile Theano functions to get train/val/test errors

    test_y_pred = self.predict(test_set_x)
    test_error = T.mean(T.neq(test_y_pred, self.y))

    # errors on train set
    if self.Users is not None:
        # variant with the user-id input self.u
        train_model = theano.function([self.index], cost, updates=grad_updates,
                                      givens={
                                          self.x: train_set_x[batch_start:batch_end],
                                          self.y: train_set_y[batch_start:batch_end],
                                          self.u: train_set_u[batch_start:batch_end]
                                          },
                                      allow_input_downcast = True)

        train_error = theano.function([self.index], self.errors(self.y),
                                      givens={
                                          self.x: train_set_x[batch_start:batch_end],
                                          self.y: train_set_y[batch_start:batch_end],
                                          self.u: train_set_u[batch_start:batch_end]},
                                      allow_input_downcast=True)

        val_model = theano.function([self.index], self.errors(self.y),
                                    givens={
                                        self.x: val_set_x[batch_start:batch_end],
                                        self.y: val_set_y[batch_start:batch_end],
                                        self.u: val_set_u[batch_start:batch_end]},
                                    allow_input_downcast=True)
        test_model = theano.function([self.x, self.u, self.y], test_error, allow_input_downcast=True)
    else:
        # variant without user ids
        train_model = theano.function([self.index], cost, updates=grad_updates,
                                      givens={
                                          self.x: train_set_x[batch_start:batch_end],
                                          self.y: train_set_y[batch_start:batch_end]},
                                      allow_input_downcast = True)

        train_error = theano.function([self.index], self.errors(self.y),
                                      givens={
                                          self.x: train_set_x[batch_start:batch_end],
                                          self.y: train_set_y[batch_start:batch_end]},
                                      allow_input_downcast=True)

        val_model = theano.function([self.index], self.errors(self.y),
                                    givens={
                                        self.x: val_set_x[batch_start:batch_end],
                                        self.y: val_set_y[batch_start:batch_end]},
                                    allow_input_downcast=True)
        test_model = theano.function([self.x, self.y], test_error, allow_input_downcast=True)

    # start training over mini-batches
    print 'training...'
    best_val_perf = 0
    test_perf = 0
    patience = 5
    drops = 0
    prev_val_perf = 0
    for epoch in xrange(epochs):
        start_time = time.time()
        # FIXME: should permute whole set rather than minibatch indexes
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                self.set_zero(self.zero_vec) # CHECKME: Why?
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                self.set_zero(self.zero_vec)
        train_losses = [train_error(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        info = 'epoch: %i\%i (%.2f secs) train acc: %.2f %% | val acc: %.2f %%' % (
            epoch,epochs, time.time()-start_time, train_perf * 100., val_perf*100.)
        # from ipdb import set_trace; set_trace()
        # NOTE(review): nesting below was reconstructed from a flattened
        # source; confirm which `if` the `else` pairs with.
        if val_perf > prev_val_perf:
            # validation accuracy improved over the previous epoch
            drops=0
            if val_perf >= best_val_perf:
                best_val_perf = val_perf
                info+= " **"
                if model:
                    # print "save model"
                    self.save(model)
                if self.Users is not None:
                    test_loss = test_model(test_set_x, test_set_u, test_set_y)
                else:
                    test_loss = test_model(test_set_x, test_set_y)
                test_perf = 1 - test_loss
        else:
            # no improvement: count it and stop once patience is exhausted
            drops+=1
            if drops >= patience:
                print "Ran out of patience..."
                break
        prev_val_perf = val_perf
        print info
    # set_trace()
    return test_perf
def __init__(self, state):
    """Build the full encoder/decoder training graph from ``state``.

    Every entry of ``state`` is copied onto the instance as an attribute.
    The pickled dictionary at ``self.dictionary`` is loaded to derive the
    vocabulary mappings and the smoothed noise distribution. The encoder,
    decoder, training cost and updates are then built symbolically.

    Raises
    ------
    Exception
        If the dictionary lacks the ``'</q>'`` or ``'</s>'`` token.
    """
    Model.__init__(self)
    self.state = state

    # Compatibility towards older models
    self.__dict__.update(state)
    self.rng = numpy.random.RandomState(state['seed'])

    # Load dictionary; the handle is closed once it has been read.
    # NOTE(review): unpickling executes arbitrary code -- only load
    # dictionary files from a trusted source.
    with open(self.dictionary, 'r') as dict_file:
        raw_dict = cPickle.load(dict_file)

    # Probabilities for each term in the corpus, ordered by token id,
    # normalised, raised to the power 0.75 and normalised again.
    noise_probs = [x[2] for x in sorted(raw_dict, key=operator.itemgetter(1))]
    noise_probs = numpy.array(noise_probs, dtype='float64')
    noise_probs /= numpy.sum(noise_probs)
    noise_probs = noise_probs**0.75
    noise_probs /= numpy.sum(noise_probs)
    self.noise_probs = noise_probs

    self.t_noise_probs = theano.shared(self.noise_probs.astype('float32'), 't_noise_probs')

    # Dictionaries to convert str to idx and vice-versa
    self.str_to_idx = {tok: tok_id for tok, tok_id, _ in raw_dict}
    self.idx_to_str = {tok_id: tok for tok, tok_id, _ in raw_dict}

    if '</q>' not in self.str_to_idx \
       or '</s>' not in self.str_to_idx:
        raise Exception("Error, malformed dictionary!")

    # Number of words in the dictionary
    self.idim = len(self.str_to_idx)
    self.state['idim'] = self.idim

    logger.debug("Initializing encoder")
    self.encoder = Encoder(self.state, self.rng, self)
    logger.debug("Initializing decoder")
    self.decoder = Decoder(self.state, self.rng, self, self.encoder)

    # Init params; the assert checks that no parameter is listed twice.
    self.params = self.encoder.params + self.decoder.params
    assert len(set(self.params)) == (len(self.encoder.params) + len(self.decoder.params))

    self.y_neg = T.itensor3('y_neg')
    self.x_data = T.imatrix('x_data')
    self.x_ranks = T.imatrix('x_ranks')
    self.x_cost_mask = T.matrix('cost_mask')
    self.x_max_length = T.iscalar('x_max_length')

    # The training is done with a trick. We prepend a special </q> to the
    # dialog so that the first sentence can also be predicted, starting
    # from the dialog beginning token (</q>).
    self.aug_x_data = T.concatenate([
        T.alloc(np.int32(self.eoq_sym), 1, self.x_data.shape[1]),
        self.x_data
    ])
    training_x = self.aug_x_data[:self.x_max_length]
    training_y = self.aug_x_data[1:self.x_max_length + 1]
    training_ranks = self.x_ranks[:self.x_max_length - 1].flatten()
    training_ranks_mask = T.neq(training_ranks, 0).flatten()

    # Mask that is zero wherever the input token equals eoq_sym.
    training_hs_mask = T.neq(training_x, self.eoq_sym)
    training_x_cost_mask = self.x_cost_mask[:self.x_max_length].flatten()

    # Backward compatibility
    if 'decoder_bias_type' in self.state:
        logger.debug("Decoder bias type {}".format(self.decoder_bias_type))

    logger.info("Build encoder")
    (self.h, _), self.hs, (self.rs, self.us) = \
        self.encoder.build_encoder(training_x, xmask=training_hs_mask)

    logger.info("Build decoder (EVAL)")
    target_probs, self.hd, self.decoder_states = \
        self.decoder.build_decoder(self.hs, training_x, xmask=training_hs_mask, \
                                   y=training_y, mode=Decoder.EVALUATION)

    logger.info("Build rank predictor")
    self.predicted_ranks = self.decoder.build_rank_layer(self.hs)

    # Prediction cost and rank cost
    self.per_example_cost = -T.log2(target_probs).reshape(
        (self.x_max_length, self.x_data.shape[1]))
    self.rank_cost = T.sum(
        ((self.predicted_ranks[1:].flatten() - training_ranks)**2) *
        (training_ranks_mask)) / T.sum(training_ranks_mask)
    self.training_cost = T.sum(
        -T.log2(target_probs) * training_x_cost_mask) + np.float32(
            self.lambda_rank) * self.rank_cost
    self.updates = self.compute_updates(
        self.training_cost / training_x.shape[1], self.params)

    # Beam-search variables
    self.beam_source = T.lvector("beam_source")
    self.beam_hs = T.matrix("beam_hs")
    self.beam_step_num = T.lscalar("beam_step_num")
    self.beam_hd = T.matrix("beam_hd")
def errors(self, y):
    """Return the mean element-wise mismatch between ``self.y_pred`` and ``y``.

    This is the fraction of examples in the minibatch whose prediction
    differs from the given label.
    """
    mismatches = T.neq(self.y_pred, y)
    return T.mean(mismatches)
def ready(self): global total_generate_time #say("in generator ready: \n") #start_generate_time = time.time() embedding_layer = self.embedding_layer args = self.args padding_id = embedding_layer.vocab_map["<padding>"] dropout = self.dropout = theano.shared( np.float64(args.dropout).astype(theano.config.floatX) ) # len*batch x = self.x = T.imatrix() n_d = args.hidden_dimension n_e = embedding_layer.n_d activation = get_activation_by_name(args.activation) layers = self.layers = [ ] layer_type = args.layer.lower() for i in xrange(2): if layer_type == "rcnn": l = RCNN( n_in = n_e, n_out = n_d, activation = activation, order = args.order ) elif layer_type == "lstm": l = LSTM( n_in = n_e, n_out = n_d, activation = activation ) layers.append(l) # len * batch #masks = T.cast(T.neq(x, padding_id), theano.config.floatX) masks = T.cast(T.neq(x, padding_id), theano.config.floatX ).dimshuffle((0,1,"x")) # (len*batch)*n_e embs = embedding_layer.forward(x.ravel()) # len*batch*n_e embs = embs.reshape((x.shape[0], x.shape[1], n_e)) embs = apply_dropout(embs, dropout) self.word_embs = embs flipped_embs = embs[::-1] # len*bacth*n_d h1 = layers[0].forward_all(embs) h2 = layers[1].forward_all(flipped_embs) h_final = T.concatenate([h1, h2[::-1]], axis=2) h_final = apply_dropout(h_final, dropout) size = n_d * 2 #size = n_e output_layer = self.output_layer = Layer( n_in = size, n_out = 1, activation = sigmoid ) # len*batch*1 probs = output_layer.forward(h_final) #probs = output_layer.forward(embs) #probs1 = probs.reshape(x.shape) #probs_rev = output_layer.forward(flipped_embs) #probs1_rev = probs.reshape(x.shape) #probs = T.concatenate([probs1, probs1_rev[::-1]], axis=2) # len*batch probs2 = probs.reshape(x.shape) if self.args.seed is not None: self.MRG_rng = MRG_RandomStreams(self.args.seed) else: self.MRG_rng = MRG_RandomStreams() z_pred = self.z_pred = T.cast(self.MRG_rng.binomial(size=probs2.shape, p=probs2), theano.config.floatX) #"int8") # we are computing approximated gradient by 
sampling z; # so should mark sampled z not part of the gradient propagation path # z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred) #self.sample_updates = sample_updates print "z_pred", z_pred.ndim z2 = z_pred.dimshuffle((0,1,"x")) logpz = - T.nnet.binary_crossentropy(probs, z2) * masks logpz = self.logpz = logpz.reshape(x.shape) probs = self.probs = probs.reshape(x.shape) # batch z = z_pred self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX) self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX) params = self.params = [ ] for l in layers + [ output_layer ]: for p in l.params: params.append(p) nparams = sum(len(x.get_value(borrow=True).ravel()) \ for x in params) say("total # parameters: {}\n".format(nparams)) l2_cost = None for p in params: if l2_cost is None: l2_cost = T.sum(p**2) else: l2_cost = l2_cost + T.sum(p**2) l2_cost = l2_cost * args.l2_reg self.l2_cost = l2_cost
def error(self):
    """Return the mean element-wise mismatch between ``self.pred`` and ``self.outputs``."""
    mismatches = T.neq(self.pred, self.outputs)
    return T.mean(mismatches)
def main(num_epochs=NEPOCH):
    """Build, optionally resume, train and evaluate the SNLI attention model.

    The routine loads all SNLI minibatches into memory, builds the
    attend / compare / aggregate network with weights shared between the
    hypothesis and premise branches, compiles the Theano functions and
    trains with Adagrad for ``num_epochs`` epochs. Parameters are pickled
    to ``params/params_<filename>.pkl`` after every epoch, and are loaded
    from that file at start-up when it exists. ``KeyboardInterrupt``
    drops into ``pdb``.
    """
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    # W shape: (# vocab size, WE_DIM); rows are scaled to about unit norm.
    W_word_embedding = snli.weight / \
                       (numpy.linalg.norm(snli.weight, axis=1).reshape(snli.weight.shape[0], 1) + \
                        0.00001)
    del snli

    print("Building network ...")
    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
         numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 18), dtype='int32'),
         numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
         numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 16), dtype='int32'),
         numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM); both branches share one embedding
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # EMBEDING MAPPING: output shape (BSIZE, None, WEMAP)
    l_hypo_reduced_embed = DenseLayer3DInput(l_hypo_embed, num_units=WEMAP, b=None, nonlinearity=None)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_reduced_embed, p=DPOUT, rescale=True)
    l_prem_reduced_embed = DenseLayer3DInput(l_prem_embed, num_units=WEMAP, W=l_hypo_reduced_embed.W, b=None, nonlinearity=None)
    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_reduced_embed, p=DPOUT, rescale=True)

    # ATTEND
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout, num_units=EMBDHIDA, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1, p=DPOUT, rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout, num_units=EMBDHIDB, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout, num_units=EMBDHIDA, W=l_hypo_embed_hid1.W, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1, p=DPOUT, rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout, num_units=EMBDHIDB, W=l_hypo_embed_hid2.W, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    # NOTE(review): the hypothesis side uses the *first* hidden layer while
    # the premise side uses the *second*; l_hypo_embed_hid2 may have been
    # intended. Left unchanged because it alters the model.
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid1, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_reduced_embed, l_e], masks=[l_mask_h, l_mask_p], direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e], masks=[l_mask_h, l_mask_p], direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_reduced_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_reduced_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd, p=DPOUT, rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout, num_units=COMPHIDA, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1, p=DPOUT, rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout, num_units=COMPHIDB, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd, p=DPOUT, rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout, num_units=COMPHIDA, W=l_hypo_comphid1.W, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1, p=DPOUT, rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout, num_units=COMPHIDB, W=l_hypo_comphid2.W, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)
    l_outhid1 = lasagne.layers.DenseLayer(
        l_v1v2_dpout, num_units=OUTHID, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid1_dpout = lasagne.layers.DropoutLayer(l_outhid1, p=DPOUT, rescale=True)
    l_outhid2 = lasagne.layers.DenseLayer(
        l_outhid1_dpout, num_units=OUTHID, b=None,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_output = lasagne.layers.DenseLayer(
        l_outhid2, num_units=3, b=None,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1, ] * BSIZE, dtype='int32')

    # Stochastic outputs (dropout active) drive training.
    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    # Deterministic outputs (dropout disabled) drive evaluation.
    network_output_clean = lasagne.layers.get_output(l_output, deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        # Keep the word embeddings fixed.
        all_params.remove(l_hypo_embed.W)

    numparams = sum(numpy.prod(p.shape.eval()) for p in all_params)
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(), numpy.prod(item.shape.eval())))

    # if exist param file then load params
    # NOTE(review): `filename` is a module-level name not defined here.
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        # NOTE(review): unpickling executes arbitrary code -- only resume
        # from trusted parameter files.
        with open(look_for, 'rb') as param_file:
            all_param_values = cPickle.load(param_file)
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_inputs = [
        l_in_h.input_var, l_mask_h.input_var,
        l_in_p.input_var, l_mask_p.input_var, target_values
    ]
    train = theano.function(network_inputs, [cost, error_rate], updates=updates)
    compute_cost = theano.function(network_inputs, [cost_clean, error_rate_clean])

    def evaluate(mode):
        # Running mean of cost and error over the 'dev' or 'test' batches.
        if mode == 'dev':
            data = dev_batches
        elif mode == 'test':
            data = test_batches
        else:
            raise ValueError("unknown evaluation mode: %r" % (mode,))

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" % (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                # Incremental running means over the epoch so far.
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if (batches_seen * BSIZE) % 5000 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if (batches_seen * BSIZE) % 100000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" % (dev_set_cost, dev_set_error))

            # save parameters; the file is closed (and flushed) on exit
            all_param_values = [p.get_value() for p in all_params]
            with open(look_for, 'wb') as param_file:
                cPickle.dump(all_param_values, param_file)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        # Drop into the debugger so the session state can be inspected.
        pdb.set_trace()
def run(self, y):
    """Encode ``y`` with the forward and backward LSTMs and join the states.

    ``y`` is expected as an integer matrix laid out batch x total_seq, with
    0 treated as padding. One ``theano.scan`` over ``self.step`` advances
    both LSTMs together for ``self.N`` steps: the forward LSTM reads the
    sequence as given and the backward LSTM reads it time-reversed. The
    backward hidden states are then re-indexed so that, per sample, the
    states of its non-padding steps are moved to the front before being
    concatenated with the forward hidden states.

    Returns the concatenated symbolic tensor:
    axis 0 -> N, axis 1 -> batch, axis 2 -> m.
    """
    # Put time on the leading axis (seq x batch) so scan iterates over it.
    # One-hot encoding is left to the step function.
    seq = y.transpose([1, 0])
    # Reverse along time only; the batch axis keeps its order.
    seq_reversed = seq[::-1, :]

    fwd_h0, fwd_c0 = self.forward_lstm.get_initial_hidden
    bwd_h0, bwd_c0 = self.backward_lstm.get_initial_hidden

    # Order must match step's recurrent arguments: hf, cf, hb, cb.
    # Each output only feeds back its value from the previous step.
    outputs_info = [dict(initial=state, taps=[-1])
                    for state in (fwd_h0, fwd_c0, bwd_h0, bwd_c0)]

    states, _ = theano.scan(fn=self.step,
                            sequences=[seq, seq_reversed],
                            outputs_info=outputs_info,
                            n_steps=self.N)
    # Only the hidden states are used below; the cell states are dropped.
    hf, _cf, hb, _cb = states

    # Alignment problem: with trailing padding, a sample such as
    #   forward : [a, b, c, d, 0, 0, 0]
    #   reversed: [0, 0, 0, d, c, b, a]
    # has its useful backward states at the end of the time axis, so they
    # have to be shifted to the front before concatenation.
    sample_ids = T.arange(self.bs)
    # align_idx[k, b]: time index in hb of sample b's k-th non-padding step.
    align_idx = zeros((self.N, self.bs), int)
    # next_slot[b]: row of align_idx that sample b writes next.
    next_slot = zeros((self.bs,), int)
    for t in range(self.N):
        is_token = T.neq(seq_reversed[t, :], 0)
        # Padding steps write 0 into the current slot and do not advance it,
        # so the slot is overwritten by the next real token (if any).
        align_idx = T.set_subtensor(align_idx[next_slot, sample_ids],
                                    T.switch(is_token, t, 0))
        next_slot = next_slot + T.switch(is_token, 1, 0)

    # Indexing hb with the full index matrix pairs every sample's states
    # with every sample's index column; taking the (b, b) diagonal keeps
    # only each sample's states gathered with its own indices.
    h_b_aligned = hb[align_idx][:, sample_ids, sample_ids]

    # axis 0 -> N, axis 1 -> batch, axis 2 -> m
    return T.concatenate([hf, h_b_aligned], axis=2)
# Count how many MNIST training examples the model labels incorrectly.

# Symbolic inputs. The zero-filled test values give each variable a concrete
# (100, 784) / (100, 10) float32 sample.
# NOTE(review): presumably consumed by Theano's compute_test_value checks --
# confirm that setting is enabled where this script runs.
X = T.matrix()
X.tag.test_value = np.zeros((100,784),dtype='float32')
Y = T.matrix()
Y.tag.test_value = np.zeros((100,10),dtype='float32')

# NOTE(review): `mf` / `mf_update` look like mean-field inference on the
# model -- confirm against the model class. Only the last element of the
# second-to-last entry of Q is used here.
Q = model.mf(V=X, Y=Y)
H2 = Q[-2][-1]
# The unpacking requires exactly three hidden layers; only the last one
# (`lab`) is used below.
hid, pen, lab = model.hidden_layers
Y_hat = lab.mf_update(state_below = H2)

# An example is an error when the arg-max column of the target row differs
# from the arg-max column of the model output row.
true = T.argmax(Y, axis=1)
pred = T.argmax(Y_hat, axis=1)
err = T.neq(true, pred)
err_count = err.sum()

# Callable mapping one (X, Y) batch to its number of errors.
errs = function([X,Y], err_count)

# Accumulate the error count over rows 0..59999 in batches of 100.
total = 0
dataset = MNIST(which_set = 'train', binarize=1, one_hot=True)
for i in xrange(0, 60000, 100):
    # Cast each slice to the dtype of the matching symbolic variable.
    x = dataset.X[i:i+100,:].astype(X.dtype)
    # Guard against a short or mis-shaped batch.
    assert x.shape == (100, 784)
    y = dataset.y[i:i+100,:].astype(Y.dtype)
    assert y.shape == (100, 10)
    total += errs(x, y)