def get_monitoring_channels(self, X, Y, **kwargs):
    theano_rng = MRG_RandomStreams(2012 + 12 + 19)

    # Explanation of reality
    zh1, rh1 = self.infer_h1(X)
    rh1 = block_gradient(rh1)
    zh2 = T.dot(rh1, self.rh2w) + self.rh2b
    rh2 = theano_rng.binomial(p=T.nnet.sigmoid(zh2), size=zh2.shape,
                              dtype='float32')
    rh2 = block_gradient(rh2)
    y = T.dot(rh2, self.ryw) + self.ryb

    err = T.neq(T.argmax(y, axis=1), T.argmax(Y, axis=1))
    assert err.ndim == 1

    return {'misclass': err.astype('float32').mean()}
def cost(self, Y, Y_hat):
    # Pull out the argument to the sigmoid
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if not hasattr(op, 'scalar_op'):
        raise ValueError("Expected Y_hat to be generated by an Elemwise "
                         "op, got " + str(op) + " of type " + str(type(op)))
    assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
    z, = owner.inputs

    # Broadcasted multiplication with the gradient mask.
    if self._monitor_individual:
        z = (z * self._gradient_mask +
             block_gradient(z) * (1. - self._gradient_mask))

    # Geometric mean.
    z = z.mean(axis=1)

    # Expecting binary targets.
    term_1 = Y[:, 0] * T.nnet.softplus(-z)
    term_2 = (1 - Y[:, 0]) * T.nnet.softplus(z)

    total = term_1 + term_2
    assert total.ndim == 1

    return total.mean()
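# Hedged side note (not part of the original code): the cost above relies on the
# identities -log(sigmoid(z)) = softplus(-z) and -log(1 - sigmoid(z)) = softplus(z),
# so term_1 + term_2 is the usual binary cross-entropy computed from the pre-sigmoid
# activation z for numerical stability. A minimal numpy check of those identities;
# the helper names below are illustrative only:
import numpy as np

def _softplus(z):
    # numerically stable log(1 + exp(z))
    return np.maximum(z, 0.) + np.log1p(np.exp(-np.abs(z)))

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

_z = np.linspace(-5., 5., 11)
assert np.allclose(_softplus(-_z), -np.log(_sigmoid(_z)))
assert np.allclose(_softplus(_z), -np.log(1. - _sigmoid(_z)))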
def arithmetic_mean(self, state):
    reshaped = state.reshape((state.shape[0], self._n_replicas,
                              state.shape[1] / self._n_replicas))
    broadcasted_mask = self._grad_mask.dimshuffle('x', 0, 'x')
    unblocked = reshaped * broadcasted_mask
    blocked = block_gradient(reshaped) * (np.float32(1) - broadcasted_mask)
    return (unblocked + blocked).mean(axis=1)
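# Hedged illustration (not from the original code): arithmetic_mean splits each row
# of `state` into self._n_replicas contiguous blocks and averages across replicas;
# self._grad_mask only selects which replica receives gradient, so on the forward
# pass the mask cancels out. A small numpy check of that forward-pass arithmetic,
# with shapes and names chosen for illustration:
import numpy as np

batch, n_replicas, width = 2, 3, 4
state = np.arange(batch * n_replicas * width,
                  dtype='float32').reshape(batch, n_replicas * width)
grad_mask = np.float32([1., 0., 0.])  # only the first replica would get gradient

reshaped = state.reshape(batch, n_replicas, width)
broadcasted_mask = grad_mask[None, :, None]
forward = (reshaped * broadcasted_mask +
           reshaped * (1. - broadcasted_mask)).mean(axis=1)
assert np.allclose(forward, reshaped.mean(axis=1))  # mask has no effect on values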
def __call__(self, model, X, Y, **kwargs):
    Y_hat_e = model.fprop(X)
    Y_hat = model.fprop(X, apply_dropout=True)
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if isinstance(op, Print):
        assert len(owner.inputs) == 1
        Y_hat, = owner.inputs
        owner = Y_hat.owner
        op = owner.op
    assert isinstance(op, T.nnet.Softmax)
    z, = owner.inputs
    assert z.ndim == 2

    z_weight = Y_hat - Y_hat_e
    z_weight = block_gradient(z_weight)
    neg = z_weight * z
    neg = neg.sum(axis=1).mean()

    z = z - z.max(axis=1).dimshuffle(0, 'x')
    log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
    # we use sum and not mean because this is really one variable per row
    log_prob_of = (Y * log_prob).sum(axis=1)
    assert log_prob_of.ndim == 1
    log_prob_of = log_prob_of.mean()

    return -(log_prob_of + self.alpha * neg)
def __call__(self, model, X, Y, **kwargs):
    Y_hat = model.fprop(X, apply_dropout=False)
    prob = Y_hat * Y + (1 - Y_hat) * (1 - Y)
    weight = 1. / (.1 + prob)
    weight = block_gradient(weight)

    Y_hat = model.fprop(X, apply_dropout=True)
    # Pull out the argument to the sigmoid
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if not hasattr(op, 'scalar_op'):
        raise ValueError("Expected Y_hat to be generated by an Elemwise op, "
                         "got " + str(op) + " of type " + str(type(op)))
    assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
    Z, = owner.inputs

    term_1 = Y * T.nnet.softplus(-Z)
    term_2 = (1 - Y) * T.nnet.softplus(Z)

    total = term_1 + term_2
    total = weight * total

    ave = total.mean()
    return ave
def __call__(self, model, X, Y, **kwargs):
    Y_hat, Y_hat_e = model.lone_ranger_dropout_fprop(
        X,
        default_input_include_prob=self.default_input_include_prob,
        input_include_probs=self.input_include_probs,
        default_input_scale=self.default_input_scale,
        input_scales=self.input_scales,
        scale_ensemble=self.scale_ensemble,
        dont_drop_input=self.dont_drop_input)
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if isinstance(op, Print):
        assert len(owner.inputs) == 1
        Y_hat, = owner.inputs
        owner = Y_hat.owner
        op = owner.op
    assert isinstance(op, T.nnet.Softmax)
    z, = owner.inputs
    assert z.ndim == 2

    z_weight = Y_hat - Y_hat_e
    z_weight = block_gradient(z_weight)
    neg = z_weight * z
    neg = neg.sum(axis=1).mean()

    z = z - z.max(axis=1).dimshuffle(0, 'x')
    log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
    # we use sum and not mean because this is really one variable per row
    log_prob_of = (Y * log_prob).sum(axis=1)
    assert log_prob_of.ndim == 1
    log_prob_of = log_prob_of.mean()

    return -(log_prob_of + self.alpha * neg)
def dropout_fprop(self, inputs, default_input_include_prob=0.5,
                  input_include_probs=None, default_input_scale=2.,
                  input_scales=None, per_example=True):
    inputs = self.input_space.format_as(inputs, self.mlp.input_space)
    if self.scale:
        inputs = inputs / 255.
    rval = self.mlp.dropout_fprop(inputs, default_input_include_prob,
                                  input_include_probs, default_input_scale,
                                  input_scales, per_example)
    if self.pooling_mode == 0:
        rval = tensor.max(rval, axis=0)
    elif self.pooling_mode == 1:
        top_ids = block_gradient(tensor.sort(rval, axis=0))[-3:][::-1]
        top_vals = rval[top_ids]
        rval = T.mean(top_vals)
    elif self.pooling_mode == 2:
        #import ipdb; ipdb.set_trace()
        #collapsed_rval = tensor.sum(rval, axis=1)
        top_ids = block_gradient(tensor.argsort(rval, axis=0))[::-1]
        top_vals_sum = rval[top_ids[0], tensor.arange(rval.shape[1])] * self.probs[0] #+\
            #rval[top_ids[1], tensor.arange(rval.shape[1])] * self.probs[1] #+\
            #rval[top_ids[2], tensor.arange(rval.shape[1])] * self.probs[2]
        rval = top_vals_sum / 2
    else:
        raise Exception("Others are not implemented yet!")

    rval = rval.dimshuffle('x', 0)
    # TODO if you set input prob, the final layer doesn't recognize h0
    if input_include_probs is None and input_scales is None:
        rval = self.final_layer.dropout_fprop(rval,
                                              default_input_include_prob,
                                              input_include_probs,
                                              default_input_scale,
                                              input_scales, per_example)
    else:
        rval = self.final_layer.fprop(rval)
    return rval
def get_samples_and_objectives(self, model, data):
    space, sources = self.get_data_specs(model)
    space.validate(data)
    assert isinstance(model, AdversaryPair)
    g = model.generator
    d = model.discriminator

    # Note: this assumes data is design matrix
    X = data
    m = data.shape[space.get_batch_axis()]
    y1 = T.alloc(1, m, 1)
    y0 = T.alloc(0, m, 1)
    # NOTE: if this changes to optionally use dropout, change the inference
    # code below to use a non-dropped-out version.
    S, z, other_layers = g.sample_and_noise(
        m,
        default_input_include_prob=self.generator_default_input_include_prob,
        default_input_scale=self.generator_default_input_scale,
        all_g_layers=(self.infer_layer is not None))

    if self.noise_both != 0.:
        rng = MRG_RandomStreams(2014 / 6 + 2)
        S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
        X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

    y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)
    y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)

    d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) +
                   d.layers[-1].cost(y0, y_hat0))

    if self.no_drop_in_d_for_g:
        y_hat0_no_drop = d.dropout_fprop(S)
        g_obj = d.layers[-1].cost(y1, y_hat0_no_drop)
    else:
        g_obj = d.layers[-1].cost(y1, y_hat0)

    if self.blend_obj:
        g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / \
                (self.zurich_coeff + self.minimax_coeff)

    if model.inferer is not None:
        # Change this if we ever switch to using dropout in the
        # construction of S.
        S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
        pred = model.inferer.dropout_fprop(
            S_nograd,
            self.inference_default_input_include_prob,
            self.inference_input_include_probs,
            self.inference_default_input_scale,
            self.inference_input_scales)
        if self.infer_layer is None:
            target = z
        else:
            target = other_layers[self.infer_layer]
        i_obj = model.inferer.layers[-1].cost(target, pred)
    else:
        i_obj = 0

    return S, d_obj, g_obj, i_obj
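# Hedged side note (not from the original code): d_obj above averages the
# discriminator's cross-entropy on real data (target 1) and generated samples
# (target 0), while g_obj trains the generator by asking the discriminator to label
# its samples as real (target 1), i.e. the non-saturating generator cost rather than
# literally minimizing -d_obj. A minimal numpy version of those two objectives,
# starting from assumed discriminator probabilities (values are illustrative only):
import numpy as np

def binary_cross_entropy(target, p):
    return -(target * np.log(p) + (1. - target) * np.log(1. - p)).mean()

p_real = np.float32([0.9, 0.8])  # D(x) on real examples
p_fake = np.float32([0.3, 0.4])  # D(G(z)) on generated samples
d_obj_example = 0.5 * (binary_cross_entropy(1., p_real) +
                       binary_cross_entropy(0., p_fake))
g_obj_example = binary_cross_entropy(1., p_fake)  # generator wants D(G(z)) -> 1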
def geometric_mean(self, state):
    pre = extract_op_argument(state, recurse=2)
    rsh = pre.reshape((pre.shape[0], self._n_replicas,
                       pre.shape[1] / self._n_replicas))
    broadcasted_mask = self._grad_mask.dimshuffle('x', 0, 'x')
    unblocked = rsh * broadcasted_mask
    blocked = block_gradient(rsh) * (np.float32(1) - broadcasted_mask)
    geo = T.nnet.softmax((unblocked + blocked).mean(axis=1))
    return geo
def get_weight(self, model, X, Y):
    ensemble_Y = model.fprop(X, apply_dropout=False)
    prob_of = (ensemble_Y * Y).sum(axis=1)
    weight = 1. / (self.k + self.alpha *
                   (prob_of - self.beta * 1. / T.cast(Y.shape[1], 'float32')))
    weight = weight / weight.sum()
    weight = block_gradient(weight)
    return weight
def block(l):
    """
    Recursively applies block_gradient to every element of a (possibly
    nested) list or tuple, preserving the container type of `l`.
    """
    new = []
    for elem in l:
        if isinstance(elem, (list, tuple)):
            new.append(block(elem))
        else:
            new.append(block_gradient(elem))
    if isinstance(l, tuple):
        return tuple(new)
    return new
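# Hedged assumption: block_gradient is used throughout as an op that passes values
# through unchanged on the forward pass while contributing no gradient on the
# backward pass. A minimal sketch of that behaviour using Theano's public
# theano.gradient.disconnected_grad; the real block_gradient may be implemented
# differently:
import numpy as np
import theano
import theano.tensor as T

def block_gradient_sketch(x):
    # Forward value of x, but treated as a constant by T.grad.
    return theano.gradient.disconnected_grad(x)

w = theano.shared(np.float32(2.), name='w')
cost = block_gradient_sketch(w) * w  # gradient flows only through the second factor
assert np.allclose(T.grad(cost, w).eval(), 2.)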
def fprop(self, inputs):
    # format inputs
    inputs = self.input_space.format_as(inputs, self.mlp.input_space)
    rval = self.mlp.fprop(inputs)
    if self.pooling_mode == 0:
        rval = tensor.max(rval, axis=0)
    elif self.pooling_mode == 1:
        rval = block_gradient(tensor.sort(rval, axis=0))[-3:][::-1]
        rval = tensor.sum(rval * self.probs, axis=0) / 3
        #rval = tensor.mean(rval, axis=0)
    else:
        raise Exception("Others are not implemented yet!")
    rval = rval.dimshuffle('x', 0)
    rval = self.final_layer.fprop(rval)
    return rval
def get_samples_and_objectives(self, model, data):
    space, sources = self.get_data_specs(model)
    space.validate(data)
    assert isinstance(model, AdversaryPair)
    g = model.generator
    d = model.discriminator

    # Note: this assumes data is b01c
    X = data
    assert X.ndim == 4
    m = data.shape[space.get_batch_axis()]
    y1 = T.alloc(1, m, 1)
    y0 = T.alloc(0, m, 1)
    # NOTE: if this changes to optionally use dropout, change the inference
    # code below to use a non-dropped-out version.
    S, z = g.inpainting_sample_and_noise(
        X,
        default_input_include_prob=self.generator_default_input_include_prob,
        default_input_scale=self.generator_default_input_scale)
    y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)
    y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)

    d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) +
                   d.layers[-1].cost(y0, y_hat0))

    if self.no_drop_in_d_for_g:
        # Use the non-dropped-out discriminator output for the generator cost.
        y_hat0_no_drop = d.dropout_fprop(S)
        g_obj = d.layers[-1].cost(y1, y_hat0_no_drop)
    else:
        g_obj = d.layers[-1].cost(y1, y_hat0)

    if model.inferer is not None:
        # Change this if we ever switch to using dropout in the
        # construction of S.
        S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
        z_hat = model.inferer.dropout_fprop(
            S_nograd,
            self.inference_default_input_include_prob,
            self.inference_input_include_probs,
            self.inference_default_input_scale,
            self.inference_input_scales)
        i_obj = model.inferer.layers[-1].cost(z, z_hat)
    else:
        i_obj = 0

    return S, d_obj, g_obj, i_obj
def get_gradients(self, model, X, Y=None, **kwargs):
    assert 'dual' not in kwargs
    updates = {}
    if self.use_admm:
        rho = self.constraint_coeff * 2.
        dual = model.dual
        WBW = T.dot(model.W.T * model.beta, model.W)
        target = T.identity_like(WBW)
        err = WBW - target
        new_dual = dual + rho * err
        new_dual = block_gradient(new_dual)
        kwargs['dual'] = new_dual
        updates[dual] = new_dual
    cost = self(model, X, Y, **kwargs)
    params = model.get_params()
    assert not isinstance(params, set)
    return dict(zip(params, T.grad(cost, params))), updates
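# Hedged side note (not from the original code): the use_admm branch above performs
# a dual-ascent (ADMM-style) update for the constraint W^T diag(beta) W = I with
# step size rho = 2 * constraint_coeff, and block_gradient freezes the updated dual
# so it enters the primal cost as a constant. The same update written in numpy,
# with illustrative shapes and values:
import numpy as np

rho = 2. * 0.1                                   # 2 * constraint_coeff (assumed 0.1)
W = np.random.randn(5, 3).astype('float32')      # stands in for model.W
beta = np.ones(5, dtype='float32')               # stands in for model.beta
dual = np.zeros((3, 3), dtype='float32')         # stands in for model.dual

WBW = np.dot(W.T * beta, W)
err = WBW - np.eye(3, dtype='float32')
dual = dual + rho * err                          # same dual update as in get_gradients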
def __call__(self, model, X, Y, **kwargs):
    Y = Y * 2 - 1

    # Get the approximate ensemble predictions
    Y_hat = model.fprop(X, apply_dropout=False)
    # Pull out the argument to the sigmoid
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if not hasattr(op, 'scalar_op'):
        raise ValueError("Expected Y_hat to be generated by an Elemwise op, "
                         "got " + str(op) + " of type " + str(type(op)))
    assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
    F, = owner.inputs

    weights = -Y * T.nnet.softmax(-(Y * F).T).T
    weights = block_gradient(weights)

    # Get the individual model predictions
    Y_hat = model.fprop(X, apply_dropout=True)
    # Pull out the argument to the sigmoid
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if not hasattr(op, 'scalar_op'):
        raise ValueError("Expected Y_hat to be generated by an Elemwise op, "
                         "got " + str(op) + " of type " + str(type(op)))
    assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid)
    f, = owner.inputs

    cost = (weights * T.exp(-Y * f)).mean()
    assert cost.ndim == 0

    return cost
def get_cost(self, X, Y, **kwargs):
    # Dream
    theano_rng = MRG_RandomStreams(2012 + 12 + 18)
    exp_y = T.nnet.softmax(T.alloc(0., self.batch_size, self.n_classes) + self.gyb)
    dy = theano_rng.multinomial(pvals=exp_y, dtype='float32')
    dy = block_gradient(dy)
    exp_h2 = T.nnet.sigmoid(T.dot(dy, self.gh2w) + self.gh2b)
    dh2 = theano_rng.binomial(p=exp_h2, size=exp_h2.shape, dtype='float32')
    dh2 = block_gradient(dh2)
    exp_h1 = T.nnet.sigmoid(T.dot(dh2, self.gh1w) + self.gh1b)
    dh1 = theano_rng.binomial(p=exp_h1, size=exp_h1.shape, dtype='float32')
    dh1 = block_gradient(dh1)
    exp_v = T.nnet.sigmoid(T.dot(dh1, self.gvw) + self.gvb)
    dv = theano_rng.binomial(p=exp_v, size=exp_v.shape, dtype='float32')
    dv = block_gradient(dv)

    # Explanation of dream
    zh1, rh1 = self.infer_h1(dv)
    zh2 = T.dot(rh1, self.rh2w) + self.rh2b
    rh2 = T.nnet.sigmoid(zh2)
    zy = T.dot(rh2, self.ryw) + self.ryb

    # Probability of dream
    dream_prob = sigmoid_prob(zh1, dh1) + sigmoid_prob(zh2, dh2) + softmax_prob(zy, dy)

    # Explanation of reality
    zh1, rh1 = self.infer_h1(X)
    rh1 = block_gradient(rh1)
    zh2 = T.dot(rh1, self.rh2w) + self.rh2b
    rh2 = theano_rng.binomial(p=T.nnet.sigmoid(zh2), size=zh2.shape, dtype='float32')
    rh2 = block_gradient(rh2)

    # Probability of reality
    real_prob = softmax_prob(T.alloc(0., self.batch_size, self.n_classes) + self.gyb, Y) + \
        sigmoid_prob(T.dot(Y, self.gh2w) + self.gh2b, rh2) + \
        sigmoid_prob(T.dot(rh2, self.gh1w) + self.gh1b, rh1) + \
        sigmoid_prob(T.dot(rh1, self.gvw) + self.gvb, X)

    return -dream_prob - real_prob + .0001 * (T.sqr(self.gvw).sum() +
                                              T.sqr(self.gh1w).sum() +
                                              T.sqr(self.gh2w).sum())
def get_samples_and_objectives(self, model, data):
    space, sources = self.get_data_specs(model)
    space.validate(data)
    assert isinstance(model, AdversaryPair)
    g = model.generator
    d = model.discriminator

    # Note: this assumes data is design matrix
    X = data
    m = data.shape[space.get_batch_axis()]
    y1 = T.alloc(1, m, 1)
    y0 = T.alloc(0, m, 1)
    # NOTE: if this changes to optionally use dropout, change the inference
    # code below to use a non-dropped-out version.
    S, z, other_layers = g.sample_and_noise(
        m,
        default_input_include_prob=self.generator_default_input_include_prob,
        default_input_scale=self.generator_default_input_scale,
        all_g_layers=(self.infer_layer is not None))

    if self.noise_both != 0.:
        rng = MRG_RandomStreams(2014 / 6 + 2)
        S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both
        X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both

    y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)
    y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob,
                             self.discriminator_input_include_probs,
                             self.discriminator_default_input_scale,
                             self.discriminator_input_scales)

    # d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0))
    pos_mask = y_hat1 < .5 + self.d_eps
    neg_mask = y_hat0 > .5 - self.d_eps
    pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1)
    neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0)
    pos_cost = (pos_mask * pos_cost_matrix).mean()
    neg_cost = (neg_mask * neg_cost_matrix).mean()
    d_obj = 0.5 * (pos_cost + neg_cost)

    if self.no_drop_in_d_for_g:
        y_hat0_no_drop = d.dropout_fprop(S)
        g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop)
    else:
        g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0)
    assert g_cost_mat.ndim == 2
    assert y_hat0.ndim == 2

    mask = y_hat0 < 0.5 + self.g_eps
    masked_cost = g_cost_mat * mask
    g_obj = masked_cost.mean()

    if model.inferer is not None:
        # Change this if we ever switch to using dropout in the
        # construction of S.
        S_nograd = block_gradient(S)  # Redundant as long as we have custom get_gradients
        pred = model.inferer.dropout_fprop(
            S_nograd,
            self.inference_default_input_include_prob,
            self.inference_input_include_probs,
            self.inference_default_input_scale,
            self.inference_input_scales)
        if self.infer_layer is None:
            target = z
        else:
            target = other_layers[self.infer_layer]
        i_obj = model.inferer.layers[-1].cost(target, pred)
    else:
        i_obj = 0

    return S, d_obj, g_obj, i_obj