def test_continuous_y():
    for inference_method in get_installed(["lp", "ad3"]):
        X, Y = generate_blocks(n_samples=1)
        x, y = X[0], Y[0]
        w = np.array([1, 0,  # unary
                      0, 1,
                      0,     # pairwise
                      -4, 0])

        crf = GridCRF(inference_method=inference_method)
        crf.initialize(X, Y)
        joint_feature = crf.joint_feature(x, y)

        y_cont = np.zeros_like(x)
        gx, gy = np.indices(x.shape[:-1])
        y_cont[gx, gy, y] = 1

        # need to generate edge marginals
        vert = np.dot(y_cont[1:, :, :].reshape(-1, 2).T,
                      y_cont[:-1, :, :].reshape(-1, 2))
        # horizontal edges
        horz = np.dot(y_cont[:, 1:, :].reshape(-1, 2).T,
                      y_cont[:, :-1, :].reshape(-1, 2))
        pw = vert + horz

        joint_feature_cont = crf.joint_feature(x, (y_cont, pw))
        assert_array_almost_equal(joint_feature, joint_feature_cont)

        const = find_constraint(crf, x, y, w, relaxed=False)
        const_cont = find_constraint(crf, x, y, w, relaxed=True)

        # djoint_feature and loss are equal:
        assert_array_almost_equal(const[1], const_cont[1], 4)
        assert_almost_equal(const[2], const_cont[2], 4)

        # returned y_hat is one-hot version of other
        if isinstance(const_cont[0], tuple):
            assert_array_equal(const[0], np.argmax(const_cont[0][0], axis=-1))

            # test loss:
            assert_almost_equal(crf.loss(y, const[0]),
                                crf.continuous_loss(y, const_cont[0][0]), 4)
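# All the test_continuous_y variants in this file build the same relaxed
# labeling by hand: a one-hot tensor y_cont plus a matrix of pairwise edge
# marginals. A minimal standalone sketch of that construction (plain numpy,
# no pystruct required; the 3x4 grid, the 2 states, and the helper name
# one_hot_with_edge_marginals are illustration choices, not library API):

import numpy as np

def one_hot_with_edge_marginals(y, n_states=2):
    """Turn an integer grid labeling into (one-hot, edge-marginal) form."""
    # one-hot encoding: y_cont[i, j, s] = 1 iff y[i, j] == s
    y_cont = np.zeros(y.shape + (n_states,))
    gx, gy = np.indices(y.shape)
    y_cont[gx, gy, y] = 1
    # edge marginals: co-occurrence counts of states over vertical and
    # horizontal grid edges, accumulated into an (n_states, n_states) matrix
    vert = np.dot(y_cont[1:, :, :].reshape(-1, n_states).T,
                  y_cont[:-1, :, :].reshape(-1, n_states))
    horz = np.dot(y_cont[:, 1:, :].reshape(-1, n_states).T,
                  y_cont[:, :-1, :].reshape(-1, n_states))
    return y_cont, vert + horz

y = np.random.randint(0, 2, size=(3, 4))
y_cont, pw = one_hot_with_edge_marginals(y)
# pw sums to the total number of grid edges: (3-1)*4 vertical + 3*(4-1) horizontal
assert pw.sum() == (3 - 1) * 4 + 3 * (4 - 1)
# (The tests use np.zeros_like(x) instead, which works there because x happens
# to have n_features == n_states.)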
def test_continuous_y():
    for inference_method in ["lp", "ad3"]:
        X, Y = toy.generate_blocks(n_samples=1)
        x, y = X[0], Y[0]
        w = np.array([1, 0,  # unary
                      0, 1,
                      0,     # pairwise
                      -4, 0])

        crf = LatentGridCRF(n_labels=2, n_states_per_label=1,
                            inference_method=inference_method)
        psi = crf.psi(x, y)

        y_cont = np.zeros_like(x)
        gx, gy = np.indices(x.shape[:-1])
        y_cont[gx, gy, y] = 1

        # need to generate edge marginals
        vert = np.dot(y_cont[1:, :, :].reshape(-1, 2).T,
                      y_cont[:-1, :, :].reshape(-1, 2))
        # horizontal edges
        horz = np.dot(y_cont[:, 1:, :].reshape(-1, 2).T,
                      y_cont[:, :-1, :].reshape(-1, 2))
        pw = vert + horz

        psi_cont = crf.psi(x, (y_cont, pw))
        assert_array_almost_equal(psi, psi_cont)

        const = find_constraint(crf, x, y, w, relaxed=False)
        const_cont = find_constraint(crf, x, y, w, relaxed=True)

        # dpsi and loss are equal:
        assert_array_almost_equal(const[1], const_cont[1])
        assert_almost_equal(const[2], const_cont[2])

        # returned y_hat is one-hot version of other
        assert_array_equal(const[0], np.argmax(const_cont[0][0], axis=-1))

        # test loss:
        assert_equal(crf.loss(y, const[0]),
                     crf.continuous_loss(y, const_cont[0][0]))
def test_learning():
    crf = IgnoreVoidCRF(n_states=3, n_features=2, void_label=2,
                        inference_method='lp')
    ssvm = SubgradientStructuredSVM(crf, verbose=10, C=100, n_jobs=1,
                                    max_iter=50, learning_rate=0.01)
    ssvm.fit(X, Y)

    for x in X:
        y_hat_exhaustive = exhaustive_inference(crf, x, ssvm.w)
        y_hat = crf.inference(x, ssvm.w)
        assert_array_equal(y_hat, y_hat_exhaustive)

    constr = [find_constraint(crf, x, y, ssvm.w, y_hat=y_hat)
              for x, y, y_hat in zip(X, Y, ssvm.predict(X))]
    losses = [c[3] for c in constr]
    slacks = [c[2] for c in constr]
    assert_true(np.all(np.array(slacks) >= np.array(losses)))
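# The bookkeeping above unpacks find_constraint's return value
# (y_hat, delta_joint_feature, slack, loss). A minimal sketch of that
# contract as this file uses it -- hinge slack of the given (or inferred)
# y_hat against the ground truth. This is an assumed simplification, not
# the library's exact implementation, which also handles relaxed labelings
# and rescaled-C models:

import numpy as np

def find_constraint_sketch(model, x, y, w, y_hat=None):
    if y_hat is None:
        # loss-augmented inference finds the most violated labeling
        y_hat = model.loss_augmented_inference(x, y, w)
    delta_joint_feature = (model.joint_feature(x, y)
                           - model.joint_feature(x, y_hat))
    loss = model.loss(y, y_hat)
    slack = max(loss - np.dot(w, delta_joint_feature), 0)
    return y_hat, delta_joint_feature, slack, loss

# With y_hat coming from plain (non-loss-augmented) inference, as in
# test_learning's predict call, w.dot(delta_joint_feature) <= 0 because
# y_hat maximizes w.dot(joint_feature); hence slack >= loss, which is
# exactly the final assertion above.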
def _sequential_learning(self, X, Y, w):
    n_samples = len(X)
    objective, positive_slacks = 0, 0
    if self.batch_size in [None, 1]:
        # online learning
        for x, y in zip(X, Y):
            y_hat, delta_joint_feature, slack, loss = \
                find_constraint(self.model, x, y, w)
            objective += slack
            if slack > 0:
                positive_slacks += 1
            self._solve_subgradient(delta_joint_feature, n_samples, w)
    else:
        # mini batch learning
        if self.batch_size == -1:
            # a single batch containing the whole training set
            slices = [slice(0, len(X))]
        else:
            n_batches = int(np.ceil(float(len(X)) / self.batch_size))
            slices = gen_even_slices(n_samples, n_batches)
        for batch in slices:
            X_b = X[batch]
            Y_b = Y[batch]
            Y_hat = self.model.batch_loss_augmented_inference(
                X_b, Y_b, w, relaxed=True)
            delta_joint_feature = (self.model.batch_joint_feature(X_b, Y_b)
                                   - self.model.batch_joint_feature(X_b, Y_hat))
            loss = np.sum(self.model.batch_loss(Y_b, Y_hat))
            violation = np.maximum(0, loss - np.dot(w, delta_joint_feature))
            objective += violation
            positive_slacks += self.batch_size
            self._solve_subgradient(delta_joint_feature / len(X_b),
                                    n_samples, w)
    return objective, positive_slacks, w
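# _solve_subgradient itself is not shown in this section. A hypothetical
# minimal version, assuming a plain subgradient step on the per-example
# regularized hinge objective ||w||^2 / (2 C n) + max(0, loss - w.delta);
# the real learner additionally supports momentum and learning-rate decay
# schedules, which are omitted here:

def _solve_subgradient(self, delta_joint_feature, n_samples, w):
    # subgradient at a violated example: w / (C n) - delta_joint_feature
    grad = w / (self.C * n_samples) - delta_joint_feature
    # in-place update so the caller's w array sees the new value
    w -= self.learning_rate * grad
    self.w = w
    return w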
def test_continuous_y():
    for inference_method in get_installed(["lp", "ad3"]):
        X, Y = generate_blocks(n_samples=1)
        x, y = X[0], Y[0]
        w = np.array([1, 0,  # unary
                      0, 1,
                      0,     # pairwise
                      -4, 0])

        crf = LatentGridCRF(n_labels=2, n_features=2, n_states_per_label=1,
                            inference_method=inference_method)
        joint_feature = crf.joint_feature(x, y)

        y_cont = np.zeros_like(x)
        gx, gy = np.indices(x.shape[:-1])
        y_cont[gx, gy, y] = 1

        # need to generate edge marginals
        vert = np.dot(y_cont[1:, :, :].reshape(-1, 2).T,
                      y_cont[:-1, :, :].reshape(-1, 2))
        # horizontal edges
        horz = np.dot(y_cont[:, 1:, :].reshape(-1, 2).T,
                      y_cont[:, :-1, :].reshape(-1, 2))
        pw = vert + horz

        joint_feature_cont = crf.joint_feature(x, (y_cont, pw))
        assert_array_almost_equal(joint_feature, joint_feature_cont, 4)

        const = find_constraint(crf, x, y, w, relaxed=False)
        const_cont = find_constraint(crf, x, y, w, relaxed=True)

        # djoint_feature and loss are equal:
        assert_array_almost_equal(const[1], const_cont[1], 4)
        assert_almost_equal(const[2], const_cont[2], 4)

        if isinstance(const_cont[0], tuple):
            # returned y_hat is one-hot version of other
            assert_array_equal(const[0], np.argmax(const_cont[0][0], axis=-1))

            # test loss:
            assert_almost_equal(crf.loss(y, const[0]),
                                crf.continuous_loss(y, const_cont[0][0]), 4)
def _frank_wolfe_bc(self, X, Y):
    """Block-Coordinate Frank-Wolfe learning.

    Compare Algorithm 3 in the reference paper.
    """
    n_samples = len(X)
    w = self.w.copy()
    w_mat = np.zeros((n_samples, self.model.size_psi))
    l_mat = np.zeros(n_samples)
    l_avg = 0.0
    l = 0.0
    k = 0

    for p in range(self.max_iter):
        if self.verbose > 0:
            print("Iteration %d" % p)
        for i in range(n_samples):
            x, y = X[i], Y[i]
            y_hat, delta_psi, slack, loss = find_constraint(self.model,
                                                            x, y, w)
            # ws and ls
            ws = delta_psi * self.C
            ls = loss / n_samples

            # line search
            if self.line_search:
                eps = 1e-15
                w_diff = w_mat[i] - ws
                gamma = ((w_diff.T.dot(w)
                          - (self.C * n_samples) * (l_mat[i] - ls))
                         / (np.sum(w_diff ** 2) + eps))
                gamma = max(0.0, min(1.0, gamma))
            else:
                gamma = 2.0 * n_samples / (k + 2.0 * n_samples)

            w -= w_mat[i]
            w_mat[i] = (1.0 - gamma) * w_mat[i] + gamma * ws
            w += w_mat[i]

            l -= l_mat[i]
            l_mat[i] = (1.0 - gamma) * l_mat[i] + gamma * ls
            l += l_mat[i]

            if self.do_averaging:
                rho = 2.0 / (k + 2.0)
                self.w = (1.0 - rho) * self.w + rho * w
                l_avg = (1.0 - rho) * l_avg + rho * l
            else:
                self.w = w
            k += 1

        if (self.check_dual_every != 0) and (p % self.check_dual_every == 0):
            dual_val, dual_gap, primal_val = self._calc_dual_gap(X, Y, l)
            if self.verbose > 0:
                print("dual: %f, dual_gap: %f, primal: %f"
                      % (dual_val, dual_gap, primal_val))
            if dual_gap < self.tol:
                return
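# The line-search step in the _frank_wolfe_bc variants is the closed-form
# optimal step size from the block-coordinate Frank-Wolfe paper the
# docstring refers to (Lacoste-Julien et al., 2013): minimizing the dual
# objective along the segment between the current block w_mat[i] and the
# Frank-Wolfe corner ws gives
#
#     gamma* = [ (w_mat[i] - ws) . w  -  C * n * (l_mat[i] - ls) ]
#              / || w_mat[i] - ws ||^2 ,
#
# clipped to [0, 1]; eps only guards against a zero denominator when the
# corner coincides with the current block. A quick numerical check of the
# formula and the clipping (illustration values only):

import numpy as np

w_block, ws = np.array([1.0, 0.0]), np.array([0.0, 2.0])
w = np.array([0.5, 0.5])
C, n, l_block, ls, eps = 1.0, 4, 0.3, 0.1, 1e-15
w_diff = w_block - ws
gamma = (w_diff.dot(w) - C * n * (l_block - ls)) / (np.sum(w_diff ** 2) + eps)
gamma = max(0.0, min(1.0, gamma))
assert 0.0 <= gamma <= 1.0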
def test_continuous_y():
    for inference_method in get_installed(["lp", "ad3"]):
        X, Y = generate_blocks(n_samples=1)
        x, y = X[0], Y[0]
        w = np.array([1, 0,  # unary
                      0, 1,
                      0,     # pairwise
                      -4, 0])

        crf = GridCRF(inference_method=inference_method)
        crf.initialize(X, Y)
        psi = crf.psi(x, y)

        y_cont = np.zeros_like(x)
        gx, gy = np.indices(x.shape[:-1])
        y_cont[gx, gy, y] = 1

        # need to generate edge marginals
        vert = np.dot(y_cont[1:, :, :].reshape(-1, 2).T,
                      y_cont[:-1, :, :].reshape(-1, 2))
        # horizontal edges
        horz = np.dot(y_cont[:, 1:, :].reshape(-1, 2).T,
                      y_cont[:, :-1, :].reshape(-1, 2))
        pw = vert + horz

        psi_cont = crf.psi(x, (y_cont, pw))
        assert_array_almost_equal(psi, psi_cont)

        const = find_constraint(crf, x, y, w, relaxed=False)
        const_cont = find_constraint(crf, x, y, w, relaxed=True)

        # dpsi and loss are equal:
        assert_array_almost_equal(const[1], const_cont[1])
        assert_almost_equal(const[2], const_cont[2])

        # returned y_hat is one-hot version of other
        if isinstance(const_cont[0], tuple):
            assert_array_equal(const[0], np.argmax(const_cont[0][0], axis=-1))

            # test loss:
            assert_almost_equal(crf.loss(y, const[0]),
                                crf.continuous_loss(y, const_cont[0][0]))
def _frank_wolfe_bc(self, X, Y):
    """Block-Coordinate Frank-Wolfe learning.

    Compare Algorithm 3 in the reference paper.
    """
    n_samples = len(X)
    w = self.w.copy()
    w_mat = np.zeros((n_samples, self.model.size_joint_feature))
    l_mat = np.zeros(n_samples)
    l = 0.0
    k = 0

    rng = check_random_state(self.random_state)
    for iteration in range(self.max_iter):
        if self.verbose > 0:
            print("Iteration %d" % iteration)

        perm = np.arange(n_samples)
        if self.sample_method == 'perm':
            rng.shuffle(perm)
        elif self.sample_method == 'rnd':
            perm = rng.randint(low=0, high=n_samples, size=n_samples)

        for j in range(n_samples):
            i = perm[j]
            x, y = X[i], Y[i]
            y_hat, delta_joint_feature, slack, loss = find_constraint(
                self.model, x, y, w)
            # ws and ls
            ws = delta_joint_feature * self.C
            ls = loss / n_samples

            # line search
            if self.line_search:
                eps = 1e-15
                w_diff = w_mat[i] - ws
                gamma = ((w_diff.T.dot(w)
                          - (self.C * n_samples) * (l_mat[i] - ls))
                         / (np.sum(w_diff ** 2) + eps))
                gamma = max(0.0, min(1.0, gamma))
            else:
                gamma = 2.0 * n_samples / (k + 2.0 * n_samples)

            w -= w_mat[i]
            w_mat[i] = (1.0 - gamma) * w_mat[i] + gamma * ws
            w += w_mat[i]

            l -= l_mat[i]
            l_mat[i] = (1.0 - gamma) * l_mat[i] + gamma * ls
            l += l_mat[i]

            if self.do_averaging:
                rho = 2. / (k + 2.)
                self.w = (1. - rho) * self.w + rho * w
                self.l = (1. - rho) * self.l + rho * l
            else:
                self.w = w
                self.l = l
            k += 1

        if (self.check_dual_every != 0) and (iteration % self.check_dual_every == 0):
            dual_val, dual_gap, primal_val = self._calc_dual_gap(X, Y)
            self.primal_objective_curve_.append(primal_val)
            self.objective_curve_.append(dual_val)
            self.timestamps_.append(time() - self.timestamps_[0])
            if self.verbose > 0:
                print("dual: %f, dual_gap: %f, primal: %f"
                      % (dual_val, dual_gap, primal_val))
            if self.logger is not None:
                self.logger(self, iteration)
            if dual_gap < self.tol:
                return
def _frank_wolfe_bc(self, param_x, param_y, initialize=True):
    n_samples = len(param_x)
    w = self.w.copy()
    if initialize:
        self.w_mat = np.zeros((n_samples, self.model.size_joint_feature))
        self.l_mat = np.zeros(n_samples)
        self.l_loss = 0.0
        self.k = 0
        self.rng = check_random_state(self.random_state)

    for iteration in range(self.max_iter):
        if self.verbose > 0:
            print("Iteration %d" % iteration)

        perm = np.arange(n_samples)
        if self.sample_method == 'perm':
            self.rng.shuffle(perm)
        elif self.sample_method == 'rnd':
            perm = self.rng.randint(low=0, high=n_samples, size=n_samples)

        for j in range(n_samples):
            i = perm[j]
            x, y = param_x[i], param_y[i]
            y_hat, delta_joint_feature, slack, loss = find_constraint(
                self.model, x, y, w)
            ws = delta_joint_feature * self.C
            ls = loss / n_samples

            if self.line_search:
                eps = 1e-15
                w_diff = self.w_mat[i] - ws
                self.gamma = ((w_diff.T.dot(w)
                               - (self.C * n_samples) * (self.l_mat[i] - ls))
                              / (np.sum(w_diff ** 2) + eps))
                self.gamma = max(0.0, min(1.0, self.gamma))
            else:
                self.gamma = 2.0 * n_samples / (self.k + 2.0 * n_samples)

            w -= self.w_mat[i]
            self.w_mat[i] = (1.0 - self.gamma) * self.w_mat[i] + self.gamma * ws
            w += self.w_mat[i]

            self.l_loss -= self.l_mat[i]
            self.l_mat[i] = (1.0 - self.gamma) * self.l_mat[i] + self.gamma * ls
            self.l_loss += self.l_mat[i]

            if self.do_averaging:
                self.rho = 2. / (self.k + 2.)
                self.w = (1. - self.rho) * self.w + self.rho * w
                self.param_l = (1. - self.rho) * self.param_l \
                    + self.rho * self.l_loss
            else:
                self.w = w
                self.param_l = self.l_loss
            self.k += 1

        if (self.check_dual_every != 0) and (iteration % self.check_dual_every == 0):
            dual_val, dual_gap, primal_val = self._calc_dual_gap(param_x,
                                                                 param_y)
            self.primal_objective_curve_.append(primal_val)
            self.objective_curve_.append(dual_val)
            self.timestamps_.append(time() - self.timestamps_[0])
            if self.verbose > 0:
                print("dual: %f, dual_gap: %f, primal: %f"
                      % (dual_val, dual_gap, primal_val))
            if self.logger is not None:
                self.logger(self, iteration)
            if dual_gap < self.tol:
                return
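# The initialize flag in the variant above moves all optimizer state
# (w_mat, l_mat, k, rng) onto the learner, which makes the method
# warm-startable. A hypothetical usage sketch (the learner variable and
# data names are illustrative, not part of the section above):
#
#     learner._frank_wolfe_bc(X_train, Y_train)                    # fresh run
#     learner._frank_wolfe_bc(X_train, Y_train, initialize=False)  # resume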
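# _calc_dual_gap, called (with slightly shifting signatures) by every
# _frank_wolfe_bc variant above, is not shown in this section. A minimal
# sketch of the quantities it must produce, following the dual and
# duality-gap expressions of the BCFW reference paper and this section's
# own conventions (ws = delta_joint_feature * C, ls = loss / n, l kept on
# self); an assumed reconstruction, not necessarily the exact library code:

import numpy as np

def _calc_dual_gap(self, X, Y):
    n_samples = len(X)
    # most violated constraint at the current w, over the whole data set
    Y_hat = self.model.batch_loss_augmented_inference(X, Y, self.w,
                                                      relaxed=True)
    djoint_feature = (self.model.batch_joint_feature(X, Y)
                      - self.model.batch_joint_feature(X, Y_hat))
    ls = np.sum(self.model.batch_loss(Y, Y_hat)) / n_samples
    ws = djoint_feature * self.C
    # dual value at (w, l): -||w||^2 / 2 + C * n * l
    dual_val = -0.5 * np.sum(self.w ** 2) + self.l * (n_samples * self.C)
    # gap = <w - ws, w> - C * n * (l - ls); primal = dual + gap by definition
    w_diff = self.w - ws
    dual_gap = (w_diff.T.dot(self.w)
                - (self.C * n_samples) * (self.l - ls))
    primal_val = dual_val + dual_gap
    return dual_val, dual_gap, primal_val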