import time

import numpy as np
from joblib import Parallel, delayed
from scipy.optimize import fmin_l_bfgs_b

# decompose_graph, decompose_grid_graph, inference_gco, optimize_chain,
# optimize_kappa, binary_general_graph, latent and f are project-internal
# helpers imported from the surrounding package.


def trw(node_weights, edges, edge_weights, y,
        max_iter=100, verbose=0, tol=1e-3, relaxed=False):
    result = decompose_grid_graph([(node_weights, edges, edge_weights)])
    contains_node, chains, edge_index = result[0][0], result[1][0], result[2][0]

    n_nodes, n_states = node_weights.shape

    # Chain duals are initialized here but never updated below: this
    # variant only uses the gco/kappa decomposition.
    y_hat = []
    lambdas = []
    multiplier = []
    for p in xrange(n_nodes):
        multiplier.append(1.0 / len(contains_node[p]))
    for chain in chains:
        lambdas.append(np.zeros((len(chain), n_states)))
        y_hat.append(np.zeros(len(chain)))
    multiplier = np.array(multiplier).reshape(n_nodes, 1)

    mu = np.zeros((n_nodes, n_states))
    learning_rate = 0.1
    energy_history = []

    for iteration in xrange(max_iter):
        E = 0
        dmu = np.zeros((n_nodes, n_states))

        # Slave 1: graph-cut MAP inference with the dual variable folded
        # into the unaries.
        unaries = node_weights - mu
        y_hat_gco, energy = inference_gco(unaries, edge_weights, edges,
                                          n_iter=5, return_energy=True)
        E -= energy

        # Slave 2: the kappa oracle over the weak labels.
        y_hat_kappa, energy = optimize_kappa(y, mu, 1, n_nodes, n_states)
        E += energy

        # Subgradient step on mu: push the two slave labelings together.
        rows = np.arange(n_nodes)
        dmu[rows, y_hat_gco] -= 1
        dmu[rows, y_hat_kappa] += 1
        mu -= learning_rate * dmu

        energy_history.append(E)
        if iteration:
            learning_rate = 1. / np.sqrt(iteration)
        if verbose:
            print 'Iteration {}: energy {}'.format(iteration, E)
        if iteration and np.abs(E - energy_history[-2]) < tol:
            if verbose:
                print 'Converged'
            break

    return y_hat_gco, y_hat_kappa, energy_history, iteration
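# --- Usage sketch (not part of the original code) --------------------------
# trw only touches the label argument `y` through the interface consumed by
# optimize_kappa; the WeakLabel container below is a hypothetical stand-in
# for whatever label class the surrounding project uses, and the 3x3 grid
# data is made up purely for illustration.

class WeakLabel(object):
    """Hypothetical minimal label container; the real class may differ."""
    def __init__(self, full, weak, weights, full_labeled):
        self.full = np.asarray(full)        # per-node labels (possibly latent)
        self.weak = set(weak)               # image-level label set
        self.weights = np.asarray(weights)  # per-node loss weights
        self.full_labeled = full_labeled    # True if `full` is ground truth


def _demo_trw():
    n_nodes, n_states = 9, 4  # 3x3 grid, 4 labels
    node_weights = np.random.randn(n_nodes, n_states)
    # Horizontal and vertical edges of the 3x3 lattice, row-major order.
    edges = np.array([[i, i + 1] for i in range(8) if i % 3 != 2] +
                     [[i, i + 3] for i in range(6)])
    # One (n_states, n_states) weight matrix per edge (Potts-like).
    edge_weights = np.tile(np.eye(n_states), (edges.shape[0], 1, 1))
    y = WeakLabel(full=np.zeros(n_nodes, dtype=np.int32), weak=[0, 2],
                  weights=np.ones(n_nodes), full_labeled=False)
    return trw(node_weights, edges, edge_weights, y, max_iter=50, verbose=1)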
def loss_augmented_inference(self, x, y, w):
    # Variant for fully labeled examples: at each node, every label
    # except the ground-truth one gains that node's Hamming weight.
    unary_potentials = self._get_unary_potentials(x, w)
    pairwise_potentials = self._get_pairwise_potentials(x, w)
    edges = self._get_edges(x)

    for label in xrange(self.n_states):
        mask = y.full != label
        unary_potentials[mask, label] += y.weights[mask]

    return inference_gco(unary_potentials, pairwise_potentials, edges,
                         n_iter=5, return_energy=True)
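# The augmentation loop above is margin rescaling with a weighted Hamming
# loss: at every node, each label except the ground-truth one gains that
# node's loss weight. A tiny standalone check of just that loop, with
# illustrative values only:

def _check_hamming_augmentation():
    unary = np.zeros((2, 3))            # 2 nodes, 3 states
    full = np.array([2, 0])             # ground-truth labels
    weights = np.array([1.0, 0.5])      # per-node loss weights
    for label in range(3):
        mask = full != label
        unary[mask, label] += weights[mask]
    # node 0: true label 2 untouched -> [1., 1., 0.]
    # node 1: true label 0 untouched -> [0., .5, .5]
    assert np.allclose(unary, [[1.0, 1.0, 0.0], [0.0, 0.5, 0.5]])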
def loss_augmented_inference(self, x, y, w):
    # Variant for weakly labeled examples: labels outside the weak set
    # are penalized at every node, and each weak label carries a label
    # cost so that leaving it out of the solution is penalized too.
    unary_potentials = self._get_unary_potentials(x, w)
    pairwise_potentials = self._get_pairwise_potentials(x, w)
    edges = self._get_edges(x)

    label_costs = np.zeros(self.n_states)
    c = np.sum(y.weights) / float(self.n_states)
    for label in y.weak:
        label_costs[label] = c

    for label in xrange(self.n_states):
        if label not in y.weak:
            unary_potentials[:, label] += y.weights

    return inference_gco(unary_potentials, pairwise_potentials, edges,
                         label_costs, n_iter=5, return_energy=True)
def fit(self, X, Y, train_scorer, test_scorer, decompose='general',
        use_latent_first_iter=500, undergenerating_weak=True, smd=False):
    self.logger.info('Initialization')

    if decompose == 'general':
        contains_node, chains, edge_index = decompose_graph(X)
    elif decompose == 'grid':
        contains_node, chains, edge_index = decompose_grid_graph(X)
    else:
        raise ValueError("decompose must be 'general' or 'grid'")

    # Per-example dual variables: chain duals (lambdas), their current
    # argmins (y_hat), node multipliers, L-BFGS smoothing variables (xx)
    # and the weak-label duals (mu, only for weakly labeled examples).
    y_hat = []
    lambdas = []
    multiplier = []
    xx = []
    mu = {}
    for k in xrange(len(X)):
        x, y = X[k], Y[k]
        n_nodes = x[0].shape[0]
        xx.append(np.zeros(n_nodes))

        _lambdas = []
        _y_hat = []
        _multiplier = []
        for p in xrange(n_nodes):
            _multiplier.append(1.0 / len(contains_node[k][p]))
        for chain in chains[k]:
            _lambdas.append(np.zeros((len(chain), self.n_states)))
            _y_hat.append(np.zeros(len(chain), dtype=np.int32))
        lambdas.append(_lambdas)
        y_hat.append(_y_hat)
        multiplier.append(np.array(_multiplier).reshape(n_nodes, 1))

        if not y.full_labeled:
            mu[k] = np.zeros((n_nodes, self.n_states))

    w = np.zeros(self.size_w)
    self.w = w.copy()

    self.start_time = time.time()
    self.timestamps = [0]
    self.objective_curve = []
    self.train_score = []
    self.test_score = []
    self.w_history = []

    learning_rate1 = 0.1
    learning_rate2 = 0.1

    for iteration in xrange(self.max_iter):
        self.logger.info('Iteration %d', iteration)
        self.logger.info('Optimize slave MRF and update w')

        objective = 0
        dw = np.zeros(w.shape)

        for k in xrange(len(X)):
            x, y = X[k], Y[k]
            n_nodes = x[0].shape[0]

            if y.full_labeled:
                # Fully labeled example: loss-augmented unaries, solved
                # chain by chain under the dual decomposition.
                unaries = self._loss_augment_unaries(
                    self._get_unary_potentials(x, w), y.full, y.weights)
                unaries *= multiplier[k]
                pairwise = self._get_pairwise_potentials(x, w)

                jf = self._joint_features_full(x, y.full)
                objective -= np.dot(w, jf)
                dw -= jf

                for i in xrange(len(chains[k])):
                    y_hat[k][i], energy = optimize_chain(
                        chains[k][i],
                        lambdas[k][i] + unaries[chains[k][i], :],
                        pairwise, edge_index[k])
                    dw += self._joint_features(chains[k][i], x, y_hat[k][i],
                                               edge_index[k], multiplier[k])
                    objective += energy
            elif iteration > use_latent_first_iter:
                if undergenerating_weak:
                    # Use gco for full K oracle:
                    # y_hat_, energy = self.loss_augmented_inference(x, y, w)
                    # jf_gt = self._joint_features_full(x, y.full)
                    # objective -= np.dot(w, jf_gt)
                    # objective += energy
                    # dw -= jf_gt
                    # dw += self._joint_features_full(x, y_hat_)

                    # Use gco for the first summand in DD.
                    for mm in xrange(10):
                        dmu = np.zeros((n_nodes, self.n_states))
                        unaries = self._get_unary_potentials(x, w) - mu[k]
                        pairwise = self._get_pairwise_potentials(x, w)

                        y_hat_gco, energy = inference_gco(
                            unaries, pairwise, self._get_edges(x),
                            n_iter=5, return_energy=True)
                        objective -= energy
                        rows = np.arange(n_nodes)
                        dmu[rows, y_hat_gco] -= 1
                        dw += self._joint_features_full(x, y_hat_gco)

                        jf = self._joint_features_full(x, y.full)
                        objective -= np.dot(w, jf)
                        dw -= jf

                        y_hat_kappa, energy = optimize_kappa(
                            y, mu[k], self.alpha, n_nodes, self.n_states)
                        objective += energy
                        dmu[rows, y_hat_kappa] += 1

                        mu[k] -= learning_rate2 * dmu
                elif not smd:
                    dmu = np.zeros((n_nodes, self.n_states))
                    unaries = (self._get_unary_potentials(x, w)
                               - mu[k]) * multiplier[k]
                    pairwise = self._get_pairwise_potentials(x, w)

                    jf = self._joint_features_full(x, y.full)
                    objective -= np.dot(w, jf)
                    dw -= jf

                    # Begin inner loop (remove to restore previous state).
                    E = 0
                    Eprev = -100
                    for j in xrange(self.update_mu):
                        E = 0
                        for i in xrange(len(chains[k])):
                            y_hat[k][i], energy = optimize_chain(
                                chains[k][i],
                                lambdas[k][i] + unaries[chains[k][i], :],
                                pairwise, edge_index[k])
                            E += energy
                        lambda_sum = np.zeros((n_nodes, self.n_states),
                                              dtype=np.float64)
                        for p in xrange(n_nodes):
                            for i in contains_node[k][p]:
                                pos = np.where(chains[k][i] == p)[0][0]
                                lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]
                        for i in xrange(len(chains[k])):
                            N = lambdas[k][i].shape[0]
                            lambdas[k][i][np.arange(N), y_hat[k][i]] -= learning_rate2
                            lambdas[k][i] += learning_rate2 * lambda_sum[chains[k][i], :]
                        if np.abs(E - Eprev) < 0.1:
                            break
                        Eprev = E
                    # End inner loop.

                    # One last pass with the final lambdas.
                    for i in xrange(len(chains[k])):
                        y_hat[k][i], energy = optimize_chain(
                            chains[k][i],
                            lambdas[k][i] + unaries[chains[k][i], :],
                            pairwise, edge_index[k])
                        dw += self._joint_features(chains[k][i], x, y_hat[k][i],
                                                   edge_index[k], multiplier[k])
                        objective += energy
                        dmu[chains[k][i], y_hat[k][i]] -= \
                            multiplier[k][chains[k][i]].flatten()

                    y_hat_kappa, energy = optimize_kappa(
                        y, mu[k], self.alpha, n_nodes, self.n_states)
                    objective += energy
                    dmu[np.arange(n_nodes), y_hat_kappa] += 1

                    mu[k] -= learning_rate2 * dmu
                elif smd:
                    # Smoothed dual: one binary subproblem per state; the
                    # smoothing variables xx[k] are fitted with L-BFGS
                    # (f is the objective passed to the optimizer, defined
                    # elsewhere in the package).
                    mMu = 10 if iteration > 1500 else 1
                    for mm in xrange(mMu):
                        dmu = np.zeros((n_nodes, self.n_states))
                        jf = self._joint_features_full(x, y.full)
                        objective -= np.dot(w, jf)
                        dw -= jf

                        unaries = -self._get_unary_potentials(x, w) + mu[k]
                        edge_weights = -self._get_pairwise_potentials(x, w)
                        edges = self._get_edges(x)
                        n_edges = edges.shape[0]

                        y_hat2 = []
                        pairwise = []
                        for j in xrange(self.n_states):
                            y_hat2.append(np.zeros(n_nodes))  # overwritten below
                            _pairwise = np.zeros((n_edges, 2, 2))
                            for i in xrange(n_edges):
                                _pairwise[i, 1, 0] = _pairwise[i, 0, 1] = \
                                    -0.5 * edge_weights[i, j, j]
                            pairwise.append(_pairwise)
                        for i in xrange(n_edges):
                            e1, e2 = edges[i]
                            unaries[e1, :] += 0.5 * np.diag(edge_weights[i, :, :])
                            unaries[e2, :] += 0.5 * np.diag(edge_weights[i, :, :])

                        xx[k], f_val, d = fmin_l_bfgs_b(
                            f, xx[k], args=(unaries, pairwise, edges),
                            maxiter=50, maxfun=50, pgtol=1e-2)

                        E = np.sum(xx[k])
                        for j in xrange(self.n_states):
                            new_unaries = np.zeros((n_nodes, 2))
                            new_unaries[:, 1] = unaries[:, j] + xx[k]
                            y_hat2[j], energy = binary_general_graph(
                                edges, new_unaries, pairwise[j])
                            E -= 0.5 * energy
                            dmu[:, j] -= y_hat2[j]
                            dw += self._joint_features_full(x, y_hat2[j] * j)

                        y_hat_kappa, energy = optimize_kappa(
                            y, mu[k], 1, n_nodes, self.n_states)
                        E += energy
                        dmu[np.arange(n_nodes), y_hat_kappa] += 1

                        objective += E
                        mu[k] -= learning_rate2 * dmu

        dw += w / self.C
        if iteration < 100 or iteration % self.update_w_every == 0:
            w -= learning_rate1 * dw
        objective = self.C * objective + np.sum(w ** 2) / 2

        self.logger.info('Update lambda')
        for k in xrange(len(X)):
            if (undergenerating_weak or smd) and not Y[k].full_labeled:
                continue
            n_nodes = X[k][0].shape[0]
            lambda_sum = np.zeros((n_nodes, self.n_states), dtype=np.float64)
            for p in xrange(n_nodes):
                for i in contains_node[k][p]:
                    pos = np.where(chains[k][i] == p)[0][0]
                    lambda_sum[p, y_hat[k][i][pos]] += multiplier[k][p]
            for i in xrange(len(chains[k])):
                N = lambdas[k][i].shape[0]
                lambdas[k][i][np.arange(N), y_hat[k][i]] -= learning_rate2
                lambdas[k][i] += learning_rate2 * lambda_sum[chains[k][i], :]

        if iteration % self.complete_every == 0 or iteration in [51, 80, 101, 130]:
            self.logger.info('Complete latent variables')
            Y_new = Parallel(n_jobs=self.n_jobs, verbose=0, max_nbytes=1e8)(
                delayed(latent)(self.model, x, y, w) for x, y in zip(X, Y))
            changes = np.sum([np.any(y_new.full != y.full)
                              for y_new, y in zip(Y_new, Y)])
            self.logger.info('changes in latent variables: %d', changes)
            Y = Y_new

        if iteration and (iteration % self.check_every == 0):
            self.logger.info('Compute train and test scores')
            self.train_score.append(train_scorer(w))
            self.logger.info('Train SCORE: %f', self.train_score[-1])
            self.test_score.append(test_scorer(w))
            self.logger.info('Test SCORE: %f', self.test_score[-1])
            self.logger.info('diff: %f', np.sum((w - self.w) ** 2))

        if iteration:
            learning_rate1 = 1.0 / iteration
            learning_rate2 = 1.0 / iteration

        self.timestamps.append(time.time() - self.start_time)
        self.objective_curve.append(objective)
        self.logger.info('Objective: %f', objective)

        self.w = w.copy()
        self.w_history.append(self.w)

    self.w = w
    self.timestamps = np.array(self.timestamps)
    self.objective_curve = np.array(self.objective_curve)
    self.train_score = np.array(self.train_score)
    self.test_score = np.array(self.test_score)
    self.w_history = np.vstack(self.w_history)
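# --- optimize_chain sketch (assumption, not the original implementation) ---
# optimize_chain is called throughout fit() but is not shown in this section.
# In the usual dual-decomposition setup it returns an exact MAP labeling of
# one chain together with its energy. A minimal Viterbi sketch under that
# assumption, with `pairwise` simplified to a single (n_states, n_states)
# matrix shared by all consecutive pairs (the real code looks up per-edge
# potentials through edge_index); flip argmax/max to minimization if the
# surrounding convention treats energies as costs.

def optimize_chain_viterbi(chain, unaries, pairwise):
    """chain    : (m,) node ids along the chain
    unaries  : (m, n_states) scores per chain position
    pairwise : (n_states, n_states) score for consecutive label pairs
    Returns (labels, best_score)."""
    m, n_states = unaries.shape
    score = unaries[0].copy()
    backp = np.zeros((m, n_states), dtype=np.int32)
    for t in range(1, m):
        cand = score[:, None] + pairwise           # cand[p, c]: prev p -> cur c
        backp[t] = np.argmax(cand, axis=0)         # best predecessor per state
        score = cand[backp[t], np.arange(n_states)] + unaries[t]
    labels = np.zeros(m, dtype=np.int32)
    labels[-1] = np.argmax(score)
    for t in range(m - 1, 0, -1):                  # backtrack
        labels[t - 1] = backp[t, labels[t]]
    return labels, score.max()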