def main(_):
    # setting up output directory
    outdir = os.path.expanduser(FLAGS.outdir)
    # os.makedirs(outdir, exist_ok=True)

    # DATA
    N, M, D, R_true, I_train, I_test = get_data()

    # MODEL
    I = tf.placeholder(tf.float32, [N, M])
    scale_uv = tf.concat([tf.ones([D, N]), tf.ones([D, M])], axis=1)
    mean_uv = tf.concat([tf.zeros([D, N]), tf.zeros([D, M])], axis=1)
    UV = Normal(loc=mean_uv, scale=scale_uv)
    # generator dist. for matrix
    R = Normal(
        loc=tf.matmul(tf.transpose(UV[:, :N]), UV[:, N:]),
        scale=tf.ones([N, M]))
    R_mask = R * I  # generated masked matrix

    sess = tf.InteractiveSession()
    p_joint = Joint(R_true, I_train, sess, D, N, M)

    # INFERENCE
    mean_suv = tf.concat([
        tf.get_variable("qU/loc", [D, N]),
        tf.get_variable("qV/loc", [D, M])
    ], axis=1)
    scale_suv = tf.concat([
        tf.nn.softplus(tf.get_variable("qU/scale", [D, N])),
        tf.nn.softplus(tf.get_variable("qV/scale", [D, M]))
    ], axis=1)
    qUV = Normal(loc=mean_suv, scale=scale_suv)
    inference = ed.KLqp({UV: qUV}, data={R_mask: R_true, I: I_train})
    inference.run(n_iter=FLAGS.VI_iter)

    # CRITICISM
    cR = ed.copy(R_mask, {UV: qUV})  # reconstructed matrix
    test_mse = ed.evaluate(
        'mean_squared_error', data={cR: R_true, I: I_test.astype(bool)})
    logger.info("iters %d ed test mse %.5f" % (FLAGS.VI_iter, test_mse))
    train_mse = ed.evaluate(
        'mean_squared_error', data={cR: R_true, I: I_train.astype(bool)})
    logger.info("iters %d ed train mse %.5f" % (FLAGS.VI_iter, train_mse))

    elbo_t = elbo(qUV, p_joint)
    logger.info('iters %d elbo %.2f' % (FLAGS.VI_iter, elbo_t))
def main(_):
    # setting up output directory
    outdir = os.path.expanduser(FLAGS.outdir)
    os.makedirs(outdir, exist_ok=True)

    # DATA
    N, M, D, R_true, I_train, I_test = get_data()
    debug('N, M, D', N, M, D)

    # Solution components
    weights, qUVt_components = [], []

    # Files to log metrics
    times_filename = os.path.join(outdir, 'times.csv')
    mse_train_filename = os.path.join(outdir, 'mse_train.csv')
    mse_test_filename = os.path.join(outdir, 'mse_test.csv')
    ll_test_filename = os.path.join(outdir, 'll_test.csv')
    ll_train_filename = os.path.join(outdir, 'll_train.csv')
    elbos_filename = os.path.join(outdir, 'elbos.csv')
    gap_filename = os.path.join(outdir, 'gap.csv')
    step_filename = os.path.join(outdir, 'steps.csv')
    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        lipschitz_filename = os.path.join(outdir, 'lipschitz.csv')
        iter_info_filename = os.path.join(outdir, 'iter_info.txt')

    start = 0
    if FLAGS.restore:
        # restore the latest serialized iterate
        parameters = np.load(os.path.join(outdir, 'qt_latest.npz'))
        weights = list(parameters['weights'])
        start = parameters['fw_iter']
        qUVt_components = list(parameters['comps'])
        assert len(weights) == len(qUVt_components), "Inconsistent storage"
        # Get the Lipschitz estimate from the log file. It could have been
        # stored in the params, but that would mean a different saved file
        # for the adaptive variants.
        if FLAGS.fw_variant.startswith('ada'):
            lipschitz_filename = os.path.join(outdir, 'lipschitz.csv')
            if not os.path.isfile(lipschitz_filename):
                raise ValueError("Inconsistent storage")
            with open(lipschitz_filename, 'r') as f:
                l = f.readlines()
                lipschitz_estimate = float(l[-1].strip())
    else:
        # empty the files already present in the folder
        open(times_filename, 'w').close()
        open(mse_train_filename, 'w').close()
        open(mse_test_filename, 'w').close()
        open(ll_test_filename, 'w').close()
        open(ll_train_filename, 'w').close()
        open(elbos_filename, 'w').close()
        open(gap_filename, 'w').close()
        open(step_filename, 'w').close()
        # 'adafw', 'ada_afw', 'ada_pfw'
        if FLAGS.fw_variant.startswith('ada'):
            open(lipschitz_filename, 'w').close()
            open(iter_info_filename, 'w').close()

    for t in range(start, start + FLAGS.n_fw_iter):
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                # MODEL
                I = tf.placeholder(tf.float32, [N, M])
                scale_uv = tf.concat(
                    [tf.ones([D, N]), tf.ones([D, M])], axis=1)
                mean_uv = tf.concat(
                    [tf.zeros([D, N]), tf.zeros([D, M])], axis=1)
                UV = Normal(loc=mean_uv, scale=scale_uv)
                # generator dist. for matrix
                R = Normal(
                    loc=tf.matmul(tf.transpose(UV[:, :N]), UV[:, N:]),
                    scale=tf.ones([N, M]))
                R_mask = R * I  # generated masked matrix

                p_joint = Joint(R_true, I_train, sess, D, N, M)

                if t == 0:
                    fw_iterates = {}
                else:
                    # Current solution
                    prev_components = [
                        coreutils.base_loc_scale(
                            'mvn0', c['loc'], c['scale'], multivariate=False)
                        for c in qUVt_components
                    ]
                    qUV_prev = coreutils.get_mixture(weights, prev_components)
                    fw_iterates = {UV: qUV_prev}

                # LMO (via relbo INFERENCE)
                mean_suv = tf.concat([
                    tf.get_variable("qU/loc", [D, N]),
                    tf.get_variable("qV/loc", [D, M])
                ], axis=1)
                scale_suv = tf.concat([
                    tf.nn.softplus(tf.get_variable("qU/scale", [D, N])),
                    tf.nn.softplus(tf.get_variable("qV/scale", [D, M]))
                ], axis=1)
                sUV = Normal(loc=mean_suv, scale=scale_suv)

                inference = relbo.KLqp(
                    {UV: sUV},
                    data={R_mask: R_true, I: I_train},
                    fw_iterates=fw_iterates,
                    fw_iter=t)
                inference.run(n_iter=FLAGS.LMO_iter)

                loc_s = sUV.mean().eval()
                scale_s = sUV.stddev().eval()
                # sUV is a batched distribution; there are issues building a
                # Mixture with batch distributions. mvn0 has event size
                # (D, N + M) and batch size ().
                # NOTE log_prob(sample) still returns a tensor.
                # mvn and multivariatenormaldiag work for 1-D, not 2-D shapes
                sUV_mv = coreutils.base_loc_scale(
                    'mvn0', loc_s, scale_s, multivariate=False)
                # TODO send sUV or sUV_mv as argument to step size? sample()
                # works the same way, same with log_prob

                total_time = 0.
                data = {R: R_true, I: I_train}
                if t == 0:
                    gamma = 1.
                    lipschitz_estimate = opt.adafw_linit()
                    step_type = 'init'
                elif FLAGS.fw_variant == 'fixed':
                    start_step_time = time.time()
                    step_result = opt.fixed(weights, qUVt_components,
                                            qUV_prev, loc_s, scale_s, sUV,
                                            p_joint, data, t)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                elif FLAGS.fw_variant == 'line_search':
                    start_step_time = time.time()
                    step_result = opt.line_search_dkl(weights,
                                                      qUVt_components,
                                                      qUV_prev, loc_s,
                                                      scale_s, sUV, p_joint,
                                                      data, t)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                elif FLAGS.fw_variant == 'adafw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_fw(weights, qUVt_components,
                                                  qUV_prev, loc_s, scale_s,
                                                  sUV, p_joint, data, t,
                                                  lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type == 'adaptive':
                        lipschitz_estimate = step_result['l_estimate']
                elif FLAGS.fw_variant == 'ada_pfw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_pfw(weights, qUVt_components,
                                                   qUV_prev, loc_s, scale_s,
                                                   sUV, p_joint, data, t,
                                                   lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type in ['adaptive', 'drop']:
                        lipschitz_estimate = step_result['l_estimate']
                elif FLAGS.fw_variant == 'ada_afw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_afw(weights, qUVt_components,
                                                   qUV_prev, loc_s, scale_s,
                                                   sUV, p_joint, data, t,
                                                   lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type in ['adaptive', 'away', 'drop']:
                        lipschitz_estimate = step_result['l_estimate']

                if t == 0:
                    weights.append(gamma)
                    qUVt_components.append({'loc': loc_s, 'scale': scale_s})
                    new_components = [sUV_mv]
                else:
                    qUVt_components = step_result['params']
                    weights = step_result['weights']
                    gamma = step_result['gamma']
                    new_components = [
                        coreutils.base_loc_scale(
                            'mvn0', c['loc'], c['scale'], multivariate=False)
                        for c in qUVt_components
                    ]

                qUV_new = coreutils.get_mixture(weights, new_components)
                qR = ed.copy(R, {UV: qUV_new})
                cR = ed.copy(R_mask, {UV: qUV_new})  # reconstructed matrix

                # Log metrics for current iteration
                logger.info('total time %f' % total_time)
                append_to_file(times_filename, total_time)

                logger.info('iter %d, gamma %.4f' % (t, gamma))
                append_to_file(step_filename, gamma)

                if t > 0:
                    gap_t = step_result['gap']
                    logger.info('iter %d, gap %.4f' % (t, gap_t))
                    append_to_file(gap_filename, gap_t)

                # CRITICISM
                if FLAGS.fw_variant.startswith('ada'):
                    append_to_file(lipschitz_filename, lipschitz_estimate)
                    append_to_file(iter_info_filename, step_type)
                    logger.info('lt = %.5f, iter_type = %s' %
                                (lipschitz_estimate, step_type))

                test_mse = ed.evaluate(
                    'mean_squared_error', data={cR: R_true, I: I_test})
                logger.info("iter %d ed test mse %.5f" % (t, test_mse))
                append_to_file(mse_test_filename, test_mse)

                train_mse = ed.evaluate(
                    'mean_squared_error', data={cR: R_true, I: I_train})
                logger.info("iter %d ed train mse %.5f" % (t, train_mse))
                append_to_file(mse_train_filename, train_mse)

                # log_likelihood(qUV_new, ...) is very slow; use ed.evaluate
                train_ll = ed.evaluate(
                    'log_lik',
                    data={qR: R_true.astype(np.float32), I: I_train})
                logger.info("iter %d train log lik %.5f" % (t, train_ll))
                append_to_file(ll_train_filename, train_ll)

                test_ll = ed.evaluate(
                    'log_lik',
                    data={qR: R_true.astype(np.float32), I: I_test})
                logger.info("iter %d test log lik %.5f" % (t, test_ll))
                append_to_file(ll_test_filename, test_ll)

                # elbo_loss might be meaningless
                elbo_loss = elboModel.KLqp({UV: qUV_new},
                                           data={R: R_true, I: I_train})
                elbo_t = elbo(qUV_new, p_joint)
                res_update = elbo_loss.run()
                logger.info('iter %d -elbo loss %.2f or %.2f' %
                            (t, res_update['loss'], elbo_t))
                append_to_file(elbos_filename,
                               "%f,%f" % (elbo_t, res_update['loss']))

                # serialize the current iterate
                np.savez(
                    os.path.join(outdir, 'qt_latest.npz'),
                    weights=weights,
                    comps=qUVt_components,
                    fw_iter=t + 1)

                sess.close()
        tf.reset_default_graph()
def adaptive_pfw(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, l_prev):
    """Adaptive pairwise variant.

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list containing dictionary of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean for LMO solution s
        cov_s: [dim], cov matrix for LMO solution s
        s_t: current atom & LMO solution s
        p: joint distribution p(z, x)
        data: training data
        k: iteration number of Frank-Wolfe
        l_prev: previous lipschitz estimate
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        lipschitz estimate, duality gap of current iterate and
        step information
    """
    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric)
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    # Find v_t, the away atom
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps)
    v_t = qcomps[index_v_t]

    # Pairwise gap
    step_s = grad_kl_dotp(q_t, p, s_t)
    gap_pw = step_v_t - step_s
    logger.info('Pairwise gap %.3e' % gap_pw)
    if gap_pw <= 0:
        logger.warning('Pairwise gap <= 0, returning fixed step')
        return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, data, k,
                     gap_pw)
    gap = gap_pw

    # The pairwise step can move at most the whole weight of v_t onto s_t
    MAX_GAMMA = weights[index_v_t]

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = -elbo(q_t, p)
    debug('f(q_t) = %.3e' % f_t)
    is_drop_step = False
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute L_t and gamma_t
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), MAX_GAMMA)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct q_{t + 1}; handle the case of gamma == MAX_GAMMA
        # (a drop step) separately
        new_weights = copy.copy(weights)
        new_weights.append(gamma)
        new_params = copy.copy(params)
        new_params.append({'loc': mu_s, 'scale': cov_s})
        if gamma != MAX_GAMMA:
            new_weights[index_v_t] -= gamma
            is_drop_step = False
        else:
            # the weight of v_t becomes exactly 0, so drop the atom
            del new_weights[index_v_t]
            del new_params[index_v_t]
            is_drop_step = True

        new_components = [
            coreutils.base_loc_scale(
                FLAGS.base_dist, c['loc'], c['scale'], multivariate=False)
            for c in new_params
        ]
        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            # Adaptive loop succeeded
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': 'drop' if is_drop_step else 'adaptive'
            }
        pow_tau *= tau
        i += 1

    # gamma below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, gap)
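
# Illustrative only: a minimal, self-contained sketch (not part of the
# original pipeline) of the weight bookkeeping performed by adaptive_pfw. A
# pairwise step moves gamma units of mass from the away atom v_t onto the new
# atom s_t, capped at MAX_GAMMA = weights[index_v_t], so the weights remain a
# valid probability vector.
def _pairwise_weight_update_sketch():
    weights = [0.6, 0.4]           # current atom weights, sum to 1
    index_v_t, gamma = 1, 0.25     # away atom and step size, gamma <= weights[index_v_t]
    new_weights = list(weights)
    new_weights.append(gamma)      # new atom s_t receives weight gamma
    new_weights[index_v_t] -= gamma  # the same mass leaves v_t
    assert abs(sum(new_weights) - 1.0) < 1e-12
    return new_weights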
def adaptive_afw(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, l_prev):
    """Adaptive Away Steps algorithm.

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list containing dictionary of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean for LMO solution s
        cov_s: [dim], cov matrix for LMO solution s
        s_t: current atom & LMO solution s
        p: joint distribution p(z, x)
        data: training data
        k: iteration number of Frank-Wolfe
        l_prev: previous lipschitz estimate
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        lipschitz estimate, duality gap of current iterate and
        step information
    """
    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric)
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    # Find v_t, the away atom
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps)
    v_t = qcomps[index_v_t]

    # Frank-Wolfe gap
    step_s = grad_kl_dotp(q_t, p, s_t)
    step_q = grad_kl_dotp(q_t, p, q_t)
    gap_fw = step_q - step_s
    if gap_fw < 0:
        logger.warning("Frank-Wolfe duality gap is negative")
    # Away gap
    gap_a = step_v_t - step_q
    if gap_a < 0:
        eprint('Away gap < 0!!!')
    logger.info('fw gap %.3e, away gap %.3e' % (gap_fw, gap_a))

    if (gap_fw >= gap_a) or (len(params) == 1):
        # FW direction, proceeds exactly as adafw
        logger.info('Proceeding in FW direction ')
        return adaptive_fw(weights, params, q_t, mu_s, cov_s, s_t, p, data, k,
                           l_prev, gap_fw)

    # Away direction
    logger.info('Proceeding in Away direction ')
    adaptive_step_type = 'away'
    gap = gap_a
    if weights[index_v_t] < 1.0:
        MAX_GAMMA = weights[index_v_t] / (1.0 - weights[index_v_t])
    else:
        MAX_GAMMA = 100.  # large value when t = 1

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = -elbo(q_t, p)
    debug('f(q_t) = %.5f' % (f_t))
    is_drop_step = False
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        # NOTE: Handle extreme values of gamma carefully
        gamma = min(gap / (l_t * d_t_norm), MAX_GAMMA)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct $q_{t + 1}$
        new_weights = copy.copy(weights)
        new_params = copy.copy(params)
        if gamma == MAX_GAMMA:
            # drop v_t and renormalize the remaining weights
            is_drop_step = True
            del new_weights[index_v_t]
            new_weights = [(1. + gamma) * w for w in new_weights]
            del new_params[index_v_t]
        else:
            is_drop_step = False
            new_weights = [(1. + gamma) * w for w in new_weights]
            new_weights[index_v_t] -= gamma

        new_components = [
            coreutils.base_loc_scale(
                FLAGS.base_dist, c['loc'], c['scale'], multivariate=False)
            for c in new_params
        ]
        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': "drop" if is_drop_step else "away"
            }
        pow_tau *= tau
        i += 1

    # gamma below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, gap)
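
# Illustrative only: a minimal sketch (not part of the original pipeline) of
# the away-step weight update used in adaptive_afw. Every weight is scaled by
# (1 + gamma) and gamma is then subtracted from the away atom v_t, which keeps
# the weights summing to 1; at gamma == MAX_GAMMA = w_v / (1 - w_v) the weight
# of v_t reaches exactly 0 and the atom is dropped.
def _away_weight_update_sketch():
    weights = [0.5, 0.3, 0.2]    # current atom weights, sum to 1
    index_v_t, gamma = 1, 0.1    # away atom and step size
    new_weights = [(1. + gamma) * w for w in weights]
    new_weights[index_v_t] -= gamma
    assert abs(sum(new_weights) - 1.0) < 1e-12
    return new_weights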
def adaptive_fw(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, l_prev,
                gap=None):
    """Adaptive Frank-Wolfe algorithm.

    Sets the step size as suggested in Algorithm 1 of
    https://arxiv.org/pdf/1806.05123.pdf

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list containing dictionary of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean for LMO solution s
        cov_s: [dim], cov matrix for LMO solution s
        s_t: current atom & LMO solution s
        p: joint distribution p(z, x)
        data: training data
        k: iteration number of Frank-Wolfe
        l_prev: previous lipschitz estimate
        gap: duality gap (if already computed)
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        lipschitz estimate, duality gap of current iterate and
        step information
    """
    # NOTE TODO try div(q_t, s_t)
    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric)
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    if gap is None:
        step_s = grad_kl_dotp(q_t, p, s_t)
        step_q = grad_kl_dotp(q_t, p, q_t)
        gap = step_q - step_s
    logger.info('duality gap %.3e' % gap)
    if gap < 0:
        # return the fixed-step estimate if the gap is negative
        logger.warning("Duality gap is negative, returning fixed step")
        return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, gap)

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    # NOTE: this is from v1 of the paper; the newer version replaces the
    # multiplicative eta with a divisor eta
    pow_tau = 1.0
    i, l_t = 0, l_prev

    # The objective in this case is -ELBO, i.e. the ELBO loss
    f_t = -elbo(q_t, p)
    debug('f(q_t) = %.3e' % (f_t))
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), 1.0)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.3e, quad d2 = %.3e' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # $w_{t + 1} = [(1 - \gamma) w_t, \gamma]$
        # Handle the case gamma = 1.0 separately: the old weights might not
        # become exactly 0 because of precision issues, and 0-weight
        # components should be removed.
        if gamma != 1.0:
            new_weights = copy.copy(weights)
            new_weights = [(1. - gamma) * w for w in new_weights]
            new_weights.append(gamma)
            new_params = copy.copy(params)
            new_params.append({'loc': mu_s, 'scale': cov_s})
        else:
            new_weights = [1.]
            new_params = [{'loc': mu_s, 'scale': cov_s}]

        new_components = [
            coreutils.base_loc_scale(
                FLAGS.base_dist, c['loc'], c['scale'], multivariate=False)
            for c in new_params
        ]
        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            # Adaptive loop succeeded
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': 'adaptive'
            }
        pow_tau *= tau
        i += 1

    # gamma below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, data, k, gap)
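
# Illustrative only: a standalone numpy sketch (not part of the original
# pipeline) of the backtracking step-size rule used in adaptive_fw, applied to
# a toy quadratic f(x) = 0.5 * ||x - b||^2 instead of the ELBO loss. The
# candidate Lipschitz estimate l_t is inflated by eta * tau**i until the
# quadratic bound f(x_t) - gamma * gap + 0.5 * gamma**2 * l_t * d holds at
# gamma = min(gap / (l_t * d), 1). All names here are local to the sketch.
def _adaptive_step_size_sketch(l_prev=1e-3, eta=0.99, tau=2.0, max_iter=32):
    import numpy as np  # used only for this sketch

    def f(x, b):
        return 0.5 * np.sum((x - b) ** 2)

    b = np.array([0.25, 0.75])
    x_t = np.array([1.0, 0.0])      # current iterate (an atom)
    s_t = np.array([0.0, 1.0])      # LMO solution
    grad = x_t - b
    gap = grad.dot(x_t - s_t)       # Frank-Wolfe duality gap
    d = np.sum((s_t - x_t) ** 2)    # plays the role of d_t_norm
    pow_tau, l_t = 1.0, l_prev
    for _ in range(max_iter):
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d), 1.0)
        rhs = f(x_t, b) - gamma * gap + 0.5 * gamma ** 2 * l_t * d
        x_new = (1. - gamma) * x_t + gamma * s_t
        if f(x_new, b) <= rhs:      # quadratic bound holds: accept gamma, l_t
            return gamma, l_t
        pow_tau *= tau              # otherwise inflate the Lipschitz estimate
    return None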