def gtv_cvlam(X, y, q, num_folds=5, num_lams=20):
    """Pick the GFL penalty by cross-validation, then refit on all the data.

    For every fold the training rows are bucketed with ``bucket_vals``, the
    full solution path is computed with ``solve_gfl`` (``num_lams`` penalties
    between 0.1 and 20), and each solution on the path is scored by ``mse``
    on the held-out rows. The penalty with the lowest fold-averaged score is
    used for a final fit on the complete data set.

    Parameters
    ----------
    X, y : indexable with a boolean mask (e.g. numpy arrays)
        Covariates and responses; ``len(X)`` gives the number of rows.
    q : passed through to ``bucket_vals``; also the shape the final
        solution is reshaped to.
    num_folds : int
        Number of folds requested from ``create_folds``; also the divisor
        used when averaging the scores.
    num_lams : int
        Number of penalty values on the solution path.

    Returns
    -------
    tuple
        ``(beta.reshape(q), grid)`` from the final fit.
    """
    num_rows = len(X)
    cv_scores = np.zeros(num_lams)
    lam_path = None

    for fold_idx, held_out_rows in enumerate(create_folds(num_rows, num_folds)):
        # Boolean selector for the held-out rows; its complement is the
        # training set.
        is_test = np.zeros(num_rows, dtype=bool)
        is_test[held_out_rows] = True
        is_train = ~is_test

        x_fit, y_fit = X[is_train], y[is_train]
        x_eval, y_eval = X[is_test], y[is_test]

        fit_data, fit_weights, fit_grid = bucket_vals(x_fit, y_fit, q)
        path = solve_gfl(fit_data, None, weights=fit_weights, full_path=True,
                         minlam=0.1, maxlam=20., numlam=num_lams)

        per_lambda_error = []
        for candidate in path['beta']:
            per_lambda_error.append(
                mse(y_eval, predict(x_eval, candidate, fit_grid)))
        cv_scores += np.array(per_lambda_error)

        # The penalty grid is taken from the first fold only.
        if fold_idx == 0:
            lam_path = path['lambda']

    cv_scores /= float(num_folds)
    chosen_lam = lam_path[np.argmin(cv_scores)]

    all_data, all_weights, all_grid = bucket_vals(X, y, q)
    final_beta = solve_gfl(all_data, None, weights=all_weights, lam=chosen_lam)
    return final_beta.reshape(q), all_grid
def imtv():
    """Command-line entry point: denoise an image with the graph-fused lasso.

    Reads the image named on the command line, subtracts its mean, builds
    the grid edges with ``hypercube_edges``, runs ``solve_gfl`` on the
    flattened pixels and writes the mean-restored result to the output file.

    Command-line arguments
    ----------------------
    imagefile : path of the image to denoise.
    output : path the denoised image is written to.
    --verbose : int print level (0=none, 1=moderate, 2=all; default 1).
    --time : when given, print timing statistics after solving.
    """
    parser = argparse.ArgumentParser(
        description='Runs the graph-fused lasso (GFL) solver on an image. '
                    'Note: this is currently pretty slow for even '
                    'medium-sized color images.')
    parser.add_argument('imagefile',
                        help='The file containing the image to denoise.')
    parser.add_argument('output', help='The file to output the results.')
    # The help text previously advertised "Default=0" although the actual
    # default is 1.
    parser.add_argument(
        '--verbose', type=int, default=1,
        help='The level of print statements. 0=none, 1=moderate, 2=all. '
             'Default=1.')
    parser.add_argument('--time', action='store_true',
                        help='Print the timing stats for the algorithm.')
    args = parser.parse_args()

    ########### Load data from file
    if args.verbose:
        print('Loading image')

    # NOTE(review): imread/imsave were deprecated and later removed from
    # scipy.misc; on a current SciPy this import fails. No replacement is
    # wired in here because it would need a new third-party dependency.
    from scipy.misc import imsave, imread
    from utils import hypercube_edges

    y = imread(args.imagefile).astype(float)
    y_mean = y.mean()
    y -= y_mean
    edges = hypercube_edges(y.shape)

    if args.verbose:
        print('Solving the GFL for {0} variables with {1} edges'.format(
            y.size, len(edges)))

    ########### Run the C solver
    # time.clock() was removed in Python 3.8. Prefer perf_counter and fall
    # back to clock only on interpreters that predate perf_counter.
    timer = getattr(time, 'perf_counter', None) or time.clock
    t0 = timer()
    beta = solve_gfl(y.flatten(), edges, verbose=args.verbose,
                     converge=1e-3, maxsteps=3000)
    t1 = timer()

    ########### Print the timing stats
    if args.time:
        # NOTE(review): `solver` is not defined inside this function; unless
        # it exists at module level this line raises NameError -- confirm.
        print('Solved the GFL in {0}s and {1} total steps of ADMM.'.format(
            t1 - t0, np.array(solver.steps).sum()))

    ########### Save the results to file
    if args.verbose:
        print('Saving results to {0}'.format(args.output))
    imsave(args.output, beta.reshape(y.shape) + y_mean)
def __init__(self, y):
    """Pre-compute the structures needed to solve fused-lasso problems on ``y``.

    Three things are stored on the instance:

    * ``self.invD``   -- the callable returned by
      ``scipy.sparse.linalg.factorized`` for the 1-d penalty matrix,
      extended by one extra row holding a small nugget.
    * ``self.solver`` -- a ``TrailSolver`` loaded with ``y`` and a trail
      decomposition of the grid graph over ``y.shape``.
    * ``self.beta``   -- the result of ``solve_gfl(y)``.
    """
    # Pre-cache a sparse LU decomposition of the FL matrix
    from pygfl.utils import get_1d_penalty_matrix
    from scipy.sparse.linalg import factorized
    from scipy.sparse import csc_matrix

    num_nodes = y.shape[0]
    penalty = get_1d_penalty_matrix(num_nodes)
    # Append one all-zero row, then put a tiny value in its last column.
    penalty = np.vstack([penalty, np.zeros(num_nodes)])
    penalty[-1, -1] = 1e-6  # Nugget for full rank matrix
    self.invD = factorized(csc_matrix(penalty))

    # Setup the fast GFL solver
    from pygfl.solver import TrailSolver
    from pygfl.trails import decompose_graph
    from pygfl.utils import hypercube_edges, chains_to_trails
    from networkx import Graph

    grid_graph = Graph()
    grid_graph.add_edges_from(hypercube_edges(y.shape))
    chain_list = decompose_graph(grid_graph, heuristic='greedy')
    # chains_to_trails also returns the edge list that goes with the trails.
    num_trails, trail_nodes, trail_breaks, trail_edges = chains_to_trails(chain_list)

    trail_solver = TrailSolver()
    trail_solver.set_data(y, trail_edges, num_trails, trail_nodes, trail_breaks)
    self.solver = trail_solver

    from pygfl.easy import solve_gfl
    self.beta = solve_gfl(y)
def gfusedlasso(z, A, lam=None):
    """Run ``solve_gfl`` on ``z`` using the edges encoded by matrix ``A``.

    An edge ``(i, j)`` is created for every strictly positive entry in the
    upper triangle of ``A`` (diagonal included, as ``np.triu`` keeps it).

    Parameters
    ----------
    z : numpy array of observations; converted to float64 for the solver.
    A : square numpy array whose positive upper-triangular entries mark edges.
    lam : forwarded unchanged to ``solve_gfl``.

    Returns
    -------
    The solver output cast back to ``z.dtype``.
    """
    upper_mask = np.triu(A) > 0
    # One row per edge: the index tuple of every True entry in the mask.
    edge_list = np.stack(np.nonzero(upper_mask), axis=-1)
    fused = solve_gfl(z.astype(np.float64), edge_list, lam=lam)
    return fused.astype(z.dtype)
def TVTR(X, Y, edges, stacked=0, lam=1, rho=1, tol=0.05, verbose=1):
    """Iteratively fit ``theta`` and return its least-squares coefficients on X.

    Each iteration updates ``theta`` in closed form, projects ``theta + U``
    onto the column space of ``X`` to get ``eta``, obtains ``mu`` by calling
    ``solve_gfl`` on the flattened ``theta + V`` with a single penalty value
    ``lam / rho``, and then updates the running residuals ``U`` and ``V``.
    (The update pattern resembles a scaled-form ADMM splitting -- confirm
    against the method's derivation.)

    Parameters
    ----------
    X : 2-d numpy array with ``n`` rows.
    Y : array-like convertible to a float ``(n, m)`` array.
    edges : integer numpy array of edges over the ``m`` columns of one row.
    stacked : when 0, ``edges`` is replicated for every row with node
        indices shifted by ``m * row``; otherwise ``edges`` is used as given.
    lam, rho : scalars; the solver penalty is ``lam / rho``.
    tol : both the relative change of ``theta`` and the relative
        infeasibility ``|theta - eta| / |theta|`` must fall below this.
    verbose : when falsy, nothing is printed.

    Returns
    -------
    ``inv(X^T X) X^T theta`` for the final ``theta``.
    """
    if verbose:
        print(X.shape)
        print(Y.shape)
        print(edges.shape)

    y = np.array(Y, dtype='float64')
    n = X.shape[0]
    m = Y.shape[1]

    Im = np.eye(n)
    # Invert X^T X once; it is needed for the projector and for the result.
    xtx_inv = LA.inv(X.T.dot(X))
    M = Im - X.dot(xtx_inv).dot(X.T)
    # Loop-invariant projector used for the eta update.
    projector = Im - M

    # Replicate the per-row edge list for every row in one vstack instead of
    # growing the array row by row (which re-copies it on every pass).
    if stacked == 0 and n > 1:
        edge_all = np.vstack([edges + m * k for k in range(n)])
    else:
        edge_all = edges
    edge_all = np.asarray(edge_all, dtype='int')

    # Multiplying by 1.0 forces true division even for two ints on Python 2.
    penalty = 1.0 * lam / rho

    # initialize parameters
    theta = np.random.rand(n, m)
    mu = np.random.rand(n, m)
    eta = np.random.rand(n, m)
    U = np.zeros((n, m))
    V = np.zeros((n, m))

    step = 0
    converged = False
    while not converged:
        theta_last = theta
        if verbose:
            print(step)
        theta = (y + rho * (mu - V + eta - U)) / (1 + 2 * rho)
        eta = projector.dot(theta + U)
        if verbose:
            print('entering interation ' + str(step) + ' solve_gfl')
        mu = solve_gfl((theta + V).reshape((n * m, ), order="C"),
                       edge_all,
                       minlam=penalty,
                       maxlam=penalty,
                       numlam=1)
        mu = mu.reshape(n, m)
        U = U + theta - eta
        V = V + theta - mu

        infeas = LA.norm(theta - eta) / LA.norm(theta)
        relerr = LA.norm(theta_last - theta) / LA.norm(theta_last)
        converged = infeas < tol and relerr < tol
        step += 1
        # NOTE(review): the original layout did not make clear whether this
        # summary was printed per iteration or once after the loop; it is
        # treated as per-iteration progress output here.
        if verbose:
            print('Iter: ' + str(step) + '\t rel_err ' + str(relerr) +
                  '\t Infeasibility ' + str(infeas))

    gamma = xtx_inv.dot(X.T).dot(theta)
    return gamma
def TVTRminibatch(X, Y, edges, num_epochs=1, mini_batch_size=16, lam=1, rho=1,
                  tol=0.05, verbose=1, seed=135):
    """Mini-batch variant of the iterative ``theta`` fit; returns coefficients on X.

    For every epoch a fresh set of mini-batches is drawn with
    ``random_mini_batches`` (seed incremented per epoch). Each mini-batch
    triggers one full update of ``theta`` and ``eta``, while ``mu`` is only
    refreshed on the rows of the current mini-batch via ``solve_gfl``.

    Parameters
    ----------
    X : 2-d numpy array with ``n`` rows.
    Y : array-like convertible to a float ``(n, m)`` array.
    edges : integer numpy array of edges over the ``m`` columns of one row.
    num_epochs : number of passes over the mini-batches.
    mini_batch_size, seed : forwarded to ``random_mini_batches``.
    lam, rho : scalars; the solver penalty is ``lam / rho``.
    tol : threshold for both relative change and relative infeasibility.
    verbose : when falsy, nothing is printed.

    Returns
    -------
    ``inv(X^T X) X^T theta`` for the final ``theta``.
    """
    if verbose:
        print(X.shape)
        print(Y.shape)
        print(edges.shape)

    y = np.array(Y, dtype='float64')
    m = y.shape[1]
    n = X.shape[0]

    theta = np.random.rand(n, m)
    mu = np.random.rand(n, m)
    eta = np.random.rand(n, m)
    U = np.zeros((n, m))
    V = np.zeros((n, m))

    Im = np.eye(n)
    # Invert X^T X once; it is needed for the projector and for the result.
    xtx_inv = LA.inv(X.T.dot(X))
    M = Im - X.dot(xtx_inv).dot(X.T)
    # Loop-invariant projector used for the eta update.
    projector = Im - M

    # Multiplying by 1.0 forces true division even for two ints on Python 2.
    penalty = 1.0 * lam / rho

    converged = False
    for epoch in range(num_epochs):
        seed = seed + 1
        step = 1
        # presumably each mini-batch is a collection of row indices (it is
        # used with len() and as an index into mu) -- verify against
        # random_mini_batches.
        minibatches = random_mini_batches(X, mini_batch_size, seed)
        for minibatch in minibatches:
            theta_last = theta
            theta = (y + rho * (mu - V + eta - U)) / (1 + 2 * rho)
            eta = projector.dot(theta + U)
            if verbose:
                print('entering epoch ' + str(epoch) + ' , iteration' +
                      str(step) + ' solve_gfl')

            n_minibatch = len(minibatch)
            # Replicate the per-row edges for each row of the mini-batch in
            # a single vstack.
            if n_minibatch > 1:
                edge_all = np.vstack(
                    [edges + m * k for k in range(n_minibatch)])
            else:
                edge_all = edges

            minibatch_mu = solve_gfl(
                (theta + V)[minibatch].reshape((n_minibatch * m,), order="C"),
                edge_all,
                minlam=penalty,
                maxlam=penalty,
                numlam=1)
            mu[minibatch] = minibatch_mu.reshape(n_minibatch, m)

            U = U + theta - eta
            V = V + theta - mu

            infeas = LA.norm(theta - eta) / LA.norm(theta)
            relerr = LA.norm(theta_last - theta) / LA.norm(theta_last)
            converged = infeas < tol and relerr < tol
            step += 1
            # Written as a function call with one argument so that it parses
            # on Python 3 and still prints the same text on Python 2.
            if verbose:
                print('epoch ' + str(epoch) + 'Iter: ' + str(step) +
                      '\t rel_err ' + str(relerr) +
                      '\t Infeasibility ' + str(infeas))
            # NOTE(review): the original layout did not make clear which
            # loop this break belongs to. Here it ends the current epoch's
            # mini-batch loop only; later epochs still run -- confirm.
            if converged:
                break

    gamma = xtx_inv.dot(X.T).dot(theta)
    return gamma
def denoisemini(y, edge, lam, rho):
    """Run ``solve_gfl`` on a 2-d block at the single penalty ``lam / rho``.

    ``y`` is flattened row by row (C order), passed to the solver together
    with ``edge``, and the result is reshaped back to ``y``'s shape.

    Parameters
    ----------
    y : 2-d numpy array.
    edge : edge list forwarded to ``solve_gfl``.
    lam, rho : scalars whose ratio is used as both ``minlam`` and ``maxlam``.

    Returns
    -------
    Array with the same ``(rows, cols)`` shape as ``y``.
    """
    n_rows, n_cols = y.shape
    flat_signal = y.reshape((n_rows * n_cols,), order="C")
    penalty = lam / rho
    fused = solve_gfl(flat_signal, edge, minlam=penalty, maxlam=penalty,
                      numlam=1)
    return fused.reshape((n_rows, n_cols))
def main():
    """Command-line entry point: run the GFL solver on CSV data and edges.

    Loads the data vector and the integer edge list from comma-separated
    files, calls ``solve_gfl`` and either saves the result as CSV or prints
    it.

    Command-line arguments
    ----------------------
    data : CSV file with the vector of data points.
    edges : CSV file with one edge per line.
    --output / --o : optional file for the results; printed when omitted.
    --verbose : int print level (0=none, 1=moderate, 2=all; default 1).
    --time : when given, print timing statistics after solving.
    """
    parser = argparse.ArgumentParser(
        description='Runs the graph-fused lasso (GFL) solver on a given '
                    'dataset with a given edge set. The GFL problem is '
                    'defined as finding Beta such that it minimizes the '
                    'equation f(Y, Beta) + lambda * g(E, Beta) where f is a '
                    'smooth, convex loss function, typically 1/2 sum_i '
                    '(y_i - Beta_i)^2, and g is sum of first differences on '
                    'edges, sum_(s,t) |Beta_s - Beta_t| for each edge (s,t) '
                    'in E.')
    parser.add_argument(
        'data', help='The CSV file containing the vector of data points.')
    parser.add_argument(
        'edges',
        help='The CSV file containing the edges connecting the variables, '
             'with one edge per line.')
    parser.add_argument('--output', '--o',
                        help='The file to output the results.')
    # The help text previously advertised "Default=0" although the actual
    # default is 1.
    parser.add_argument(
        '--verbose', type=int, default=1,
        help='The level of print statements. 0=none, 1=moderate, 2=all. '
             'Default=1.')
    parser.add_argument('--time', action='store_true',
                        help='Print the timing stats for the algorithm.')
    args = parser.parse_args()

    ########### Load data from file
    if args.verbose:
        print('Loading data')
    y = np.loadtxt(args.data, delimiter=',')
    edges = np.loadtxt(args.edges, delimiter=',', dtype=int)

    if args.verbose:
        print('Solving the GFL for {0} variables with {1} edges'.format(
            len(y), len(edges)))

    ########### Run the C solver
    # time.clock() was removed in Python 3.8. Prefer perf_counter and fall
    # back to clock only on interpreters that predate perf_counter.
    timer = getattr(time, 'perf_counter', None) or time.clock
    t0 = timer()
    beta = solve_gfl(y, edges, verbose=args.verbose)
    t1 = timer()

    ########### Print the timing stats
    if args.time:
        # NOTE(review): `solver` is not defined inside this function; unless
        # it exists at module level this line raises NameError -- confirm.
        print('Solved the GFL in {0}s and {1} total steps of ADMM.'.format(
            t1 - t0, np.array(solver.steps).sum()))

    ########### Save the results to file
    if args.output:
        if args.verbose:
            print('Saving results to {0}'.format(args.output))
        np.savetxt(args.output, beta, delimiter=',')
    else:
        print('Results:')
        print(beta)
def numpy_gfusedlasso(z, edge, lam=None):
    """Run ``solve_gfl`` on numpy inputs and return the result in ``z``'s dtype.

    Parameters
    ----------
    z : numpy array of observations; converted to float64 for the solver.
    edge : numpy array of edges; converted to int for the solver.
    lam : penalty value, or ``None`` (the default). A non-``None`` value is
        converted with ``float``; ``None`` is passed through unchanged
        instead of being fed to ``float``, which raised ``TypeError`` for
        the default argument.

    Returns
    -------
    The solver output cast back to ``z.dtype``.
    """
    lam_value = None if lam is None else float(lam)
    z_fused = solve_gfl(z.astype(np.float64), edge.astype('int'),
                        lam=lam_value)
    return z_fused.astype(z.dtype)
import matplotlib.pylab as plt
import numpy as np
from pygfl.easy import solve_gfl

# Demo: fit solve_gfl with the binomial loss to simulated count data.

# Piecewise-constant success probabilities over 200 positions.
truth = np.repeat([0.5, 0.75, 0.25, 0.1, 0.9], [50, 50, 50, 30, 20])

# Number of trials per position, then the number of successes among them.
trials = np.random.poisson(10, size=200)
success_counts = []
for num_trials, prob in zip(trials, truth):
    draws = np.random.random(size=num_trials)
    success_counts.append((draws <= prob).sum())
successes = np.array(success_counts)

beta = solve_gfl((trials, successes), loss='binomial')

# Positions are numbered from 1 on the x axis.
positions = np.arange(200) + 1
empirical_rate = successes / trials.astype(float)
# The solver output is passed through the logistic function before plotting.
fitted_prob = 1. / (1 + np.exp(-beta))

plt.scatter(positions, empirical_rate)
plt.plot(positions, truth, color='gray', alpha=0.5)
plt.plot(positions, fitted_prob, color='orange')
plt.show()
def ss_gfl(y, min_spike=1e-4, max_spike=1e2, nspikes=30, max_steps=100,
           rel_tol=1e-6, a=None, b=None, **kwargs):
    """Fit a spike-and-slab fused lasso along a path of spike penalties.

    For each value on a log-spaced grid between ``min_spike`` and
    ``max_spike`` an EM loop alternates between (E) computing, for every
    pair of neighbouring entries, the probability ``gamma`` that their
    difference comes from the spike rather than the slab, and (M) refitting
    ``beta`` with per-edge penalties ``gamma*spike + (1-gamma)*slab`` and
    updating the mixing weight ``theta``. ``beta`` and ``theta`` are
    warm-started from the previous grid value.

    Parameters
    ----------
    y : 1-d numpy array of observations.
    min_spike, max_spike : endpoints of the spike grid; ``min_spike`` is
        also used as the slab penalty.
    nspikes : number of grid values.
    max_steps : maximum EM iterations per grid value.
    rel_tol : EM stops when the L2 norm of the change in ``beta`` is at
        most this value (the comparison is absolute, not relative).
    a, b : prior pseudo-counts used in the ``theta`` update. When ``None``
        the values returned by ``estimate_hyperparams`` are used; values
        supplied by the caller are now honoured instead of being
        overwritten.
    **kwargs : forwarded to ``solve_weighted_gfl``.

    Returns
    -------
    dict
        ``'betas'`` -- array ``(nspikes, len(y))`` with one solution per
        grid value; ``'bic'`` -- array ``(nspikes,)`` with the matching BIC
        values. Every grid value is included.
    """
    sigma, a_est, b_est = estimate_hyperparams(y)
    if a is None:
        a = a_est
    if b is None:
        b = b_est
    print('sigma: {} a: {} b: {} expected proportion of nulls: {:.2f}'.format(
        sigma, a, b, a / (a+b)))

    # Create the log-space grid of spikes
    spike_grid = np.exp(np.linspace(np.log(min_spike), np.log(max_spike),
                                    nspikes))

    # Initialize beta at the plain GFL solution
    from pygfl.easy import solve_gfl
    beta = solve_gfl(y)

    # Use an equal weighted mixture to start, with two identical slabs
    theta = 0.5
    slab = min_spike

    # Track convergence
    prev = beta.copy()

    # Track the BIC path
    bic = np.zeros(nspikes)

    # Create a fast solver for the 1d fused lasso
    # NOTE(review): this object is never used below; it is still constructed
    # in case the constructor is relied on elsewhere -- confirm and remove.
    fl_solver = FastWeightedFusedLassoSolver(y)

    # Run over the entire solution path of spikes, starting from very flat
    # spikes and going to very sharp spikes
    betas = np.zeros((nspikes, y.shape[0]))
    for spike_idx, spike in enumerate(spike_grid):
        print('Spike {}/{}: {:.4f}'.format(spike_idx+1, nspikes, spike))

        # Run the EM algorithm for the fixed spike, using the warm-started
        # beta and theta values
        for step in range(max_steps):
            print('\tStep {}'.format(step))

            # Get the beta differences
            diffs = np.abs(beta[1:] - beta[:-1])

            # E-step: Expected local mixture probabilities given theta and
            # beta
            spike_prob = theta * np.exp(-diffs * spike) * spike
            slab_prob = (1-theta) * np.exp(-diffs * slab) * slab
            gamma = spike_prob / (spike_prob + slab_prob)

            # We have a 2-part M-step.
            # (i) M-step for beta: run the fused lasso with edge weights:
            #     lam_ij = gamma_ij*spike + (1-gamma_ij)*slab
            lams = gamma*spike + (1-gamma)*slab
            beta = solve_weighted_gfl(y, lams * sigma, beta_init=beta,
                                      **kwargs)

            # (ii) M-step for theta: MLE prior mixture probabilities
            theta = (a + gamma.sum()) / (a + b + beta.shape[0] - 3)
            # Don't let theta get too big. Equivalent to choosing a and b
            # proportional to the number of nodes
            theta = theta.clip(1e-3, 1-1e-3)
            print('\ttheta: {:.3f}'.format(theta))

            # Check for convergence
            delta = np.linalg.norm(prev - beta)
            if delta <= rel_tol:
                print()
                break
            print('\tDelta={:.6f}'.format(delta))
            print()
            prev = beta.copy()

        # Calculate BIC = -2ln(L) + dof * (ln(n) - ln(2pi)).
        # The negative log-likelihood is +0.5/sigma * SSE; the earlier code
        # carried a minus sign here, which made the BIC use +2ln(L) and so
        # reward a worse fit.
        # NOTE(review): assumes `sigma` is on the variance scale -- confirm
        # against estimate_hyperparams.
        nll = 0.5 / sigma * ((y - beta)**2).sum()
        dof = (np.abs(beta[1:] - beta[:-1]) >= 1e-4).sum() + 1
        bic[spike_idx] = 2*nll + dof * (np.log(beta.shape[0]) -
                                       np.log(2 * np.pi))
        print('NLL: {:.4f} dof: {} BIC: {:.2f}'.format(nll, dof,
                                                       bic[spike_idx]))

        # Save the entire path of solutions
        betas[spike_idx] = beta

    # The loop never exits early, so every row is populated. Slicing with
    # [:spike_idx] dropped the final solution and failed for nspikes == 0.
    return {'betas': betas, 'bic': bic}
# Save the entire path of solutions betas[spike_idx] = beta return {'betas': betas[:spike_idx], 'bic': bic[:spike_idx]} if __name__ == '__main__': from pygfl.easy import solve_gfl import matplotlib.pyplot as plt np.random.seed(5) truth = np.array([0]*20 + [4]*30 + [-1]*40 + [-5]*20 + [-1]*30 + [1.8]*10 + [-0.8]*30 + [3]*50) X = np.arange(1,1+len(truth)) Y = np.random.normal(truth) # Fit the ordinary GFL beta_gfl = solve_gfl(Y) # Fit the spike-and-slab GFL results = ss_gfl(Y) # Use the last solution beta_ssl = results['betas'][-1] # Use the BIC solution # beta_ssl_bic = results['betas'][np.argmin(results['bic'])] plt.scatter(X, Y, color='gray', alpha=0.2, label='Observations') plt.plot(X, truth, color='black', label='Truth') plt.plot(X, beta_gfl, color='blue', label='FL') plt.plot(X, beta_ssl, color='orange', label='SSFL') # plt.plot(X, beta_ssl_bic, color='green', label='SSFL (BIC)')
import matplotlib.pylab as plt
import numpy as np
from pygfl.easy import solve_gfl

# Demo: fit solve_gfl with the logistic loss to simulated binary data.

# Piecewise-constant success probabilities over 200 positions.
truth = np.repeat([0.5, 0.75, 0.25, 0.1, 0.9], [50, 50, 50, 30, 20])

# One 0/1 outcome per position: 1 when a uniform draw falls at or below the
# true probability.
uniform_draws = np.random.random(size=200)
data = (uniform_draws <= truth).astype(int)

beta = solve_gfl(data, loss='logistic')

# Positions are numbered from 1 on the x axis.
positions = np.arange(200) + 1
# The solver output is passed through the logistic function before plotting.
fitted_prob = 1. / (1 + np.exp(-beta))

plt.scatter(positions, data)
plt.plot(positions, truth, color='gray', alpha=0.5)
plt.plot(positions, fitted_prob, color='orange')
plt.show()