def test(mat_name, l, check_c = True): """ @param mat_name: name of matrix file @param l: number of columns in sketch """ if ".txt" not in mat_name: mat_name += ".txt" mat_pname = os.path.join(MATRIX_DIR, mat_name) mat = load_matrix(mat_pname) print "Original shape: %r, l: %d" %(mat.shape, l) f_norm = squaredFrobeniusNorm(mat) start = time.time() p_sketch = sketch(mat, l) p_time = time.time() - start print "Sketch shape: ", p_sketch.shape p_err = calculateError(mat, p_sketch) # calculate bound on error err_bound = 2 * squaredFrobeniusNorm(mat) / l if not check_c: return err_bound, f_norm, p_err, None # run sketch.c on the matrix sketch_pname = os.path.join(MATRIX_DIR, "sketch_" + mat_name) subprocess.call(["make", "clean"]) subprocess.call(["make", "sketch"]) # need to check output c_start = time.time() err = subprocess.check_output([RUN_SKETCH, '-f', mat_pname, '-w', sketch_pname, '-l', str(l)]) c_time = time.time() - c_start print "Sketch output: ", err c_sketch = load_matrix(sketch_pname) assert (c_sketch.shape == p_sketch.shape) c_err = calculateError(mat, c_sketch) return err_bound, f_norm, p_err, p_time, c_err, c_time
def kmeans_experiment(on_sketch=True, on_orig=True): #path = "../../data/GoogleNews-vectors-negative300.bin" #wmodel = models.Word2Vec.load_word2vec_format(path, binary=True) mat = load_matrix('data_batch_1') # sketch the transpose mat = mat.T #sketch_sizes = [10, 50, 100, 200] sketch_sizes = [10, 15, 20, 30, 100] sketches = [] for l in sketch_sizes: fname = 'test_matrices/sketches/sk_%d'%l if os.path.exists(fname): sketches.append(load_matrix(fname)) else: sketch_o = BatchPFDSketch(mat, l, l, 0.2, randomized=True) sketch_o.compute_sketch() write_matrix(sketch_o.sketch.T, fname) sketches.append(sketch_o.sketch.T) mat = mat.T #print sketche.shape print "Mat: ", mat.shape #sketch = load_matrix("sketches/w2vec_250.txt") clusters = [2, 4, 8, 16, 32] #clusters = [10, 20, 30, 40] num_processes = 8 results = {'opt': {}, 'sketch': {}} for l in sketch_sizes: results['sketch'][l] = {} for k in clusters: print "Testing ", k if on_sketch: for sketch, l in zip(sketches, sketch_sizes): start_time = time.time() cost, cluster_centers, labels = train_kmeans(sketch, k, num_processes=num_processes) train_time = time.time() - start_time test_cost = compute_cost_labels(mat, labels, k) results['sketch'][l][k] = {'time': train_time, 'cost': test_cost} init_centers = compute_centers(mat, labels, k) if on_orig: start_time = time.time() cost, cluster_centers, labels = train_kmeans(mat, k, init_centers=init_centers, num_processes=num_processes) train_time = time.time() - start_time cost = compute_cost_labels(mat, labels, k) results['opt'][k] = {'time': train_time, 'cost': cost} if on_orig: with open('experiments/kmeans/cifar/mat_results.p', "wb") as f: pickle.dump(results, f) if on_sketch: with open('experiments/kmeans/cifar/sketch_results.p', "wb") as f: pickle.dump(results, f)
def plot_errors(orig_mat_fname, p_fnames, c_fnames = None): # load the original matrix if ".txt" not in orig_mat_fname: orig_mat_fname += ".txt" mat_pname = os.path.join(MATRIX_DIR, orig_mat_fname) mat = load_matrix(mat_pname) rows, cols = mat.shape ls = [] errs = [] bounds = [] regex = re.compile('\d+') if c_fnames: for p_fname, c_fname in zip(p_fnames, c_fnames): # extract the sketch size # calculate the error from each # assert t p_l = int(regex.search(p_fname).group(0)) c_l = int(regex.search(c_fname).group(0)) assert(p_l == c_l) p_sketch = load_matrix(p_fname) c_sketch = load_matrix(c_fname) assert(p_sketch.shape == c_sketch.shape) bound = fd_bound(mat, p_l) p_err = calculateError(mat, p_sketch) c_err = calculateError(mat, c_sketch) print p_err, c_err # the optimizations make floating point arithmetic not exact assert(np.isclose(p_err/c_err, 1.0, atol=1e-1)) ls.append(p_l) errs.append(p_err) bounds.append(bound) else: for p_fname in p_fnames: p_l = int(regex.search(p_fname).group(0)) p_sketch = load_matrix(p_fname) bound = fd_bound(mat, p_l) p_err = calculateError(mat, p_sketch) ls.append(p_l) errs.append(p_err) bounds.append(bound) # lets plot this shit plt.plot(ls, errs, '-o', color='b', label='cov err') plt.plot(ls, bounds, '-o', color='r', label='upper bound') plt.xlabel("Sketch size (l)") plt.ylabel("Error") title = "Sketch size vs Reconstruction Error: %d X %d" %(mat.shape[0], mat.shape[1]) plt.title(title) plt.grid() plt.legend(loc=3) plt.yscale('log') #plt.xlim(25, 375) plt.show() return ls, errs, bounds
def check_sketch_file(orig_mat_fname, sketch_fname, l):
    """
    Tell whether a sketch file on disk is usable for the given matrix.

    @param orig_mat_fname: path of the original matrix (must exist)
    @param sketch_fname: path of the candidate sketch file
    @param l: sketch size handed to fd_bound
    @return: True only if the sketch file exists, has as many columns as the
             matrix, and its error is strictly below fd_bound(mat, l)
    """
    assert os.path.exists(orig_mat_fname)
    mat = load_matrix(orig_mat_fname)

    if not os.path.exists(sketch_fname):
        return False

    candidate = load_matrix(sketch_fname)
    same_width = mat.shape[1] == candidate.shape[1]
    if not same_width:
        return False

    bound = fd_bound(mat, l)
    err = calculateError(mat, candidate)
    return bool(err < bound)
def test_svd(mat_name): if ".txt" not in mat_name: mat_name += ".txt" mat_pname = os.path.join(MATRIX_DIR, mat_name) mat = load_matrix(mat_pname) U, w, Vt = np.linalg.svd(mat, full_matrices=False) V = Vt.T print "V shape: ", V.shape print np.around(V, 2) print "Singular values: ", w
def dynamic_experiment(mat_fname=MATRIX, l1=320, l2=350, batch_size=400, plot=True):
    """
    Run a DynamicSketchExperiment on a matrix and write its results.

    @param mat_fname: matrix file to load; its name without extension is used
                      in the experiment name
    @param l1: forwarded to DynamicSketchExperiment
    @param l2: forwarded to DynamicSketchExperiment
    @param batch_size: forwarded to DynamicSketchExperiment
    @param plot: when True, plot the results after writing them
    """
    print("Dynamic Experiment")
    mat = load_matrix(mat_fname)
    n_rows = mat.shape[0]
    # changepoints: start at row 1 and step by a tenth of the rows (at least 1)
    step = max(n_rows // 10, 1)
    change_points = np.arange(1, n_rows, step)
    base_name = os.path.splitext(mat_fname)[0]
    experiment = DynamicSketchExperiment("dynamic_exp_" + base_name, mat_fname,
                                         l1, l2, batch_size, change_points,
                                         randomized=False)
    experiment.run_experiment()
    experiment.write_results()
    if plot:
        experiment.plot_results()
def test_dyn_exp():
    """
    Run a DynamicSketchExperiment on "med_svd_mat.txt" with fixed settings
    (l1=20, l2=30, batch size 20), then plot and write the results.
    """
    mat_fname = "med_svd_mat.txt"
    n_rows = load_matrix(mat_fname).shape[0]
    # change points start at row 0 and step by a tenth of the rows (at least 1)
    change_points = np.arange(0, n_rows, max(n_rows // 10, 1))
    experiment = DynamicSketchExperiment("test_dynamic_exp", mat_fname,
                                         20, 30, 20, change_points,
                                         randomized=False)
    experiment.run_experiment()
    experiment.plot_results()
    experiment.write_results()
def __init__(self, exp_name, mat_fname, dependent_vars, dependent_var_name, sparse=False):
    """
    Load the experiment matrix and make sure the experiment directory exists.

    @param exp_name: experiment name; first component of the output directory
                     under EXP_DIR
    @param mat_fname: matrix file name; its name without extension is the
                      second component of the output directory
    @param dependent_vars: not used in this part of the constructor
    @param dependent_var_name: not used in this part of the constructor
    @param sparse: when True, read the matrix from MATRIX_DIR with mmread
                   (Matrix Market format assumed); otherwise use load_matrix
    @raise OSError: if the directory cannot be created for any reason other
                    than it already existing
    """
    # local import keeps this block self-contained
    import errno

    self.exp_name = exp_name
    self.mat_fname = mat_fname
    if sparse:
        # assume Matrix Market format
        self.mat = mmread(os.path.join(MATRIX_DIR, mat_fname))
    else:
        self.mat = load_matrix(self.mat_fname)
    self.exp_dir = os.path.join(EXP_DIR, exp_name,
                                os.path.splitext(mat_fname)[0])
    # make a directory for the experiment if it doesn't exist yet
    try:
        os.makedirs(self.exp_dir)
    except OSError as e:
        # an already existing directory is fine; anything else is an error
        if e.errno != errno.EEXIST:
            raise
def test_rand_exp(): mat_fname = "small_data_batch_1" l = 200 sketch_sizes = [100, 200, 300, 400, 500] batch_sizes =[5, 10, 20] alpha = 0.2 exp_name = 'test_rand_bpfd_experiment' mat = load_matrix(mat_fname) print mat.shape results = {} for l in sketch_sizes: sk_o = BatchPFDSketch(mat, l, 5000, alpha, randomized=True) sk_o.compute_sketch() results[l] = sk_o.sketching_time with open("experiments/rand_scale/results2.p", "wb") as f: pickle.dump(results, f) print "Done"
def sketch_cifar():
    """
    Sketch "large_cifar_data" with parallel_bpfd_sketch using fixed settings
    and append one line with the settings and the elapsed time to
    "experiments/parallel_results.txt". The sketch itself is not saved.
    """
    print("Loading model")
    mat = load_matrix("large_cifar_data")

    # fixed run configuration
    l, alpha, batch_size = 250, 0.2, 5000
    randomized = True
    num_processes = 8

    began = time.time()
    result = parallel_bpfd_sketch(mat, l, alpha, batch_size,
                                  randomized=randomized,
                                  num_processes=num_processes)
    sketching_time = time.time() - began

    with open("experiments/parallel_results.txt", "a") as log:
        fields = ("large_cifar", randomized, l, batch_size, alpha,
                  num_processes, sketching_time)
        log.write("""Mat: %s, Rand: %r, l: %d, b: %d, alpha: %f, Processes: %d, Time: %f\n""" % fields)
def run_code(): mat = load_matrix(mat_fname) print "Mat Shape: ", mat.shape l = 100 alpha = 0.2 batch_size = 100 randomized=False num_processes=1 print "Starting" start_time = time.time() sketch = parallel_bpfd_sketch(mat, l, alpha, batch_size, randomized=randomized, num_processes=num_processes) sketching_time = time.time() - start_time with open("experiments/parallel_results.txt", "a") as f: f.write("""Mat: %s, Rand: %r, l: %d, b: %d, alpha: %f, Processes: %d, Time: %f\n""" %(mat_fname, randomized, l, batch_size, alpha, num_processes, sketching_time)) print calculateError(mat, sketch)
def dynamic_experiment(mat_fname=MATRIX, l1=320, l2=350, batch_size=400, plot=True):
    """
    Build, run and save a DynamicSketchExperiment for one matrix file.

    @param mat_fname: matrix file to load; the experiment is named
                      "dynamic_exp_" plus this name without its extension
    @param l1: passed through to DynamicSketchExperiment
    @param l2: passed through to DynamicSketchExperiment
    @param batch_size: passed through to DynamicSketchExperiment
    @param plot: plot the results after they are written when True
    """
    print("Dynamic Experiment")
    mat = load_matrix(mat_fname)
    total_rows = mat.shape[0]
    # changepoints: every tenth of the rows (step of at least 1), from row 1
    ts = np.arange(1, total_rows, max(total_rows // 10, 1))
    stem, _ = os.path.splitext(mat_fname)
    exp_name = "dynamic_exp_" + stem
    runner = DynamicSketchExperiment(exp_name, mat_fname, l1, l2, batch_size,
                                     ts, randomized=False)
    runner.run_experiment()
    runner.write_results()
    if not plot:
        return
    runner.plot_results()
def run_code(): mat = load_matrix(mat_fname) print "Mat Shape: ", mat.shape l = 100 alpha = 0.2 batch_size = 100 randomized = False num_processes = 1 print "Starting" start_time = time.time() sketch = parallel_bpfd_sketch(mat, l, alpha, batch_size, randomized=randomized, num_processes=num_processes) sketching_time = time.time() - start_time with open("experiments/parallel_results.txt", "a") as f: f.write("""Mat: %s, Rand: %r, l: %d, b: %d, alpha: %f, Processes: %d, Time: %f\n""" % (mat_fname, randomized, l, batch_size, alpha, num_processes, sketching_time)) print calculateError(mat, sketch)
def load_sketch(self, sketch_fname):
    """
    Load a sketch from disk instead of computing it.

    The previous version had no `self` parameter and read the undefined name
    `sketch_matrix`, so every call raised NameError.

    @param sketch_fname: file to read the sketch from (via load_matrix)
    """
    self.sketch = load_matrix(sketch_fname)
    # nothing was computed, so no sketching time is recorded
    self.sketching_time = 0
def run_fd_sketch(mat_pname, write_pname, l):
    """
    Read a matrix from disk, sketch it, and write the sketch back to disk,
    mirroring what the C code does (file reading and writing included).

    @param mat_pname: path of the input matrix file
    @param write_pname: path the sketch is written to
    @param l: sketch size passed to sketch()
    @return: the computed sketch
    """
    result = sketch(load_matrix(mat_pname), l)
    write_matrix(result, write_pname)
    return result
def construct_sketches(orig_mat_fname, ls, check_c=True, force_comp=False): """ generates sketch files using input l's @param orig_mat_fname: name of input matrix file inside MATRIX_DIR @param ls: List of sketch sizes to construct @param check_c: flag whether to construct sketches using custom C implementation @param force_comp: disregard cached sketch, recompute (required to determine runtime) """ # TOOD: check if we've already constructed it. # we can check for the right filename and then some # maybe don'r run again? p_fnames = [] p_times = [] if check_c: c_fnames = [] c_times = [] if ".txt" not in orig_mat_fname: orig_mat_fname += ".txt" mat_pname = os.path.join(MATRIX_DIR, orig_mat_fname) mat = load_matrix(mat_pname) rows, cols = mat.shape # create C binary if check_c: subprocess.call(["make", "clean"]) subprocess.call(["make", "sketch"]) for l in ls: # generate p_sketch assert(l <= cols) p_sketch_pname = os.path.join(MATRIX_DIR, "p_sketch_%d_%s" %(l, orig_mat_fname)) if not(check_sketch_file(mat_pname, p_sketch_pname, l)) or force_comp: start = time.time() run_fd_sketch(mat_pname, p_sketch_pname, l) p_time = time.time() - start else: p_time = 0.0 p_fnames.append(p_sketch_pname) p_times.append(p_time) if check_c: c_sketch_pname = os.path.join(MATRIX_DIR, "c_sketch_%d_%s" %(l, orig_mat_fname)) # if not(check_sketch_file(mat_pname, c_sketch_pname, l)) or force_comp: start = time.time() # not a fair comparison because of shit c_output = subprocess.check_output([RUN_SKETCH, '-f', mat_pname, '-w', c_sketch_pname, '-l', str(l)]) c_time = time.time() - start # so we should get the time from c_output, hopefully just one float shows up #num_matches = re.findall("\d+\.\d+", c_output) #c_time = float(num_matches[0]) print"C output on: ", l, c_output print "Python time: ", c_time else: c_time = 0.0 c_fnames.append(c_sketch_pname) c_times.append(c_time) if check_c: return p_fnames, p_times, c_fnames, c_times else: return p_fnames, p_times
def test_largest_product_in_grid():
    '''Check largest_product_in_grid(grid, 4) on the data file against the known answer.'''
    expected = 70600674
    actual = largest_product_in_grid(load_matrix(data_file_path), 4)
    assert expected == actual
def main():
    '''Load the grid from the data file and print largest_product_in_grid(grid, 4).'''
    answer = largest_product_in_grid(load_matrix(data_file_path), 4)
    print(answer)
def main():
    '''Load the triangle from data/018.txt (resolved via build_path) and print max_path_sum of it.'''
    data_path = build_path(__file__, 'data/018.txt')
    rows = load_matrix(data_path)
    print(max_path_sum(rows))