def test_heuristics(dataset_name, min_k, max_k, k_step, nSamples, results_file, tau_scale, cores): n = LinkServerCP(dataset_name).getNumNodes() nBFS_theoretic = int(n *log(n, 2)) f = open(results_file,'w') f.write('# Dataset: %s\n'%dataset_name) f.write('\t'.join('INFEST(%d,%d)'%(init_samples, iter_samples) for init_samples, iter_samples in \ infest_heuristics) + '\n') f.write('\t'.join('Vanilla(%d)'%(vanilla_samples) for vanilla_samples in vanilla_heuristics) + '\n') f.close() for k_frac in np.arange(min_k, max_k, k_step): print "k_frac = ", k_frac k = int(n * k_frac) seeds_fname = "%s-seeds-%d.cp"%(dataset_name, k) for i in xrange(nSamples): print "sample #",i print "Testing heuristics for dataset: ", dataset_name generateSeedFiles(k, k+1, 1, range(n), 1, dataset_name + "-seeds-") seeds=cp.load(open(seeds_fname,'r')) true_value, num_cycles_full = 0, 0 print "taking %d samples to evaluate the true value of spread"%nBFS_samples true_value, num_cycles_full = runVanilla(dataset = dataset_name, seeds = seeds_fname, nCores = cores, min_samples = min_nSamples, min_relative_std_error = min_relative_error) print "true value is: ", true_value num_cycles_full = int(1. 
* nBFS_theoretic / nBFS_samples * num_cycles_full) infest_results = [] print "Running INFEST based heuristics" for init_samples, iter_samples in infest_heuristics: print "INFEST(%d,%d)"%(init_samples, iter_samples) approx_estimate, num_cycles_approx = runApproxHeuristic(dataset_name, seeds_fname, tau_scale, cores,\ init_samples, iter_samples) infest_results.append((approx_estimate, num_cycles_approx)) vanilla_results = [] print "Running Vanilla based heuristics" for samples in vanilla_heuristics: seq_estimate, num_cycles_seq = runVanilla(dataset_name, seeds_fname, samples, cores) vanilla_results.append((seq_estimate, num_cycles_seq)) removeFile(seeds_fname) f = open(results_file, 'a') f.write('%.6f\t%.6f\t%d\t'%(k_frac, true_value, num_cycles_full) + '\t'.join('%d\t%d'%results for results in infest_results) + '\t' + \ '\t'.join('%d\t%d'%results for results in vanilla_results) + '\n') f.close()
#n = len(V) print "Number of nodes: ", n nBFS_samples = 1000 running_times_file = results_dir + output + '-running_times-ratios_k_min-%.3f-k_max-%.3f-samples-%d-bfs_samples-%d-large'%(min_frac,max_frac,nSamples,nBFS_samples) running_times_file_raw = running_times_file + "-raw" removeFile(running_times_file) f = open(running_times_file, 'w') f_raw = open(running_times_file_raw,'w') nBFS_samples = 1000 nBFS_samples_theoretic = n * log(n,2) for nSeeds in xrange(int(min_frac * n), int(max_frac * n), int(interval * n)): print "k = ", nSeeds seeds_fname = output + "-seeds-" + str(nSeeds) + '.cp' runtimes_approx, runtimes_seq = [], [] for i in xrange(nSamples): generateSeedFiles(nSeeds, nSeeds+1, int(interval * n), range(n), 1, output + "-seeds-") perf_csv_fname = dataset + 'runtimes_large.csv' subprocess.Popen("perf stat -x, -o %s python ic_bfs_eval.py -dataset %s -res_fname %s -seeds %s -output_mode 3 -cores %d"%\ (perf_csv_fname, dataset, output + "-approx-" + str(nSeeds), seeds_fname, parameters.cores), \ shell = True, stdout = subprocess.PIPE).stdout.read() num_cycles_approx = getNumCycles(perf_csv_fname) runtimes_approx.append(num_cycles_approx) print "Done approximating, now running naive sequential algorithm" removeFile(perf_csv_fname) subprocess.Popen("perf stat -x, -o %s python seq_estimation.py -dataset %s -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -cores %d"%(perf_csv_fname, dataset, seeds_fname, output + "-seq-" + str(nSeeds), nBFS_samples, parameters.cores), shell=True,stdout=subprocess.PIPE).stdout.read() runtime_seq_samples = getNumCycles(perf_csv_fname) theoretic_num_cycles = 1.0 * runtime_seq_samples / nBFS_samples * nBFS_samples_theoretic runtimes_seq.append(theoretic_num_cycles) f_raw.write('%.3f\t%.3f\t%.3f\n'%(1.*nSeeds/n, num_cycles_approx, theoretic_num_cycles)) removeFile('runtimes_large_seq.csv') print "runtimes_approx: ", runtimes_seq
# Track how the spread estimate evolves as the number of sequential samples
# grows: run seq_estimation.py repeatedly, accumulate the per-sample reached
# counts, and log the running standard deviation after each batch.
n = L.getNumNodes()
# k_mode 0: k is a fraction of n; otherwise k is an absolute seed count.
if k_mode == 0:
    nSeeds = int(n * k)
else:
    nSeeds = int(k)
# Randomized temp filename to avoid collisions between concurrent runs.
perf_csv_fname = "perf_out" + str(sample(range(1000),1)[0]) + ".csv"
print "Number of nodes: ", n
print "nSeeds = ", nSeeds
results_file = results_dir + output + '-influence_values_samples_min-%d-samples_max-%d-k-%.3f-prob_method-%d'%(min_samples,max_samples, k, prob_method)
removeFile(results_file)
seeds_fname = "%s-seeds-%d.cp"%(dataset,nSeeds)
# Single random seed set of size nSeeds, shared by every batch below.
generateSeedFiles(nSeeds, nSeeds+1, 1, range(n), 1, dataset + "-seeds-")
# NOTE(review): duplicate removeFile(results_file) -- second call is a no-op.
removeFile(results_file)
values = []
for samples in xrange(min_samples, max_samples+1, samples_step):
    print "number of samples = ", samples
    output_fname = results_dir + 'nReached%d.txt'%random.randint(1,1000)
    # Each iteration runs only samples_step additional BFS samples; 'values'
    # accumulates across iterations, so the total grows toward max_samples.
    subprocess.Popen("python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -get_n_reached 1 -reached_nodes_file %s"%(dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), samples_step, output_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
    removeFile(output + "-seq-" + str(nSeeds))
    f_values = open(output_fname,'r')
    # The reached-nodes file holds one whitespace-separated count per sample.
    values += [int(v) for v in f_values.readline().strip().split()]
    f_values.close()
    removeFile(output_fname)
    print "spread values for %d samples: %s"%(samples,str(values))
    # Append: cumulative sample count, std of all values so far, raw values.
    f = open(results_file, 'a')
    f.write('%d\t%.5f\t%s\n'%(samples, np.std(values), "\t".join(str(val) for val in values)))
    f.close()
# Build the link-server for the dataset, then sweep seed-set sizes k and run
# the approximation algorithm (ic_bfs_eval.py) once per k.
print "creating link-server object"
# prob_method 0 uses a uniform edge probability; otherwise a two-level one.
if parameters.prob_method == 0:
    edge_prob = 0.2
else:
    edge_prob = [0.1,0.01]
L=LinkServerCP(dataset, edges_csv, create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter, undirected=parameters.undirected)
print "n = ", L.getNumNodes()
V = LoadNodesFromFile(edges_csv, delimiter)
n = len(V)
print 'min_frac', min_frac
# Seed-set size sweep bounds, expressed as fractions of n.
k_min = int(n * min_frac)
k_max = int(n * max_frac) + 1
k_step = int(n * interval)
# Pre-generate nSamples seed sets for every k in the sweep.
generateSeedFiles(k_min, k_max, k_step, V, nSamples, 'experiments/' + output + "-seeds-")
mean_errors, std_errors = [], []
for k in xrange(k_min, k_max,k_step):
    approx_fname = 'experiments/results/' + output + "-approx_errors-k_min-%d-k_max-%d-k-%d-samples-%d"%(k_min,k_max,k,nSamples)
    seq_fname = 'experiments/results/' + output + "-seq-approx-errors-k-%d-samples-%d"%(k,nSamples)
    print approx_fname
    print seq_fname
    removeFile(approx_fname)
    seeds_fname = 'experiments/' + output + "-seeds-" + str(k) + ".cp"
    print "Running approx algorithm for k=: ", k
    # Blocking call: reading stdout waits for the subprocess to finish.
    subprocess.Popen("python ic_bfs_eval.py -dataset %s -res_fname %s -seeds %s -output_mode 2"%\
        (dataset, approx_fname, seeds_fname), \
        shell = True, stdout = subprocess.PIPE).stdout.read()
# Build the link-server, fix a single seed-set size k, and compare the
# approximation at several tau scale factors against one sequential baseline.
print "creating link-server object"
# prob_method 0 uses a uniform edge probability; otherwise a two-level one.
if parameters.prob_method == 0:
    edge_prob = 0.2
else:
    edge_prob = [0.1,0.01]
L=LinkServerCP(dataset, edges_csv, create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter)
print "n = ", L.getNumNodes()
V = LoadNodesFromFile(edges_csv, delimiter)
n = len(V)
k = int(n * k_frac)
print "n=", n
print "k=", k
seeds_fname = output + "-seeds-" + str(k) + ".cp"
removeFile(seeds_fname)
removeFile(dataset)
# nSamples random seed sets of size k, all stored under seeds_fname prefix.
generateSeedFiles(k, k+1, 1, V, nSamples, output + "-seeds-")
# One approximation output file per scale factor in the sweep.
approx_fnames = ['experiments/results/' + output + "-approx-k-%d-samples-%d-scale-%.5f"%(k,nSamples,scale) for scale in drange(min_frac,max_frac,interval)]
seq_fname = 'experiments/results/' + output + "-seq-k-%d-samples-%d-eps-%.5f"%(k,nSamples,eps)
for i, scale in enumerate(drange(min_frac, max_frac, interval)):
    removeFile(approx_fnames[i])
    print "Running approx algorithm for scale factor: ", scale
    # Blocking call: reading stdout waits for the subprocess to finish.
    subprocess.Popen("python ic_bfs_eval.py -dataset %s -scale %.5f -res_fname %s -seeds %s -output_mode 0"%\
        (dataset, scale, approx_fnames[i], seeds_fname), \
        shell = True, stdout = subprocess.PIPE).stdout.read()
# Sequential baseline, run once after the scale sweep.
print "Running sequential algorithm on seed sets with eps=%.5f"%eps
removeFile(seq_fname)
subprocess.Popen("python seq_estimation.py -dataset %s -seeds_file %s -results_file %s -output_mode 2 -min_samples 500 -min_relative_standard_error 0.01 "%\
    (dataset, seeds_fname, seq_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
# Influence-concentration experiment: for a growing number of sequential
# samples, log the relative standard error (sem/mean) of the accumulated
# spread values over several seed sets.
# k_mode 0: k is a fraction of n; otherwise k is an absolute seed count.
if k_mode == 0:
    nSeeds = int(n * k)
else:
    nSeeds = int(k)
# Randomized filenames to avoid collisions between concurrent runs.
approx_results_fname = output + "-equal_times-approx-" + str(k) + '-' + str(sample(range(1000),1)[0])
perf_csv_fname = "perf_out" + str(sample(range(1000),1)[0]) + ".csv"
nBFS_samples = 1000
print "Number of nodes: ", n
print "nSeeds = ", nSeeds
results_file = results_dir + output + '-influence_concentration_samples_min-%d-samples_max-%d-k-%.3f-prob_method-%d-nSeeds_sets-%d'%(min_samples,max_samples, k, prob_method, seed_sets)
removeFile(results_file)
seeds_fname = "%s-seeds-%d.cp"%(results_dir + output, nSeeds)
# seed_sets random seed sets of size nSeeds.
generateSeedFiles(nSeeds, nSeeds+1, 1, range(n), seed_sets, results_dir + output + "-seeds-")
values = []
for samples in xrange(min_samples, max_samples+1, samples_step):
    print "number of samples = ", samples
    output_fname = results_dir + 'nReached%d.txt'%random.randint(1,1000)
    # Each iteration runs only samples_step additional BFS samples; 'values'
    # accumulates across iterations toward max_samples in total.
    subprocess.Popen("python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d -get_n_reached 1 -reached_nodes_file %s"%(dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), samples_step, output_fname), shell=True,stdout=subprocess.PIPE).stdout.read()
    removeFile(output + "-seq-" + str(nSeeds))
    f_values = open(output_fname,'r')
    f = open(results_file, 'a')
    # One line per seed set in the reached-nodes file; after folding each line
    # in, record the relative standard error of everything seen so far.
    # NOTE(review): nesting of the f.write under this loop was reconstructed
    # from collapsed source -- confirm against the original layout.
    for i, line in enumerate(f_values.readlines()):
        values += [int(v) for v in line.strip().split()]
        f.write('%d\t%.5f\n'%(samples, sem(values)/np.mean(values)))
    f.close()
    f_values.close()
    removeFile(output_fname)
'-approx-heuristic-prob_method-%d-k_min-%.4f-k_max-%.4f-tau_scale-%.3f-samples-%d-bfs_samples-%d-init_samples-%d-iter_samples-%d'%\
    (parameters.prob_method, min_k, max_k, tau_scale, nSamples, nBFS_samples, init_samples, iter_samples)
# (The expression above completes a results_file assignment begun outside
# this view.) Equal-cycle-budget comparison: measure link-server load cost,
# then for each k compare the approximation heuristic against a Vanilla run
# given an equivalent number of CPU cycles.
removeFile(results_file)
# Record time to load link-server file
subprocess.Popen("perf stat -x, -o %s python load_link_server.py -cp %s"%\
    (perf_csv_fname, dataset), shell = True, stdout = subprocess.PIPE).stdout.read()
nCycles_link_server = getNumCycles(perf_csv_fname)
print "Number of cycles for loading link server: ", nCycles_link_server
removeFile(perf_csv_fname)
for k in xrange(int(min_k),int(max_k + 1), int(k_step)):
    print "k = ", k
    seeds_fname = "%s-seeds-%d.cp"%(output, k)
    for i in xrange(nSamples):
        print "sample #",i
        # Fresh random seed set of size k for this repetition.
        generateSeedFiles(k, k+1, 1, range(n), 1, output + "-seeds-")
        # NOTE(review): 'seeds' is loaded but the estimators below are given
        # the filename; the load also sanity-checks the pickle.
        seeds=cp.load(open(seeds_fname,'r'))
        print "Running Vanilla with %d samples"%(nBFS_samples)
        true_value, num_cycles_full = runVanilla(dataset, seeds_fname, nBFS_samples, parameters.cores)
        print "Done. Number of cycles: %d"%num_cycles_full
        # Per-BFS cost, net of the one-time link-server loading cost.
        nCycles_per_bfs = 1.*(num_cycles_full - nCycles_link_server) / nBFS_samples
        print "Number of cycles per sample", nCycles_per_bfs
        print "Running approximation algorithm"
        approx_estimate, num_cycles_approx = runApproxHeuristic(dataset, seeds_fname, tau_scale, parameters.cores,\
            parameters.init_samples, parameters.iter_samples, nCycles_link_server)
        print "Number of cycles without link-server loading: ", num_cycles_approx
        print "Done approximating, now running naive sequential algorithm"
        # Give Vanilla the same cycle budget the approximation consumed.
        nVanilla_samples = int(ceil(1. * num_cycles_approx / nCycles_per_bfs))
        print "Running Vanilla for %s samples"%nVanilla_samples
        seq_estimate, num_cycles_seq = runVanilla(dataset, seeds_fname, nVanilla_samples, parameters.cores)
create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter, undirected=parameters.undirected)
# (The arguments above complete a LinkServerCP(...) call begun outside this
# view.) Sweep seed-set sizes k and run the sequential estimator per k.
print "n = ", L.getNumNodes()
V = LoadNodesFromFile(edges_csv, delimiter)
n = len(V)
# Seed-set size sweep bounds, expressed as fractions of n.
k_min = int(n * min_frac)
k_max = int(n * max_frac) + 1
k_step = int(n * interval)
print "max_k = ", max_frac
print "Minimum k value: %d, maximum k value: %d" % (k_min, k_max)
removeFile(dataset)
# Pre-generate nSamples seed sets for every k in the sweep.
generateSeedFiles(k_min, k_max, k_step, V, nSamples, 'experiments/' + output + "-seeds-")
results_fname = "experiments/results/" + output + '-approximations-nSamples-%d-k_frac-%.3f-%.3f' % (
    nSamples, min_frac, max_frac)
removeFile(results_fname)
mean_errors, std_errors = [], []
for k in xrange(k_min, k_max, k_step):
    approx_fname = 'experiments/results/' + output + \
        "-approx_errors-k_min-%d-k_max-%d-k-%d-samples-%d"%(k_min,k_max,k,nSamples)
    seq_fname = 'experiments/results/' + output + "-seq-approx-errors-k-%d-samples-%d" % (
        k, nSamples)
    seeds_fname = 'experiments/' + output + "-seeds-" + str(k) + ".cp"
    removeFile(seq_fname)
    print "Running sequential algorithm for k=%d" % k
    # The command string below continues past this view (backslash inside the
    # string literal joins it with the next, unseen source line).
    subprocess.Popen("python seq_estimation.py -dataset %s -seeds_file %s -cores 40 -results_file %s -output_mode 2 -min_samples 500\
# Build the link-server, fix a single seed-set size k, and run the
# approximation at several tau scale factors (sequential baseline handled
# outside this view).
L = LinkServerCP(dataset, edges_csv, create_new=True, prob_method=parameters.prob_method, prob=edge_prob, delim=delimiter)
print "n = ", L.getNumNodes()
V = LoadNodesFromFile(edges_csv, delimiter)
n = len(V)
k = int(n * k_frac)
print "n=", n
print "k=", k
seeds_fname = output + "-seeds-" + str(k) + ".cp"
removeFile(seeds_fname)
removeFile(dataset)
# nSamples random seed sets of size k, all stored under seeds_fname prefix.
generateSeedFiles(k, k + 1, 1, V, nSamples, output + "-seeds-")
# One approximation output file per scale factor in the sweep.
approx_fnames = [
    'experiments/results/' + output + "-approx-k-%d-samples-%d-scale-%.5f" % (k, nSamples, scale)
    for scale in drange(min_frac, max_frac, interval)
]
seq_fname = 'experiments/results/' + output + "-seq-k-%d-samples-%d-eps-%.5f" % (
    k, nSamples, eps)
for i, scale in enumerate(drange(min_frac, max_frac, interval)):
    removeFile(approx_fnames[i])
    print "Running approx algorithm for scale factor: ", scale
    # Blocking call: reading stdout waits for the subprocess to finish.
    subprocess.Popen("python ic_bfs_eval.py -dataset %s -scale %.5f -res_fname %s -seeds %s -output_mode 0"%\
        (dataset, scale, approx_fnames[i], seeds_fname), \
        shell = True, stdout = subprocess.PIPE).stdout.read()
L=LinkServerCP('input/datasets/' + dataset, csv_fname, create_new=True, prob_method=parameters.prob_method, prob=[0.1,0.01], delim='\t', undirected = 1) # record loading time of link-server -- for interpolation removeFile(perf_output_fname) subprocess.Popen("perf stat -x, -o %s python load_link_server.py -cp %s"%\ (perf_output_fname, "input/datasets/" + dataset), shell = True, stdout = subprocess.PIPE).stdout.read() nCycles_link_server = getNumCycles(perf_output_fname) removeFile(perf_output_fname) #removeFile(csv_fname) V = xrange(n) for nSeeds in xrange(int(min_frac * n), int(max_frac * n), int(interval * n)): for i in xrange(nSamples): seeds_fname = output + "-seeds-%d-%d.cp"%(i, nSeeds) generateSeedFiles(nSeeds, nSeeds+1, int(interval * n), V, 1, output + "-seeds-%d-"%i) subprocess.Popen("perf stat -x, -o %s python ic_bfs_eval.py -dataset %s -cores %d -res_fname %s -seeds %s -output_mode 3 -undirected 1"%\ (perf_output_fname, 'input/datasets/' + dataset, parameters.cores, output + "-approx-" + str(nSeeds), seeds_fname), \ shell = True, stdout = subprocess.PIPE).stdout.read() cycles_approx = getNumCycles(perf_output_fname) cycles_seq = {} print "Done approximating, now running naive sequential algorithm" for eps in eps_list: print "Running sequential for eps = ", eps removeFile(perf_output_fname) subprocess.Popen("perf stat -x, -o %s python seq_estimation.py -dataset %s -cores %d -seeds_file %s -results_file %s -output_mode 3 -nSamples %d"%(perf_output_fname, 'input/datasets/' + dataset, parameters.cores, seeds_fname, output + "-seq-" + str(nSeeds), nBFS_samples), shell=True,stdout=subprocess.PIPE).stdout.read() cycles_seq[eps] = (n*log(n,2) / nBFS_samples) * (getNumCycles(perf_output_fname) - nCycles_link_server) + nCycles_link_server removeFile(perf_output_fname) for i, eps in enumerate(eps_list): f = open(running_times_files[i],'a')