def run_synth_test():
    """ Estimate the marginal likelihood of the data with parallel AIS.

    Initializes the parallel test harness, optionally seeds the sampler from
    the pickled state in ``options.x0_file``, runs ``parallel_ais`` and
    pickles ``(marg_lkhd, log_weights)`` to
    ``<options.resultsDir>/<options.model>_marginal_lkhd.pkl``.
    """
    options, popn, data, client, popn_true, x_true = \
        initialize_parallel_test_harness()

    # If x0 specified, load x0 from file
    x0 = None
    if options.x0_file is not None:
        # NOTE: unpickling can execute arbitrary code; only pass trusted files.
        # Pickles are opened in binary mode so that binary protocols survive
        # on platforms that translate newlines in text mode.
        with open(options.x0_file, 'rb') as f:
            print("Initializing with state from: %s" % options.x0_file)
            prev_x0 = cPickle.load(f)

        if isinstance(prev_x0, list):
            # A list is treated as a sequence of samples; start from the last
            x0 = prev_x0[-1]
        else:
            mle_x0 = prev_x0
            # HACK: We're assuming x0 came from a standard GLM
            mle_model = make_model('standard_glm', N=data['N'])
            mle_popn = Population(mle_model)
            mle_popn.set_data(data)

            x0 = popn.sample(None)
            x0 = convert_model(mle_popn, mle_model, mle_x0,
                               popn, popn.model, x0)

    # Flip to True to reuse a previously saved estimate instead of recomputing
    use_existing = False

    fname = os.path.join(options.resultsDir,
                         '%s_marginal_lkhd.pkl' % options.model)
    if use_existing and os.path.exists(fname):
        print("Found existing results")
        with open(fname, 'rb') as f:
            # The file holds the same tuple that is dumped below
            marg_lkhd, log_weights = cPickle.load(f)
    else:
        N_samples = 10
        popn_true.set_data(data)

        # Estimate the marginal log likelihood
        print("Performing parallel inference")
        marg_lkhd, log_weights = parallel_ais(client, data,
                                              x0=x0,
                                              N_samples=N_samples,
                                              steps_per_B=50,
                                              resdir=options.resultsDir)

        # Save results (protocol=-1 is a binary protocol, hence 'wb')
        print("Saving results to %s" % fname)
        with open(fname, 'wb') as f:
            cPickle.dump((marg_lkhd, log_weights), f, protocol=-1)
def run_synth_test():
    """ Run parallel AIS on synthetic data and save the marginal likelihood.

    The initial state is ``None`` unless ``options.x0_file`` is given. That
    file may hold either a list of states (the last one is used) or a single
    state, which is assumed to come from a standard GLM and is converted to
    this population's model. The ``(marg_lkhd, log_weights)`` result is
    pickled into ``options.resultsDir``.
    """
    options, popn, data, client, popn_true, x_true = \
        initialize_parallel_test_harness()

    # If x0 specified, load x0 from file
    x0 = None
    if options.x0_file is not None:
        print("Initializing with state from: %s" % options.x0_file)
        # NOTE: only unpickle trusted files; pickle can run arbitrary code.
        # Binary mode is required for pickles written with protocol >= 1.
        with open(options.x0_file, "rb") as f:
            prev_x0 = cPickle.load(f)

        if isinstance(prev_x0, list):
            x0 = prev_x0[-1]
        else:
            # HACK: We're assuming x0 came from a standard GLM
            mle_model = make_model("standard_glm", N=data["N"])
            mle_popn = Population(mle_model)
            mle_popn.set_data(data)

            x0 = convert_model(mle_popn, mle_model, prev_x0,
                               popn, popn.model, popn.sample(None))

    # Hard-coded switch: set to True to load a cached result if one exists
    use_existing = False

    fname = os.path.join(options.resultsDir,
                         "%s_marginal_lkhd.pkl" % options.model)
    if use_existing and os.path.exists(fname):
        print("Found existing results")
        with open(fname, "rb") as f:
            # Unpack the tuple written by the branch below
            marg_lkhd, log_weights = cPickle.load(f)
        return

    popn_true.set_data(data)

    # Estimate the marginal log likelihood
    print("Performing parallel inference")
    marg_lkhd, log_weights = parallel_ais(
        client, data,
        x0=x0,
        N_samples=10,
        steps_per_B=50,
        resdir=options.resultsDir
    )

    # Save results in binary mode to match the binary pickle protocol
    print("Saving results to %s" % fname)
    with open(fname, "wb") as f:
        cPickle.dump((marg_lkhd, log_weights), f, protocol=-1)
def run_parallel_map():
    """ Prepare the parallel engines for fitting a standard GLM.

    Parses the command line, loads the data, builds a ``'standard_glm'``
    model for ``data['N']`` neurons, connects to the engines through
    ``Client(profile="sge")`` and initializes a population on each engine.
    """
    # Parse command line args
    (options, args) = parse_cmd_line_args()

    # Load the data
    data = load_data(options)

    # Get a model for the data
    model_type = 'standard_glm'
    # The neuron count was previously referenced as an undefined name ``N``
    N = data['N']
    model = make_model(model_type, N=N)

    # Get parallel clients
    rc = Client(profile="sge")
    dview = rc[:]
    # dview = get_engines(n_workers=8)

    # Load imports on the client
    load_imports_on_client(dview)

    # Initialize population objects on the clients
    # NOTE(review): the arguments are passed as a single tuple, as before;
    # confirm this matches the signature of initialize_client.
    dview.apply_sync(initialize_client, (model_type, N, data))
def load_set_of_results(N, T, graph_model='er', sample_frac=0.1):
    """ Load true, MAP and MCMC states for every synthetic dataset.

    Each subdirectory of the hard-coded data directory (selected by
    ``graph_model``, ``N`` and ``T``) is expected to contain ``data.pkl``,
    ``model.pkl``, ``map/results.pkl`` and ``mcmc/results.pkl``.

    :param N: Number of neurons (part of the directory name).
    :param T: Duration value used in the directory name.
    :param graph_model: Name of the graph model subdirectory.
    :param sample_frac: Fraction of the MCMC samples, taken from the tail,
                        whose states are evaluated.
    :return: ``(s_trues, s_infs_map, s_infs_mcmc)``. When loading a MAP or
             MCMC estimate fails, nothing is appended for that
             subdirectory, so those lists may be shorter than ``s_trues``.
    """
    data_dir = os.path.join('/group', 'hips', 'scott', 'pyglm', 'data',
                            'synth', graph_model, 'N%dT%d' % (N, T))

    # Evaluate the state for each of the parameter settings
    s_infs_mcmc = []
    s_infs_map = []
    s_trues = []

    # Enumerate the subdirectories containing the data
    subdirs = [d for d in os.listdir(data_dir)
               if os.path.isdir(os.path.join(data_dir, d))]

    # For each data subdirectory, load the true data, the MAP estimate,
    # and the MCMC results
    print("WARNING: Make sure we sample all subdirs")
    for d in subdirs:
        print("Loading data and results from %s" % d)

        # Pickles are read in binary mode so binary protocols load correctly
        print("Loading true data")
        with open(os.path.join(data_dir, d, 'data.pkl'), 'rb') as f:
            data = cPickle.load(f)

        print("Loading model")
        with open(os.path.join(data_dir, d, 'model.pkl'), 'rb') as f:
            model_data = cPickle.load(f)

        # HACK: fill in graph settings that may be missing from the model
        graph_params = model_data['network']['graph']
        if 'N_dims' not in graph_params:
            graph_params['N_dims'] = 1
        if 'location_prior' not in graph_params:
            graph_params['location_prior'] = {
                'type': 'gaussian',
                'mu': 0.0,
                'sigma': 1.0,
            }
        graph_vars = data['vars']['net']['graph']
        if 'L' in graph_vars:
            graph_vars['L'] = graph_vars['L'].ravel()

        popn_data = Population(model_data)
        popn_data.set_data(data)
        s_trues.append(popn_data.eval_state(data['vars']))

        try:
            print("Loading map estimate")
            with open(os.path.join(data_dir, d, 'map', 'results.pkl'),
                      'rb') as f:
                x_map = cPickle.load(f)

            model_map = make_model('standard_glm', N=data['N'])
            popn_map = Population(model_map)
            popn_map.set_data(data)

            print("Evaluating MAP state")
            s_infs_map.append(popn_map.eval_state(x_map))
        except Exception as e:
            # Best effort: report the cause and continue with the next step
            print("ERROR: Failed to load MAP estimate: %s" % (e,))

        try:
            print("Loading mcmc estimate")
            with open(os.path.join(data_dir, d, 'mcmc', 'results.pkl'),
                      'rb') as f:
                x_mcmc = cPickle.load(f)

            model_mcmc = make_model('sparse_weighted_model', N=data['N'])
            popn_mcmc = Population(model_mcmc)
            popn_mcmc.set_data(data)

            # For MCMC results, only consider the tail of the samples
            print("Evaluating MCMC states")
            N_samples = len(x_mcmc)
            start_smpl = int(np.floor(N_samples - sample_frac * N_samples))

            # Evaluate the state of each retained sample
            this_s_mcmc = [popn_mcmc.eval_state(x_mcmc[i])
                           for i in range(start_smpl, N_samples)]
            s_infs_mcmc.append(this_s_mcmc)
        except Exception as e:
            print("ERROR: Failed to load MCMC estimate: %s" % (e,))

    return s_trues, s_infs_map, s_infs_mcmc
def run_synth_test():
    """ Run a test with synthetic data and MAP inference with cross validation.

    Fits every model returned by ``get_xv_models`` on the first 75% of the
    data, scores it on the remaining 25%, refits the model with the best
    cross-validation log likelihood on the full dataset, then prints a
    summary, pickles the fitted state to ``results.pkl`` in
    ``options.resultsDir`` and plots the results.
    """
    options, popn, data, popn_true, x_true = initialize_test_harness()

    # Get the list of models for cross validation
    base_model = make_model(options.model, N=data['N'], dt=0.001)
    models = get_xv_models(base_model)

    # TODO Segment data into training and cross validation sets
    train_frac = 0.75
    T_split = data['T'] * train_frac
    train_data = segment_data(data, (0, T_split))
    xv_data = segment_data(data, (T_split, data['T']))

    # Preprocess the data sequences
    train_data = popn.preprocess_data(train_data)
    xv_data = popn.preprocess_data(xv_data)

    # Sample random initial state
    x0 = popn.sample()

    # Track the best model and parameters
    best_ind = -1
    best_xv_ll = -np.inf
    best_x = x0
    best_model = None

    # Fit each model using the optimum of the previous models
    n_models = len(models)
    train_lls = np.zeros(n_models)
    xv_lls = np.zeros(n_models)
    total_lls = np.zeros(n_models)
    for (i, model) in enumerate(models):
        print("Training model %d" % i)
        # Warm start from the best state found so far
        x0 = copy.deepcopy(best_x)
        popn.set_hyperparameters(model)
        popn.set_data(train_data)
        ll0 = popn.compute_log_p(x0)
        print("Training LL0: %f" % ll0)

        # Perform inference
        x_inf = coord_descent(popn, x0=x0, maxiter=1)
        ll_train = popn.compute_log_p(x_inf)
        print("Training LP_inf: %f" % ll_train)
        train_lls[i] = ll_train

        # Compute log lkhd on xv data
        popn.set_data(xv_data)
        ll_xv = popn.compute_ll(x_inf)
        print("Cross Validation LL: %f" % ll_xv)
        xv_lls[i] = ll_xv

        # Compute log lkhd on total dataset
        popn.set_data(data)
        ll_total = popn.compute_ll(x_inf)
        print("Total LL: %f" % ll_total)
        total_lls[i] = ll_total

        # Update best model
        if ll_xv > best_xv_ll:
            best_ind = i
            best_xv_ll = ll_xv
            best_x = copy.deepcopy(x_inf)
            best_model = copy.deepcopy(model)

    # Create a population with the best model
    popn.set_hyperparameters(best_model)
    popn.set_data(data)

    # Fit the best model on the full training data, starting from the best
    # state found above (x0 only holds the start point of the last model).
    # NOTE(review): ``data`` is passed positionally here but not in the loop
    # above; confirm which call matches the signature of coord_descent.
    best_x = coord_descent(popn, data, x0=best_x, maxiter=1,
                           use_hessian=False,
                           use_rop=False)

    # Print results summary
    for i in range(n_models):
        print("Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % (
            i, train_lls[i], xv_lls[i], total_lls[i]))
    print("Best model: %d" % best_ind)
    print("Best Total LL: %f" % popn.compute_ll(best_x))
    print("True LL: %f" % popn_true.compute_ll(x_true))

    # Save results (binary mode keeps the pickle portable across platforms)
    results_file = os.path.join(options.resultsDir, 'results.pkl')
    print("Saving results to %s" % results_file)
    with open(results_file, 'wb') as f:
        cPickle.dump(best_x, f)

    # Plot results
    plot_results(popn, best_x, popn_true, x_true, resdir=options.resultsDir)
def test_latent_distance_network_sampler(N, N_samples=10000):
    """ Generate a bunch of latent distance networks, run the sampler on
    them to see how well we mix over latent locations.

    For each trial a network is sampled from the prior of the true model,
    the latent location sampler is run with the true adjacency matrix held
    fixed, and the closed-form Erdos-Renyi marginal likelihood is printed
    next to the AIS estimate for the distance model.

    :param N: Number of neurons in the network
    :param N_samples: Number of samples to draw with the location sampler.
                      This argument used to be overwritten with 1000 inside
                      the loop; it is now honored.
    """
    true_model_type = 'latent_distance'
    if true_model_type == 'erdos_renyi':
        true_model = make_model('sparse_weighted_model', N)
    elif true_model_type == 'latent_distance':
        true_model = make_model('distance_weighted_model', N)

    distmodel = make_model('distance_weighted_model', N)
    D = distmodel['network']['graph']['N_dims']

    trials = 1
    for _ in range(trials):
        # Generate a true random network
        popn_true, x_true, A_true = sample_network_from_prior(true_model)
        dist_popn, x_inf, _unused = sample_network_from_prior(distmodel)

        # Seed the inference population with the true network
        x_inf['net']['graph']['A'] = A_true

        # Create a location sampler
        print("Initializing latent location sampler")
        loc_sampler = LatentLocationUpdate()
        loc_sampler.preprocess(dist_popn)

        # Run the sampler
        smpls = fit_latent_network_given_A(x_inf, loc_sampler,
                                           N_samples=N_samples)

        if true_model_type == 'latent_distance':
            # Evaluate the state
            L_true = x_true['net']['graph']['L'].reshape((N, D))
            L_smpls = [x['net']['graph']['L'].reshape((N, D))
                       for x in smpls]

            # Visualize the results
            plot_latent_distance_samples(L_true, L_smpls, A_true=A_true)

            # Plot errors in relative distance over time
            compute_diff_of_dists(L_true, L_smpls)

        # Compute marginal likelihood of erdos renyi with the same sparsity
        nnz_A = float(A_true.sum())
        N_conns = A_true.size

        # Ignore the diagonal (this subtracts N from both counts)
        nnz_A -= N
        N_conns -= N

        # Now compute density
        er_rho = nnz_A / N_conns
        true_er_marg_lkhd = (nnz_A * np.log(er_rho) +
                             (N_conns - nnz_A) * np.log(1 - er_rho))
        print("True ER Marg Lkhd:  %s" % (true_er_marg_lkhd,))

        # An AIS cross-check of the ER marginal likelihood was previously
        # sketched here in commented-out code and is currently disabled.

        # Approximate the marginal log likelihood of the distance model
        dist_x0 = dist_popn.sample()
        dist_x0['net']['graph']['A'] = A_true
        dist_marg_lkhd = ais_latent_network_given_A(dist_x0,
                                                    dist_popn.network.graph,
                                                    loc_sampler)
        print("Dist Marg Lkhd:  %s" % (dist_marg_lkhd,))
def fit_latent_network_to_mle():
    """ Fit latent distance networks to thresholded MLE weights.

    Loads MLE parameters (assumed to come from a standard GLM) from
    ``options.x0_file``, converts them to this population's model, and for
    a range of thresholds on the absolute weights builds a binary adjacency
    matrix and samples latent locations given that matrix. The list of
    ``(sparsity, samples)`` pairs is pickled to
    ``fit_latent_network_results.pkl`` in ``options.resultsDir``.

    :raises ValueError: if no ``x0_file`` was given on the command line.
    """
    options, popn, data, popn_true, x_true = initialize_test_harness()

    # The MLE parameters are required; fail early with a clear message
    # instead of hitting an undefined name further down.
    if options.x0_file is None:
        raise ValueError("An x0_file with MLE parameters must be specified")

    # Load MLE parameters from command line
    # NOTE: only unpickle trusted files; pickle can run arbitrary code.
    with open(options.x0_file, 'rb') as f:
        print("Initializing with state from: %s" % options.x0_file)
        mle_x = cPickle.load(f)

    mle_model = make_model('standard_glm', N=data['N'])
    mle_popn = Population(mle_model)
    mle_popn.set_data(data)

    # Create a location sampler
    print("Initializing latent location sampler")
    loc_sampler = LatentLocationUpdate()
    loc_sampler.preprocess(popn)

    # Convert the mle results into a weighted adjacency matrix
    x_aw = popn.sample(None)
    x_aw = convert_model(mle_popn, mle_model, mle_x, popn, popn.model, x_aw)

    # Get rid of unnecessary keys
    del x_aw['glms']

    # Fit the latent distance network to a thresholded adjacency matrix
    # NOTE(review): the indexing and reshape below assume W is a flat
    # vector with N*N entries; confirm.
    abs_w = np.abs(x_aw['net']['weights']['W'])
    ws = np.sort(abs_w)
    wperm = np.argsort(abs_w)

    nthrsh = 20
    # Integer step of at least 1, so the thresholds stay valid slice
    # indices even when there are fewer than nthrsh weights
    step = max(1, ws.size // nthrsh)
    threshs = np.arange(ws.size, step=step)

    res = []
    N = popn.N
    for th in threshs:
        print("Fitting network for threshold: %.3f" % th)
        # Keep every weight except the ``th`` smallest in absolute value
        A = np.zeros_like(ws, dtype=np.int8)
        A[wperm[th:]] = 1
        A = A.reshape((N, N))

        # Make sure the diag is still all 1s
        A[np.diag_indices(N)] = 1

        x = copy.deepcopy(x_aw)
        x['net']['graph']['A'] = A
        smpls = fit_latent_network_given_A(x, loc_sampler)

        # Index the results by the overall sparsity of A
        key = (np.sum(A) - N) / (float(np.size(A)) - N)
        res.append((key, smpls))

    # Save results
    results_file = os.path.join(options.resultsDir,
                                'fit_latent_network_results.pkl')
    print("Saving results to %s" % results_file)
    with open(results_file, 'wb') as f:
        cPickle.dump(res, f)
def run_parallel_map():
    """ Run parallel MAP inference on synthetic data with cross validation.

    Every model from ``get_xv_models`` is fit on the engines with the first
    75% of the data and scored on the remaining 25%. The model with the
    best cross-validation log likelihood is refit on the full dataset. The
    fitted state goes to ``results.pkl`` and the elapsed wall-clock time to
    ``runtime.pkl``, both in ``options.resultsDir``; per-model results are
    saved as ``results.partial.<i>.pkl``.
    """
    options, popn, data, client, popn_true, x_true = \
        initialize_parallel_test_harness()

    # Get the list of models for cross validation
    base_model = make_model(options.model, N=data['N'])
    models = get_xv_models(base_model)

    # Segment data into training and cross validation sets
    train_frac = 0.75
    T_split = data['T'] * train_frac
    train_data = segment_data(data, (0, T_split))
    xv_data = segment_data(data, (T_split, data['T']))

    # Sample random initial state
    x0 = popn.sample(None)

    # Track the best model and parameters
    best_ind = -1
    best_xv_ll = -np.inf
    best_x = x0
    best_model = None

    # Flip to True to reuse partial results saved by an earlier run
    use_existing = False

    # Wall-clock time: the master mostly waits on the engines, so processor
    # time would not reflect the actual runtime.
    start_time = time.time()

    # Fit each model using the optimum of the previous models
    train_lls = np.zeros(len(models))
    xv_lls = np.zeros(len(models))
    total_lls = np.zeros(len(models))
    for (i, model) in enumerate(models):
        print("Evaluating model %d" % i)
        set_hyperparameters_on_engines(client[:], model)
        add_data_on_engines(client[:], train_data)

        partial_file = os.path.join(options.resultsDir,
                                    'results.partial.%d.pkl' % i)
        if use_existing and os.path.exists(partial_file):
            print("Found existing results for model %d" % i)
            with open(partial_file, 'rb') as f:
                (x_inf, ll_train, ll_xv, ll_total) = cPickle.load(f)
                train_lls[i] = ll_train
                xv_lls[i] = ll_xv
                total_lls[i] = ll_total
        else:
            # Warm start from the best state found so far
            x0 = copy.deepcopy(best_x)
            ll0 = parallel_compute_ll(client[:], x0, data['N'])
            print("Training LL0: %f" % ll0)

            # Perform inference
            x_inf = parallel_coord_descent(client, data['N'], x0=x0,
                                           maxiter=1,
                                           use_hessian=False,
                                           use_rop=False)

            ll_train = parallel_compute_ll(client[:], x_inf, data['N'])
            print("Training LL_inf: %f" % ll_train)
            train_lls[i] = ll_train

            # Compute log lkhd on xv data
            add_data_on_engines(client[:], xv_data)
            ll_xv = parallel_compute_ll(client[:], x_inf, data['N'])
            print("Cross Validation LL: %f" % ll_xv)
            xv_lls[i] = ll_xv

            # Compute log lkhd on total dataset
            add_data_on_engines(client[:], data)
            ll_total = parallel_compute_ll(client[:], x_inf, data['N'])
            print("Total LL: %f" % ll_total)
            total_lls[i] = ll_total

            # protocol=-1 is binary, so the file is opened in binary mode
            print("Saving partial results")
            with open(partial_file, 'wb') as f:
                cPickle.dump((x_inf, ll_train, ll_xv, ll_total), f,
                             protocol=-1)

        # Update best model
        if ll_xv > best_xv_ll:
            best_ind = i
            best_xv_ll = ll_xv
            best_x = copy.deepcopy(x_inf)
            best_model = copy.deepcopy(model)

    print("Training the best model (%d) with the full dataset" % best_ind)
    # Set the best hyperparameters
    set_hyperparameters_on_engines(client[:], best_model)
    add_data_on_engines(client[:], data)

    # Fit the best model on the full training data
    best_x = parallel_coord_descent(client, data['N'], x0=best_x, maxiter=1,
                                    use_hessian=False,
                                    use_rop=False)

    # Print results summary
    for i in range(len(models)):
        print("Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % (
            i, train_lls[i], xv_lls[i], total_lls[i]))
    print("Best model: %d" % best_ind)
    print("Best Total LL: %f" %
          parallel_compute_ll(client[:], best_x, data['N']))
    print("True LL: %f" % popn_true.compute_ll(x_true))

    stop_time = time.time()

    # Save results
    with open(os.path.join(options.resultsDir, 'results.pkl'), 'wb') as f:
        cPickle.dump(best_x, f, protocol=-1)

    # Save runtime (seconds of wall-clock time)
    with open(os.path.join(options.resultsDir, 'runtime.pkl'), 'wb') as f:
        cPickle.dump(stop_time - start_time, f, protocol=-1)
def load_set_of_results(N, T, graph_model='er', sample_frac=0.1):
    """ Collect evaluated states for all datasets of one configuration.

    Walks the subdirectories of the hard-coded synthetic data directory for
    the given ``graph_model``, ``N`` and ``T``. For each one it evaluates
    the true state, the MAP estimate (with a ``'standard_glm'`` model) and
    the tail of the MCMC samples (with a ``'sparse_weighted_model'``).

    :param N: Number of neurons; used in the directory name and nowhere else.
    :param T: Value used in the directory name.
    :param graph_model: Subdirectory naming the graph model.
    :param sample_frac: Fraction of MCMC samples, counted from the end, to
                        evaluate.
    :return: Tuple ``(s_trues, s_infs_map, s_infs_mcmc)``. A subdirectory
             whose MAP or MCMC results cannot be loaded contributes no
             entry to the corresponding list.
    """
    data_dir = os.path.join('/group', 'hips', 'scott', 'pyglm', 'data',
                            'synth', graph_model, 'N%dT%d' % (N, T))

    # Evaluate the state for each of the parameter settings
    s_trues = []
    s_infs_map = []
    s_infs_mcmc = []

    # For each data subdirectory, load the true data, the MAP estimate,
    # and the MCMC results
    print("WARNING: Make sure we sample all subdirs")
    for d in os.listdir(data_dir):
        subdir = os.path.join(data_dir, d)
        # Only directories hold datasets; skip stray files
        if not os.path.isdir(subdir):
            continue

        print("Loading data and results from %s" % d)

        # All pickles are opened in binary mode, which is required for
        # binary pickle protocols.
        print("Loading true data")
        with open(os.path.join(subdir, 'data.pkl'), 'rb') as f:
            data = cPickle.load(f)

        print("Loading model")
        with open(os.path.join(subdir, 'model.pkl'), 'rb') as f:
            model_data = cPickle.load(f)

        # HACK: supply defaults for graph settings absent from the model
        model_graph = model_data['network']['graph']
        if 'N_dims' not in model_graph:
            model_graph['N_dims'] = 1
        if 'location_prior' not in model_graph:
            model_graph['location_prior'] = {'type': 'gaussian',
                                             'mu': 0.0,
                                             'sigma': 1.0}
        true_graph = data['vars']['net']['graph']
        if 'L' in true_graph:
            true_graph['L'] = true_graph['L'].ravel()

        popn_data = Population(model_data)
        popn_data.set_data(data)
        s_trues.append(popn_data.eval_state(data['vars']))

        try:
            print("Loading map estimate")
            with open(os.path.join(subdir, 'map', 'results.pkl'), 'rb') as f:
                x_map = cPickle.load(f)

            popn_map = Population(make_model('standard_glm', N=data['N']))
            popn_map.set_data(data)

            print("Evaluating MAP state")
            s_infs_map.append(popn_map.eval_state(x_map))
        except Exception as e:
            # Best effort: say why it failed, then carry on
            print("ERROR: Failed to load MAP estimate: %s" % (e,))

        try:
            print("Loading mcmc estimate")
            with open(os.path.join(subdir, 'mcmc', 'results.pkl'),
                      'rb') as f:
                x_mcmc = cPickle.load(f)

            popn_mcmc = Population(
                make_model('sparse_weighted_model', N=data['N']))
            popn_mcmc.set_data(data)

            # For MCMC results, only consider the tail of the samples
            print("Evaluating MCMC states")
            N_samples = len(x_mcmc)
            start_smpl = int(np.floor(N_samples - sample_frac * N_samples))

            # Evaluate the state
            s_infs_mcmc.append([popn_mcmc.eval_state(x_mcmc[i])
                                for i in range(start_smpl, N_samples)])
        except Exception as e:
            print("ERROR: Failed to load MCMC estimate: %s" % (e,))

    return s_trues, s_infs_map, s_infs_mcmc
def run_gen_synth_data():
    """ Sample model parameters and generate a synthetic dataset.

    Builds the model named on the command line, samples parameters from it,
    simulates data driven by a random stimulus, checks that the evaluated
    and simulated firing rates agree, and writes ``model.pkl`` and
    ``data.pkl`` to ``options.resultsDir`` before plotting.

    :raises AssertionError: if the sampled network is unstable or the
                            firing rates disagree.
    """
    options, args = parse_cmd_line_args()

    # Create the model
    dt = 0.001
    model = make_model(options.model, N=options.N, dt=dt)
    # Set the sparsity level to minimize the risk of unstable networks
    stabilize_sparsity(model)

    print("Creating master population object")
    popn = Population(model)

    # Sample random parameters from the model
    x_true = popn.sample()

    # Check stability of matrix
    assert check_stability(model, x_true, options.N), \
        "ERROR: Sampled network is unstable!"

    # Save the model so it can be loaded alongside the data
    # (protocol=-1 is a binary protocol, so the file is opened with 'wb')
    fname_model = os.path.join(options.resultsDir, 'model.pkl')
    print("Saving model to %s" % fname_model)
    with open(fname_model, 'wb') as f:
        cPickle.dump(model, f, protocol=-1)

    print("Generating synthetic data with %d neurons and %.2f seconds." %
          (options.N, options.T_stop))

    # Set simulation parameters
    dt_stim = 0.1
    D_stim = (5, 5)
    # D_stim = model['bkgd']['D_stim'] if 'D_stim' in model['bkgd'] else 0
    if isinstance(D_stim, int):
        D_stim = [D_stim]

    # Array dimensions must be integers; round to avoid losing a stimulus
    # bin to floating point error in the division.
    n_stim_bins = int(round(options.T_stop / dt_stim))
    stim = np.random.randn(n_stim_bins, *D_stim)

    data = gen_synth_data(options.N, options.T_stop, popn, x_true,
                          dt, dt_stim, D_stim, stim)

    # Set the data so that the population state can be evaluated
    popn.add_data(data)

    # DEBUG Evaluate the firing rate and the simulated firing rate
    state = popn.eval_state(x_true)
    for n in np.arange(options.N):
        lam_true = state['glms'][n]['lam']
        lam_sim = popn.glm.nlin_model.f_nlin(data['X'][:, n])
        assert np.allclose(lam_true, lam_sim)

    # Pickle the data so we can open it more easily
    fname_pkl = os.path.join(options.resultsDir, 'data.pkl')
    print("Saving data to %s" % fname_pkl)
    with open(fname_pkl, 'wb') as f:
        cPickle.dump(data, f, protocol=-1)

    # Plot firing rates, stimulus responses, etc
    # Impulse responses are only plotted for networks of at most 16 neurons
    do_plot_imp_responses = int(options.N) <= 16
    plot_results(popn, data['vars'],
                 resdir=options.resultsDir,
                 do_plot_stim_resp=True,
                 do_plot_imp_responses=do_plot_imp_responses)
def run_gen_synth_data():
    """ Generate synthetic data from a randomly sampled model.

    Steps: parse the command line, build and stabilize the model, sample
    its parameters, save the model, simulate data with a Gaussian random
    stimulus, verify the firing rates, save the data and plot the results.
    Output files (``model.pkl``, ``data.pkl``) go to ``options.resultsDir``.

    :raises AssertionError: if ``check_stability`` rejects the sampled
                            network or the firing-rate check fails.
    """
    options, args = parse_cmd_line_args()

    # Create the model
    dt = 0.001
    model = make_model(options.model, N=options.N, dt=dt)
    # Set the sparsity level to minimize the risk of unstable networks
    stabilize_sparsity(model)

    print("Creating master population object")
    popn = Population(model)

    # Sample random parameters from the model
    x_true = popn.sample()

    # Check stability of matrix
    is_stable = check_stability(model, x_true, options.N)
    assert is_stable, "ERROR: Sampled network is unstable!"

    # Save the model so it can be loaded alongside the data
    fname_model = os.path.join(options.resultsDir, 'model.pkl')
    print("Saving model to %s" % fname_model)
    # Binary pickle protocol requires a file opened in binary mode
    with open(fname_model, 'wb') as f:
        cPickle.dump(model, f, protocol=-1)

    print("Generating synthetic data with %d neurons and %.2f seconds." %
          (options.N, options.T_stop))

    # Set simulation parameters
    dt_stim = 0.1
    D_stim = (5, 5)
    # D_stim = model['bkgd']['D_stim'] if 'D_stim' in model['bkgd'] else 0
    if isinstance(D_stim, int):
        D_stim = [D_stim]

    # Number of stimulus bins; must be an int to be used as a dimension
    T_stim = int(round(options.T_stop / dt_stim))
    stim = np.random.randn(T_stim, *D_stim)

    data = gen_synth_data(options.N,
                          options.T_stop,
                          popn,
                          x_true,
                          dt,
                          dt_stim,
                          D_stim,
                          stim)

    # Set the data so that the population state can be evaluated
    popn.add_data(data)

    # DEBUG Evaluate the firing rate and the simulated firing rate
    state = popn.eval_state(x_true)
    f_nlin = popn.glm.nlin_model.f_nlin
    for n in np.arange(options.N):
        lam_true = state['glms'][n]['lam']
        lam_sim = f_nlin(data['X'][:, n])
        assert np.allclose(lam_true, lam_sim)

    # Pickle the data so we can open it more easily
    fname_pkl = os.path.join(options.resultsDir, 'data.pkl')
    print("Saving data to %s" % fname_pkl)
    with open(fname_pkl, 'wb') as f:
        cPickle.dump(data, f, protocol=-1)

    # Plot firing rates, stimulus responses, etc
    # Impulse responses are skipped when there are more than 16 neurons
    plot_results(popn, data['vars'],
                 resdir=options.resultsDir,
                 do_plot_stim_resp=True,
                 do_plot_imp_responses=(int(options.N) <= 16))
def run_synth_test():
    """ Run a test with synthetic data and MAP inference with cross validation.

    The data is split 75/25 in time into training and cross-validation
    segments. Each candidate model is fit by coordinate descent on the
    training segment, warm-started from the best state so far, and scored
    on the held-out segment. The winner is refit on all of the data, saved
    to ``results.pkl`` in ``options.resultsDir`` and plotted.
    """
    options, popn, data, popn_true, x_true = initialize_test_harness()

    # Get the list of models for cross validation
    base_model = make_model(options.model, N=data['N'], dt=0.001)
    models = get_xv_models(base_model)

    # TODO Segment data into training and cross validation sets
    train_frac = 0.75
    T_split = data['T'] * train_frac
    train_data = popn.preprocess_data(segment_data(data, (0, T_split)))
    xv_data = popn.preprocess_data(segment_data(data, (T_split, data['T'])))

    # Track the best model and parameters, starting from a random state
    best_ind = -1
    best_xv_ll = -np.inf
    best_x = popn.sample()
    best_model = None

    # Fit each model using the optimum of the previous models
    train_lls = np.zeros(len(models))
    xv_lls = np.zeros(len(models))
    total_lls = np.zeros(len(models))
    for (i, model) in enumerate(models):
        print("Training model %d" % i)
        x0 = copy.deepcopy(best_x)
        popn.set_hyperparameters(model)
        popn.set_data(train_data)
        print("Training LL0: %f" % popn.compute_log_p(x0))

        # Perform inference
        x_inf = coord_descent(popn, x0=x0, maxiter=1)
        train_lls[i] = popn.compute_log_p(x_inf)
        print("Training LP_inf: %f" % train_lls[i])

        # Compute log lkhd on xv data
        popn.set_data(xv_data)
        ll_xv = popn.compute_ll(x_inf)
        print("Cross Validation LL: %f" % ll_xv)
        xv_lls[i] = ll_xv

        # Compute log lkhd on total dataset
        popn.set_data(data)
        total_lls[i] = popn.compute_ll(x_inf)
        print("Total LL: %f" % total_lls[i])

        # Update best model
        if ll_xv > best_xv_ll:
            best_ind = i
            best_xv_ll = ll_xv
            best_x = copy.deepcopy(x_inf)
            best_model = copy.deepcopy(model)

    # Create a population with the best model
    popn.set_hyperparameters(best_model)
    popn.set_data(data)

    # Fit the best model on the full training data. Start from best_x: the
    # loop's x0 is only the starting point used for the last model and is
    # stale whenever that last model turned out to be the best one.
    # NOTE(review): ``data`` is passed positionally here but not in the loop
    # above; confirm which form coord_descent expects.
    best_x = coord_descent(popn, data, x0=best_x, maxiter=1,
                           use_hessian=False, use_rop=False)

    # Print results summary
    for i, (ll_tr, ll_cv, ll_all) in enumerate(
            zip(train_lls, xv_lls, total_lls)):
        print("Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" %
              (i, ll_tr, ll_cv, ll_all))
    print("Best model: %d" % best_ind)
    print("Best Total LL: %f" % popn.compute_ll(best_x))
    print("True LL: %f" % popn_true.compute_ll(x_true))

    # Save results; pickle files should be opened in binary mode
    results_file = os.path.join(options.resultsDir, 'results.pkl')
    print("Saving results to %s" % results_file)
    with open(results_file, 'wb') as f:
        cPickle.dump(best_x, f)

    # Plot results
    plot_results(popn, best_x, popn_true, x_true, resdir=options.resultsDir)
def run_parallel_map():
    """ Cross-validated parallel MAP inference on synthetic data.

    For each model from ``get_xv_models`` the hyperparameters and the
    training segment (first 75% of the data) are pushed to the engines, the
    model is fit with ``parallel_coord_descent`` and its log likelihood is
    computed on the training, held-out and full data. The best model by
    held-out log likelihood is refit on all data. Outputs in
    ``options.resultsDir``: ``results.partial.<i>.pkl`` per model,
    ``results.pkl`` and ``runtime.pkl`` (elapsed wall-clock seconds).
    """
    options, popn, data, client, popn_true, x_true = \
        initialize_parallel_test_harness()
    N = data['N']

    # Get the list of models for cross validation
    base_model = make_model(options.model, N=N)
    models = get_xv_models(base_model)

    # Segment data into training and cross validation sets
    train_frac = 0.75
    T_split = data['T'] * train_frac
    train_data = segment_data(data, (0, T_split))
    xv_data = segment_data(data, (T_split, data['T']))

    # Track the best model and parameters, starting from a random state
    best_ind = -1
    best_xv_ll = -np.inf
    best_x = popn.sample(None)
    best_model = None

    # Hard-coded switch for reusing partial results from a previous run
    use_existing = False

    # Measure wall-clock time; processor time of this process would leave
    # out the time spent waiting for the engines.
    start_time = time.time()

    # Fit each model using the optimum of the previous models
    train_lls = np.zeros(len(models))
    xv_lls = np.zeros(len(models))
    total_lls = np.zeros(len(models))
    for (i, model) in enumerate(models):
        print("Evaluating model %d" % i)
        set_hyperparameters_on_engines(client[:], model)
        add_data_on_engines(client[:], train_data)

        partial_path = os.path.join(options.resultsDir,
                                    'results.partial.%d.pkl' % i)
        if use_existing and os.path.exists(partial_path):
            print("Found existing results for model %d" % i)
            # Written with a binary protocol below, so read in binary mode
            with open(partial_path, 'rb') as f:
                (x_inf, ll_train, ll_xv, ll_total) = cPickle.load(f)
        else:
            x0 = copy.deepcopy(best_x)
            ll0 = parallel_compute_ll(client[:], x0, N)
            print("Training LL0: %f" % ll0)

            # Perform inference
            x_inf = parallel_coord_descent(client, N, x0=x0, maxiter=1,
                                           use_hessian=False, use_rop=False)

            ll_train = parallel_compute_ll(client[:], x_inf, N)
            print("Training LL_inf: %f" % ll_train)

            # Compute log lkhd on xv data
            add_data_on_engines(client[:], xv_data)
            ll_xv = parallel_compute_ll(client[:], x_inf, N)
            print("Cross Validation LL: %f" % ll_xv)

            # Compute log lkhd on total dataset
            add_data_on_engines(client[:], data)
            ll_total = parallel_compute_ll(client[:], x_inf, N)
            print("Total LL: %f" % ll_total)

            print("Saving partial results")
            with open(partial_path, 'wb') as f:
                cPickle.dump((x_inf, ll_train, ll_xv, ll_total), f,
                             protocol=-1)

        # Record the scores whether they were loaded or just computed
        train_lls[i] = ll_train
        xv_lls[i] = ll_xv
        total_lls[i] = ll_total

        # Update best model
        if ll_xv > best_xv_ll:
            best_ind = i
            best_xv_ll = ll_xv
            best_x = copy.deepcopy(x_inf)
            best_model = copy.deepcopy(model)

    print("Training the best model (%d) with the full dataset" % best_ind)
    # Set the best hyperparameters
    set_hyperparameters_on_engines(client[:], best_model)
    add_data_on_engines(client[:], data)

    # Fit the best model on the full training data
    best_x = parallel_coord_descent(client, N, x0=best_x, maxiter=1,
                                    use_hessian=False, use_rop=False)

    # Print results summary
    for i in range(len(models)):
        print("Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % (
            i, train_lls[i], xv_lls[i], total_lls[i]))
    print("Best model: %d" % best_ind)
    print("Best Total LL: %f" % parallel_compute_ll(client[:], best_x, N))
    print("True LL: %f" % popn_true.compute_ll(x_true))

    elapsed = time.time() - start_time

    # Save results
    with open(os.path.join(options.resultsDir, 'results.pkl'), 'wb') as f:
        cPickle.dump(best_x, f, protocol=-1)

    # Save runtime
    with open(os.path.join(options.resultsDir, 'runtime.pkl'), 'wb') as f:
        cPickle.dump(elapsed, f, protocol=-1)