def collect_n_samples(RIPL, n, mh_iter, ids, max_runtime=600, verbose=True, callback=None, callback_kwargs={}): '''Tries to collect n samples before timeout''' samples = np.zeros((len(ids), 0)) experiment_start = time.clock() start = time.clock() iteration = 0 for unused in range(n): if time.clock() - experiment_start < max_runtime: # Sample iteration += 1 RIPL.infer(mh_iter) # Record sample samples = np.column_stack([samples, [RIPL.report_value(an_id) for an_id in ids]]) if verbose: print 'Iteration %d : Score %s' % (iteration, callback(samples=samples, **callback_kwargs)) else: break finish = time.clock() time_per_sample = max(finish - start, 0.01) / iteration if verbose: print '%d iterations : %f seconds : %f seconds per iteration' % (iteration, finish - start, time_per_sample) # Compute average ess start = time.clock() ess = np.mean([(samples.shape[1]) / act.batch_means(samples[i,:]) for i in range(samples.shape[0])]) if np.isnan(ess) or np.isinf(ess): ess = 1 finish = time.clock() time_to_compute_ess = max(finish - start, 0.01) if verbose: print 'ESS = %f : %f seconds' % (ess, time_to_compute_ess) else: #print 'ESS = %3.0f' % ess pass # Finished - return samples and ...TODO return {'samples' : samples, 'ess' : ess, 'runtime' : time.clock() - start}
def collect_n_es(RIPL, n, mh_iter, ids, min_samples=None, max_runtime=600, verbose=True): '''Tries to collect enough samples of ids to have an ess > n''' #### TODO - Note the min time of 0.01 and the 1 percent computation heuristic of 100 ess = 0 samples = np.zeros((len(ids), 0)) trial_iterations = max(n, min_samples) experiment_start = time.clock() # While samples not large enough and not timed out while (np.floor(ess) + 1 < n) and (time.clock() - experiment_start < max_runtime): # Perform some iterations, recording how fast the sampler is running start = time.clock() iteration = 0 for unused in range(trial_iterations): if time.clock() - experiment_start < max_runtime: # Sample iteration += 1 RIPL.infer(mh_iter) # Record sample samples = np.column_stack([samples, [RIPL.report_value(an_id) for an_id in ids]]) finish = time.clock() time_per_sample = max(finish - start, 0.01) / iteration if verbose: print '%d iterations : %f seconds : %f seconds per iteration' % (iteration, finish - start, time_per_sample) # Compute average ess start = time.clock() ess = np.mean([(samples.shape[1]) / act.batch_means(samples[i,:]) for i in range(samples.shape[0])]) if np.isnan(ess) or np.isinf(ess): ess = 1 finish = time.clock() time_to_compute_ess = max(finish - start, 0.01) if verbose: print 'ESS = %f : %f seconds' % (ess, time_to_compute_ess) else: #print 'ESS = %3.0f' % ess pass # Decide how long to sample for samples_per_effective_sample = samples.shape[1] / ess estimated_samples_required = samples_per_effective_sample * (n - ess) if np.isnan(estimated_samples_required) or np.isinf(estimated_samples_required): estimated_samples_required = 1 else: estimated_samples_required = int(np.floor(estimated_samples_required)) if verbose: print 'Estimated samples required %d' % estimated_samples_required balance_computation_samples = int(np.floor(100 * time_to_compute_ess / time_per_sample)) if verbose: print '1%% samples %d' % balance_computation_samples trial_iterations = 
min(estimated_samples_required, balance_computation_samples) if verbose: print 'Sampling for %d' % trial_iterations # Finished - return samples and ...TODO return {'samples' : samples, 'ess' : ess}
def IRM_HighSchool(fold=1,burn=50,n_samples=100,mh_iter=100,verbose=True):
    '''Run one cross-validation fold of an infinite relational model (IRM)
    over high-school friendship data using the Venture engine.

    fold      : which of the 5 data folds to load
    burn      : number of burn-in sampling rounds
    n_samples : number of recorded sampling rounds after burn-in
    mh_iter   : MH iterations per sampling round

    Returns a dict with 'truth' (held-out link labels), 'predictions'
    (posterior mean link probabilities), 'samples' (per-round predicted
    probabilities, one column per round) and 'AUC'.
    '''
    # Create RIPL and clear any previous session
    MyRIPL = venture_engine
    MyRIPL.clear()
    # Load high school data set; train_*/test_* triples are (i, j, value)
    # edge observations for this fold.
    data = scipy.io.loadmat("../data/hs/hs_%dof5.mat" % fold, squeeze_me=True)
    observed = list(zip(data['train_i'].flat, data['train_j'].flat, data['train_v'].flat))
    missing = list(zip(data['test_i'].flat, data['test_j'].flat, data['test_v'].flat))
    # Convenience functions (pairwise min/max in the Venture language)
    MyRIPL.assume('min-2', parse('(lambda (x y) if (< x y) x y)'))
    MyRIPL.assume('max-2', parse('(lambda (x y) if (> x y) x y)'))
    # Instantiate CRP: concentration drawn uniformly, then a CRP sampler
    MyRIPL.assume('alpha', parse('(uniform-continuous 0.0001 2.0)'))
    MyRIPL.assume('cluster-crp', parse('(CRP/make alpha)'))
    # Create class assignment lookup function (memoized per node)
    MyRIPL.assume('node->class', parse('(mem (lambda (nodes) (cluster-crp)))'))
    # Create class interaction probability lookup function (Beta(0.5, 0.5)
    # prior per unordered class pair; symmetry enforced by ordering the pair)
    MyRIPL.assume('classes->parameters', parse('(mem (lambda (class1 class2) (beta 0.5 0.5)))'))
    MyRIPL.assume('classes->parameters-symmetric', parse('(lambda (class1 class2) (classes->parameters (min-2 class1 class2) (max-2 class1 class2)))'))
    # Create relation evaluation function: link probability, then a Bernoulli draw
    MyRIPL.assume('p-friends', parse('(lambda (node1 node2) (classes->parameters-symmetric (node->class node1) (node->class node2)))'))
    MyRIPL.assume('friends', parse('(lambda (node1 node2) (bernoulli (p-friends node1 node2)))'))
    # Tell Venture about observations
    for (i,j,v) in observed:
        if verbose:
            print 'Observing (%3d, %3d) = %d' % (i, j, v)
        if v:
            MyRIPL.observe(parse('(friends %d %d)' % (i, j)), 'true')
        else:
            MyRIPL.observe(parse('(friends %d %d)' % (i, j)), 'false')
    # Tell Venture that we want to predict P(unobserved link)
    truth = []
    missing_links = []
    for (i,j,v) in missing:
        if verbose:
            print 'Predicting (%3d, %3d) = %d' % (i, j, v)
        truth.append(int(v))
        # NOTE(review): predict() is assumed to return a pair whose first
        # element is the id used by report_value() - confirm against the
        # Venture API (the loops below unpack it as (missing_link, _)).
        missing_links.append(MyRIPL.predict(parse('(p-friends %d %d)' % (i, j))))
    # Perform inference: burn-in rounds, printing the held-out AUC each round
    for sample_number in range(burn):
        MyRIPL.infer(mh_iter)
        print 'Burn in sample %4d' % (sample_number + 1)
        roc_data = []
        for (true_link, (missing_link, _)) in zip(truth, missing_links):
            roc_data.append((true_link, MyRIPL.report_value(missing_link)))
        print ROCData(roc_data).auc()
    # Recorded sampling rounds: stack one column of predicted probabilities
    # per round and track a running ESS estimate for the first held-out link.
    samples = np.zeros((len(truth), 0))
    #ess_samples = []
    for sample_number in range(n_samples):
        MyRIPL.infer(mh_iter)
        print 'Sample %4d' % (sample_number + 1)
        roc_data = []
        for (true_link, (missing_link, _)) in zip(truth, missing_links):
            roc_data.append((true_link, MyRIPL.report_value(missing_link)))
        samples = np.column_stack([samples, [roc_datum[1] for roc_datum in roc_data]])
        ess = (sample_number + 1) / act.batch_means(samples[0,:])
        print ess
        print ROCData(roc_data).auc()
    print 'Fold complete'
    # Posterior mean prediction per held-out link, and the final AUC
    predictions = list(samples.mean(axis=1))
    roc_data = []
    for (true_link, prediction) in zip(truth, predictions):
        roc_data.append((true_link, prediction))
    AUC = ROCData(roc_data).auc()
    print 'AUC = %f' % AUC
    return {'truth' : truth, 'predictions' : predictions, 'samples' : samples, 'AUC' : AUC}
def collect_n_samples_before_timeout(RIPL, n, initial_mh_iter, ids, max_runtime=600, verbose=True): '''Alters the number of intermediate mh_iter to n samples in max_runtime''' #### FIXME - Dangerous - thinning depends on the state! - need to do a trial run! #### Maybe it does need linear regression to be awesome? #### TODO - start here samples = np.zeros((len(ids), 0)) experiment_start = time.clock() start = time.clock() iteration = 0 mh_iter = initial_mh_iter mh_sum = 0 # For a maximum of n iterations for unused in range(n): if time.clock() - experiment_start < max_runtime: # Sample iteration += 1 if (iteration > 10) and (iteration < n): # A few samples have been collected, adjust mh_iter to finish on time #### TODO - a filtering approach would be more adaptive now = time.clock() time_elapsed = now - start time_remaining = max_runtime - time_elapsed time_per_mh_iter = time_elapsed / mh_sum samples_remaining = n - iteration time_remaining_per_sample = time_remaining / samples_remaining mh_iter = max(1, int(round(time_remaining_per_sample / time_per_mh_iter))) if verbose: print 'Time elapsed ', time_elapsed print 'Time remaining', time_remaining print 'Time per mh ', time_per_mh_iter print 'Samples to go ', samples_remaining print 'Time per samp.', time_remaining print 'Remaining per.', time_remaining_per_sample print 'mh_iter', mh_iter RIPL.infer(mh_iter) mh_sum += mh_iter if verbose: print 'Iteration %d' % iteration # Record sample samples = np.column_stack([samples, [RIPL.report_value(an_id) for an_id in ids]]) else: break finish = time.clock() time_per_sample = max(finish - start, 0.01) / iteration if verbose: print '%d iterations : %f seconds : %f seconds per iteration' % (iteration, finish - start, time_per_sample) # Compute average ess start = time.clock() ess = np.mean([(samples.shape[1]) / act.batch_means(samples[i,:]) for i in range(samples.shape[0])]) if np.isnan(ess) or np.isinf(ess): ess = 1 finish = time.clock() time_to_compute_ess = max(finish - 
start, 0.01) if verbose: print 'ESS = %f : %f seconds' % (ess, time_to_compute_ess) else: #print 'ESS = %3.0f' % ess pass # Finished - return samples and ...TODO max_mem = memory() return {'samples' : samples, 'ess' : ess, 'max_memory' : max_mem}