# NOTE: these excerpts assume the surrounding scripts' module-level imports
# (glob, os, shutil, socket, sys, time, h5py, numpy as np, pymc3 as pm,
# exoplanet as xo, thejoker as tj, astropy.table.QTable) and project helpers
# (logger, joker_logger, read_batch, extract_MAP_sample, get_rvdata).

def tmpdir_combine(tmpdir, results_filename):
    logger.debug(f"Combining results into {results_filename}")

    tmp_files = sorted(glob.glob(os.path.join(tmpdir, '*.hdf5')))
    with h5py.File(results_filename, 'a') as all_f:
        for tmp_file in tmp_files:
            logger.log(1, f"Processing {tmp_file}")
            with h5py.File(tmp_file, 'r') as f:
                for key in f:
                    # Overwrite any existing group for this star:
                    if key in all_f:
                        del all_f[key]
                    f.copy(key, all_f)
            os.remove(tmp_file)

    shutil.rmtree(tmpdir)
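

def _demo_tmpdir_combine():
    """Illustration only, not part of the original pipeline: a minimal sketch of
    the file layout tmpdir_combine() expects. Two fake per-worker HDF5 files,
    each holding one group per star (mimicking what worker() below writes), are
    created in a scratch directory and then merged into a single results file.
    All file and group names here are made up for the example."""
    import tempfile

    demo_tmpdir = tempfile.mkdtemp()
    for i in range(2):
        path = os.path.join(demo_tmpdir, f'worker-{i}.hdf5')
        with h5py.File(path, 'w') as f:
            f.create_group(f'2M00000000+000000{i}')  # placeholder APOGEE_ID

    # Merges both worker files, deletes them, and removes the scratch dir:
    tmpdir_combine(demo_tmpdir, 'demo-combined-results.hdf5')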


def worker(task):
    apogee_ids, worker_id, c, results_path, prior, tmpdir, global_rnd = task

    # This worker's results:
    results_filename = os.path.join(tmpdir, f'worker-{worker_id}.hdf5')

    metadata = QTable.read(c.metadata_path)

    rnd = global_rnd.seed(worker_id)
    logger.log(1, f"Worker {worker_id}: Creating TheJoker instance with {rnd}")

    prior = c.get_prior()
    joker = tj.TheJoker(prior, random_state=rnd)

    logger.debug(f"Worker {worker_id} on node {socket.gethostname()}: "
                 f"{len(apogee_ids)} stars left to process")

    # Initialize to get packed column order:
    logger.log(1, f"Worker {worker_id}: Loading prior samples from cache "
                  f"{c.prior_cache_file}")
    with h5py.File(c.tasks_path, 'r') as tasks_f:
        data = tj.RVData.from_timeseries(tasks_f[apogee_ids[0]])
    joker_helper = joker._make_joker_helper(data)

    _slice = slice(0, c.max_prior_samples, 1)
    batch = read_batch(c.prior_cache_file, joker_helper.packed_order,
                       slice_or_idx=_slice,
                       units=joker_helper.internal_units)
    ln_prior = read_batch(c.prior_cache_file, ['ln_prior'], _slice)[:, 0]
    logger.log(1, f"Worker {worker_id}: Loaded {len(batch)} prior samples")

    for apogee_id in apogee_ids:
        if apogee_id not in metadata['APOGEE_ID']:
            logger.debug(f"{apogee_id} not found in metadata file!")
            continue

        with h5py.File(c.tasks_path, 'r') as tasks_f:
            data = tj.RVData.from_timeseries(tasks_f[apogee_id])

        # Subtract out MAP sample, run on residual:
        metadata_row = metadata[metadata['APOGEE_ID'] == apogee_id]
        MAP_sample = extract_MAP_sample(metadata_row)
        orbit = MAP_sample.get_orbit(0)
        new_rv = data.rv - orbit.radial_velocity(data.t)
        data = tj.RVData(t=data.t, rv=new_rv, rv_err=data.rv_err)

        logger.debug(f"Worker {worker_id}: Running {apogee_id} "
                     f"({len(data)} visits)")

        t0 = time.time()
        try:
            samples = joker.iterative_rejection_sample(
                data=data,
                n_requested_samples=c.requested_samples_per_star,
                prior_samples=batch,
                init_batch_size=250_000,
                growth_factor=32,
                randomize_prior_order=c.randomize_prior_order,
                return_logprobs=ln_prior,
                in_memory=True)
        except Exception as e:
            logger.warning(f"\t Failed sampling for star {apogee_id} "
                           f"\n Error: {e}")
            continue

        dt = time.time() - t0
        logger.debug(f"Worker {worker_id}: {apogee_id} ({len(data)} visits): "
                     f"done sampling - {len(samples)} raw samples returned "
                     f"({dt:.2f} seconds)")

        # Ensure only positive K values
        samples.wrap_K()

        with h5py.File(results_filename, 'a') as results_f:
            if apogee_id in results_f:
                del results_f[apogee_id]
            g = results_f.create_group(apogee_id)
            samples.write(g)

    result = {'tmp_filename': results_filename,
              'joker_results_path': results_path,
              'hostname': socket.gethostname(),
              'worker_id': worker_id}
    return result
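

def _list_finished_stars(results_filename):
    """Illustration only, not part of the original pipeline: each worker's
    temporary HDF5 file contains one top-level group per APOGEE_ID it has
    finished (written by worker() above), so listing the group names shows
    that worker's progress."""
    with h5py.File(results_filename, 'r') as f:
        return sorted(f.keys())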


    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    for r in pool.map(worker, tasks, callback=callback):
        pass


if __name__ == '__main__':
    from threadpoolctl import threadpool_limits
    from hq.script_helpers import get_parser

    # Define parser object
    parser = get_parser(description='Run The Joker on APOGEE data',
                        loggers=[logger, joker_logger])

    parser.add_argument("-s", "--seed", dest="seed", default=None, type=int,
                        help="Random number seed")

    args = parser.parse_args()

    seed = args.seed
    if seed is None:
        seed = np.random.randint(2**32 - 1)
        logger.log(1, f"No random number seed specified, so using seed: {seed}")

    with threadpool_limits(limits=1, user_api='blas'):
        with args.Pool(**args.Pool_kwargs) as pool:
            main(run_name=args.run_name, pool=pool, overwrite=args.overwrite,
                 seed=seed)  # pass the resolved seed, not args.seed

    sys.exit(0)


def main(c, prior, metadata_row, overwrite=False):
    mcmc_cache_path = os.path.join(c.run_path, 'mcmc')
    os.makedirs(mcmc_cache_path, exist_ok=True)

    apogee_id = metadata_row['APOGEE_ID']

    this_cache_path = os.path.join(mcmc_cache_path, apogee_id)
    if os.path.exists(this_cache_path) and not overwrite:
        # Assume it's already done
        logger.info(f"{apogee_id} already done!")
        return

    # Set up The Joker:
    joker = tj.TheJoker(prior)

    # Load the data:
    logger.debug(f"{apogee_id}: Loading all data")
    allstar, allvisit = c.load_alldata()
    allstar = allstar[np.isin(allstar['APOGEE_ID'].astype(str), apogee_id)]
    allvisit = allvisit[np.isin(allvisit['APOGEE_ID'].astype(str),
                                allstar['APOGEE_ID'].astype(str))]
    visits = allvisit[allvisit['APOGEE_ID'] == apogee_id]
    data = get_rvdata(visits)

    t0 = time.time()

    # Read MAP sample:
    MAP_sample = extract_MAP_sample(metadata_row)
    logger.log(1, f"{apogee_id}: MAP sample loaded")

    # Run MCMC:
    with joker.prior.model as model:
        logger.log(1, f"{apogee_id}: Setting up MCMC...")
        mcmc_init = joker.setup_mcmc(data, MAP_sample)
        logger.log(1, f"{apogee_id}: ...setup complete")

        if 'ln_prior' not in model.named_vars:
            ln_prior_var = None
            for k in joker.prior._nonlinear_equiv_units:
                var = model.named_vars[k]
                try:
                    if ln_prior_var is None:
                        ln_prior_var = var.distribution.logp(var)
                    else:
                        ln_prior_var = ln_prior_var + var.distribution.logp(var)
                except Exception as e:
                    logger.warning("Cannot auto-compute log-prior value for "
                                   f"parameter {var}.")
                    print(e)
                    continue

            pm.Deterministic('ln_prior', ln_prior_var)
            logger.log(1, f"{apogee_id}: setting up ln_prior in pymc3 model")

        if 'logp' not in model.named_vars:
            pm.Deterministic('logp', model.logpt)
            logger.log(1, f"{apogee_id}: setting up logp in pymc3 model")

        logger.debug(f"{apogee_id}: Starting MCMC sampling")
        trace = pm.sample(start=mcmc_init, chains=4, cores=1,
                          step=xo.get_dense_nuts_step(target_accept=0.95),
                          tune=c.tune, draws=c.draws)

    pm.save_trace(trace, directory=this_cache_path, overwrite=True)
    logger.debug(f"{apogee_id}: Finished MCMC sampling "
                 f"({time.time() - t0:.2f} seconds)")
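

def _run_one_star(c, prior, apogee_id, overwrite=False):
    """Hedged sketch, not part of the original script: how a driver might call
    main() above for a single star, pulling its row from the same metadata
    table that the Joker stage produced (the table worker() reads from
    c.metadata_path). Whether extract_MAP_sample() expects a Row or a
    length-one table is an assumption here."""
    metadata = QTable.read(c.metadata_path)
    metadata_row = metadata[metadata['APOGEE_ID'] == apogee_id][0]
    main(c, prior, metadata_row, overwrite=overwrite)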