def make_run(name):
    run_path = os.path.join(HQ_CACHE_PATH, name)
    if os.path.exists(run_path):
        logger.info(
            "Run already exists at '{}'! Delete that path and re-run "
            "if you want to create a new run with this name.".format(run_path))
        return

    logger.debug("Creating new run '{}' at path {}".format(name, run_path))
    os.makedirs(run_path)

    # Now copy in the template configuration file:
    HQ_ROOT_PATH = os.path.split(hq.__file__)[0]
    tmpl = os.path.join(HQ_ROOT_PATH, 'pkgdata', 'template_config.yml')
    with open(tmpl, 'r') as f:
        template_config = f.read()
    template_config = template_config.replace("name: null", f"name: {name}")

    new_config_path = os.path.join(run_path, 'config.yml')
    with open(new_config_path, 'w') as f:
        f.write(template_config)

    # Now copy template prior:
    prior_path = os.path.join(HQ_ROOT_PATH, 'pkgdata', 'template_prior.py')
    new_prior_path = os.path.join(run_path, 'prior.py')
    with open(prior_path, 'r') as f:
        with open(new_prior_path, 'w') as new_f:
            new_f.write(f.read())

    logger.info(
        f"Created an HQ run at: {run_path}\n"
        f"\tNow edit the configuration file at: {new_config_path}\n"
        f"\tAnd edit the prior specification file at: {new_prior_path}")
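# Usage sketch (hedged): make_run() is defined above, so calling it directly
# demonstrates the scaffolding; the run name below is hypothetical.
if __name__ == '__main__':
    make_run('my-test-run')
    # Creates $HQ_CACHE_PATH/my-test-run/config.yml and
    # $HQ_CACHE_PATH/my-test-run/prior.py, then logs which files to edit next.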
def main(run_name, pool):
    c = Config.from_run_name(run_name)

    apogee_ids = sorted([
        x for x in os.listdir(os.path.join(c.run_path, 'mcmc'))
        if not x.startswith('.')
    ])

    tasks = batch_tasks(len(apogee_ids), pool.size, arr=apogee_ids,
                        args=(c, ))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    sub_tbls = []
    all_samples = {}
    units = None
    for result in tqdm(pool.map(worker, tasks), total=len(tasks)):
        if result is not None:
            sub_tbls.append(result['tbl'])
            all_samples.update(result['samples'])
            units = result['units']  # grab units while result is not None

    # Write the MCMC metadata table
    tbl = vstack(sub_tbls)
    for k in units:
        tbl[k].unit = units[k]
    tbl = QTable(tbl)
    tbl.write(c.metadata_mcmc_path, overwrite=True)

    # Now write out all of the individual samplings:
    with h5py.File(c.mcmc_results_path, 'a') as results_f:
        for apogee_id, samples in all_samples.items():
            if apogee_id in results_f:
                del results_f[apogee_id]

            g = results_f.create_group(apogee_id)
            samples.write(g)
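# Sketch: reading one star's samplings back out of the results file written
# above. JokerSamples.read() mirroring samples.write(g) is an assumption about
# thejoker's API, and the run name and APOGEE_ID are hypothetical.
import h5py
import thejoker as tj

c = Config.from_run_name('my-run')
with h5py.File(c.mcmc_results_path, 'r') as results_f:
    samples = tj.JokerSamples.read(results_f['2M00000002+7417074'])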
def main(run_name, pool, overwrite=False):
    c = Config.from_run_name(run_name)

    if os.path.exists(c.tasks_path) and not overwrite:
        logger.info(f"File {c.tasks_path} already exists: Use -o/--overwrite "
                    "if needed")
        return

    # Load the full allvisit file, but only some columns:
    allstar, _allvisit = c.load_alldata()
    allvisit = Table()
    for k in ['APOGEE_ID', 'JD', 'VHELIO', 'VRELERR', 'SNR']:
        allvisit[k] = _allvisit[k]

    logger.debug("Loading data and preparing tasks...")
    apogee_ids = np.unique(allstar['APOGEE_ID'])
    with h5py.File(c.tasks_path, 'w') as f:
        for apogee_id in tqdm(apogee_ids):
            visits = allvisit[allvisit['APOGEE_ID'] == apogee_id]
            data = get_rvdata(visits)
            g = f.create_group(apogee_id)
            data.to_timeseries().write(g, format='hdf5', serialize_meta=True)

    logger.info('Done preparing tasks: {0} stars in process queue'.format(
        len(apogee_ids)))
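# Sketch: reading a single star's prepared task back out of the tasks file.
# '__astropy_table__' is astropy's default dataset path when a table is
# written into an HDF5 group without an explicit path; the run name and
# APOGEE_ID below are hypothetical.
from astropy.table import QTable

c = Config.from_run_name('my-run')
apogee_id = '2M00000002+7417074'
tbl = QTable.read(c.tasks_path, path=f'{apogee_id}/__astropy_table__')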
def main(run_name, pool, overwrite=False, seed=None, limit=None):
    c = Config.from_run_name(run_name)

    if not os.path.exists(c.prior_cache_file):
        raise IOError(f"Prior cache file {c.prior_cache_file} does not "
                      "exist! Did you run make_prior_cache.py?")

    if not os.path.exists(c.tasks_path):
        raise IOError(f"Tasks file '{c.tasks_path}' does not exist! Did you "
                      "run make_tasks.py?")

    # Make directory for temp. files, one per worker:
    tmpdir = os.path.join(c.run_path, 'thejoker')
    if os.path.exists(tmpdir):
        logger.warning(f"Stale temp. file directory found at {tmpdir}: "
                       "combining files first...")
        tmpdir_combine(tmpdir, c.joker_results_path)

    # Ensure the results file exists:
    logger.debug("Loading past results...")
    with h5py.File(c.joker_results_path, 'a') as f:
        done_apogee_ids = list(f.keys())
    if overwrite:
        done_apogee_ids = list()

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    allstar = allstar[~np.isin(allstar['APOGEE_ID'], done_apogee_ids)]

    # Create a random number generator with the specified seed:
    rnd = np.random.RandomState(seed=seed)
    logger.debug(f"Processing pool has size = {pool.size}")

    apogee_ids = np.unique(allstar['APOGEE_ID'])
    if limit is not None:
        apogee_ids = apogee_ids[:limit]

    if done_apogee_ids:
        logger.info(f"{len(done_apogee_ids)} already completed - "
                    f"{len(apogee_ids)} left to process")

    # Load the prior:
    logger.debug("Creating JokerPrior instance...")
    prior = c.get_prior()

    os.makedirs(tmpdir)
    atexit.register(tmpdir_combine, tmpdir, c.joker_results_path)

    logger.debug("Preparing tasks...")
    if len(apogee_ids) > 10 * pool.size:
        n_tasks = min(16 * pool.size, len(apogee_ids))
    else:
        n_tasks = pool.size

    tasks = batch_tasks(len(apogee_ids), n_tasks, arr=apogee_ids,
                        args=(c, prior, tmpdir, rnd))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    for r in pool.map(worker, tasks, callback=callback):
        pass
def main(run_name, pool):
    c = Config.from_run_name(run_name)

    # Files that must exist before this step can run:
    for path in [c.joker_results_path, c.tasks_path]:
        if not os.path.exists(path):
            raise IOError(f"File {path} does not exist! Did you run the "
                          "preceding pipeline steps?")

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    apogee_ids = np.unique(allstar['APOGEE_ID'])

    tasks = batch_tasks(len(apogee_ids), pool.size, arr=apogee_ids,
                        args=(c, ))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    sub_tbls = []
    for tbl in tqdm(pool.map(worker, tasks), total=len(tasks)):
        if tbl is not None:
            sub_tbls.append(tbl)

    tbl = vstack(sub_tbls)

    # Load results from running run_fit_constant.py:
    constant_path = os.path.join(c.run_path, 'constant.fits')
    constant_tbl = QTable.read(constant_path)
    tbl = join(tbl, constant_tbl, keys='APOGEE_ID')

    tbl.write(c.metadata_joker_path, overwrite=True)
def main(run_name, overwrite=False):
    c = Config.from_run_name(run_name)

    if os.path.exists(c.metadata_path) and not overwrite:
        logger.info(f"Metadata file already exists at {c.metadata_path}: "
                    "Use -o/--overwrite if needed")
        return

    meta = Table.read(c.metadata_joker_path)
    mcmc_meta = Table.read(c.metadata_mcmc_path)

    final_colnames = [
        'APOGEE_ID', 'n_visits',
        'MAP_P', 'MAP_P_err',
        'MAP_e', 'MAP_e_err',
        'MAP_omega', 'MAP_omega_err',
        'MAP_M0', 'MAP_M0_err',
        'MAP_K', 'MAP_K_err',
        'MAP_v0', 'MAP_v0_err',
        'MAP_s', 'MAP_s_err',
        't0_bmjd', 'baseline',
        'MAP_ln_likelihood', 'MAP_ln_prior',
        'max_unmarginalized_ln_likelihood',
        'max_phase_gap', 'periods_spanned', 'phase_coverage',
        'phase_coverage_per_period', 'unimodal',
        'joker_completed', 'mcmc_completed', 'mcmc_success',
        'gelman_rubin_max',
        'constant_ln_likelihood', 'robust_constant_ln_likelihood',
        'constant_ln_evidence', 'kepler_ln_evidence'
    ]

    master = join(meta, mcmc_meta, keys='APOGEE_ID', join_type='left',
                  uniq_col_name="{table_name}{col_name}",
                  table_names=["", "mcmc_"])
    master['mcmc_completed'] = master['mcmc_completed'].filled(False)
    master['mcmc_success'] = master['mcmc_success'].filled(False)
    master['joker_completed'] = master['joker_completed'].filled(False)

    # Where MCMC succeeded, overwrite the Joker values with the MCMC values:
    for colname in mcmc_meta.colnames:
        if colname == 'APOGEE_ID':
            continue

        mcmc_colname = f'mcmc_{colname}'
        if mcmc_colname not in master.colnames:
            mcmc_colname = colname

        logger.debug(f"Filling {colname} with {mcmc_colname}")
        master[colname][master['mcmc_success']] = \
            master[mcmc_colname][master['mcmc_success']]

    master = master[final_colnames]
    master = QTable(master)

    for col in master.colnames:
        if col.endswith('_err'):
            master[col][~master['mcmc_completed']] = np.nan

    master.write(c.metadata_path, overwrite=True)
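# Sketch: consuming the unified metadata table written above. The column names
# come from final_colnames; the run name and selection cuts are illustrative
# only, and MAP_P is assumed to carry time units once read into a QTable.
import astropy.units as u
from astropy.table import QTable

metadata = QTable.read(Config.from_run_name('my-run').metadata_path)
converged = metadata[metadata['mcmc_success']]
long_period = converged[converged['MAP_P'] > 100 * u.day]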
def main(run_name, pool, overwrite=False, seed=None):
    c = Config.from_run_name(run_name)

    # Get paths to files needed to run:
    results_path = os.path.join(c.run_path, 'constant.fits')
    if os.path.exists(results_path) and not overwrite:
        logger.info("Results file {} already exists. Use --overwrite "
                    "if needed".format(results_path))
        return

    if not os.path.exists(c.tasks_path):
        raise IOError(f"Tasks file '{c.tasks_path}' does not exist! Did you "
                      "run make_tasks.py?")

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    apogee_ids = np.unique(allstar['APOGEE_ID'])

    # Make batches of APOGEE_IDs:
    logger.debug("Preparing tasks...")
    if len(apogee_ids) > 10 * pool.size:
        n_tasks = min(16 * pool.size, len(apogee_ids))
    else:
        n_tasks = pool.size

    tasks = batch_tasks(len(apogee_ids), n_tasks, arr=apogee_ids, args=(c, ))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    results = []
    for r in tqdm(pool.map(worker, tasks), total=len(tasks)):
        results.append(r)

    # Flatten the per-chunk result lists into a single table:
    tbl = Table([item for sublist in results for item in sublist])
    tbl.write(results_path, overwrite=True)
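# Hedged sketch of the worker contract assumed by pool.map(worker, tasks)
# above: batch_tasks() hands each worker a chunk of the apogee_ids array plus
# the extra args tuple. The exact task layout and the returned row contents
# are assumptions for illustration, not hq's actual implementation.
import numpy as np

def worker(task):
    (i1, i2), apogee_ids, c = task  # assumed chunk structure from batch_tasks

    rows = []
    for apogee_id in apogee_ids:
        # ...fit a constant-RV model to this star's visit velocities here...
        rows.append({'APOGEE_ID': apogee_id,
                     'constant_ln_likelihood': np.nan})  # placeholder value
    return rows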
def main(run_name, pool, overwrite=False, seed=None):
    run_path = join(HQ_CACHE_PATH, run_name)
    with open(join(run_path, 'config.yml'), 'r') as f:
        config = yaml.safe_load(f.read())
    logger.debug("Config file loaded")

    # Get paths to files needed to run:
    params = config_to_jokerparams(config)
    prior_cache_path = config_to_prior_cache(config, params)
    results_path = join(HQ_CACHE_PATH, run_name, 'thejoker-injected.hdf5')
    logger.debug(f"Caching to {results_path}")

    if not exists(prior_cache_path):
        raise IOError("Prior cache file '{0}' does not exist! Did you run "
                      "make_prior_cache.py?".format(prior_cache_path))

    with h5py.File(results_path, 'a') as f:  # ensure the file exists
        pass

    # Get data files out of config file:
    logger.debug("Loading APOGEE data...")
    allstar, allvisit = config_to_alldata(config)
    logger.debug("Loading APOGEE data...finished")

    # Read metadata file:
    logger.debug("Loading metadata from previous HQ run...")
    meta_file = join(HQ_CACHE_PATH, run_name, 'metadata-master.fits')
    meta = at.Table.read(meta_file)

    logger.debug("Joining with APOGEE data...")
    master = at.join(meta, allstar, keys='APOGEE_ID')
    logger.debug("Joining with APOGEE data...finished")

    # n_control = len(allstar) // 10
    n_control = 4  # TODO: remove this when running in production
    idx = np.random.choice(len(master), size=n_control, replace=False)
    master = master[idx]
    allvisit = allvisit[np.isin(allvisit['APOGEE_ID'],
                                master['APOGEE_ID'])]

    # Create TheJoker sampler instance with the specified random seed and pool
    rnd = np.random.RandomState(seed=seed)
    logger.debug("Creating TheJoker instance with {0}".format(rnd))
    joker = TheJoker(params, random_state=rnd)
    logger.debug("Processing pool has size = {0}".format(pool.size))
    logger.debug("{0} stars left to process for run '{1}'".format(
        len(master), run_name))

    # Generate companion orbital parameters:
    companion_samples = get_orbit_samples(n_control, joker)

    tasks = []
    logger.debug("Loading data and preparing tasks...")
    for i, star in enumerate(tqdm(master)):
        visits = allvisit[allvisit['APOGEE_ID'] == star['APOGEE_ID']]
        data = get_rvdata(visits)

        # Get MAP orbit from metadata file:
        orbit = extract_MAP_orbit(star)
        logger.debug(f"MAP orbit elements: {orbit.elements}")

        # Flatten the data by subtracting the MAP orbit:
        flat_rv = data.rv - orbit.radial_velocity(data.t)
        base_rv = np.median(data.rv) + flat_rv

        # Inject a companion signal:
        orbit = companion_samples.get_orbit(i)
        new_rv = base_rv + orbit.radial_velocity(data.t)

        # Observe the companion signal:
        new_rv = rnd.normal(new_rv.value, data.stddev.value) * data.rv.unit
        new_data = RVData(rv=new_rv, t=data.t, stddev=data.stddev)

        tasks.append(
            [joker, star['APOGEE_ID'], new_data, config, results_path])

    logger.info('Done preparing tasks: {0} stars in process queue'.format(
        len(tasks)))

    for r in tqdm(pool.starmap(worker, tasks, callback=callback),
                  total=len(tasks)):
        pass
def main(c, prior, metadata_row, overwrite=False):
    mcmc_cache_path = os.path.join(c.run_path, 'mcmc')
    os.makedirs(mcmc_cache_path, exist_ok=True)

    apogee_id = metadata_row['APOGEE_ID']

    this_cache_path = os.path.join(mcmc_cache_path, apogee_id)
    if os.path.exists(this_cache_path) and not overwrite:
        logger.info(f"{apogee_id} already done!")  # assume it's already done
        return

    # Set up The Joker:
    joker = tj.TheJoker(prior)

    # Load the data:
    logger.debug(f"{apogee_id}: Loading all data")
    allstar, allvisit = c.load_alldata()
    allstar = allstar[np.isin(allstar['APOGEE_ID'].astype(str), apogee_id)]
    allvisit = allvisit[np.isin(allvisit['APOGEE_ID'].astype(str),
                                allstar['APOGEE_ID'].astype(str))]
    visits = allvisit[allvisit['APOGEE_ID'] == apogee_id]
    data = get_rvdata(visits)

    t0 = time.time()

    # Read MAP sample:
    MAP_sample = extract_MAP_sample(metadata_row)
    logger.log(1, f"{apogee_id}: MAP sample loaded")

    # Run MCMC:
    with joker.prior.model as model:
        logger.log(1, f"{apogee_id}: Setting up MCMC...")
        mcmc_init = joker.setup_mcmc(data, MAP_sample)
        logger.log(1, f"{apogee_id}: ...setup complete")

        if 'ln_prior' not in model.named_vars:
            ln_prior_var = None
            for k in joker.prior._nonlinear_equiv_units:
                var = model.named_vars[k]
                try:
                    if ln_prior_var is None:
                        ln_prior_var = var.distribution.logp(var)
                    else:
                        ln_prior_var = (ln_prior_var
                                        + var.distribution.logp(var))
                except Exception as e:
                    logger.warning("Cannot auto-compute log-prior value for "
                                   f"parameter {var}: {e}")
                    continue

            pm.Deterministic('ln_prior', ln_prior_var)
            logger.log(1, f"{apogee_id}: setting up ln_prior in pymc3 model")

        if 'logp' not in model.named_vars:
            pm.Deterministic('logp', model.logpt)
            logger.log(1, f"{apogee_id}: setting up logp in pymc3 model")

        logger.debug(f"{apogee_id}: Starting MCMC sampling")
        trace = pm.sample(start=mcmc_init, chains=4, cores=1,
                          step=xo.get_dense_nuts_step(target_accept=0.95),
                          tune=c.tune, draws=c.draws)

    pm.save_trace(trace, directory=this_cache_path, overwrite=True)
    logger.debug(
        "{apogee_id}: Finished MCMC sampling ({time:.2f} seconds)".format(
            apogee_id=apogee_id, time=time.time() - t0))
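# Sketch: loading the cached trace back for analysis. pm.load_trace is the
# pymc3 3.x counterpart of the pm.save_trace call above and needs the model
# context the trace was sampled under, so we rebuild it from the same prior;
# treating joker.prior.model as that context is an assumption here, and
# `prior` and `this_cache_path` are assumed to match the values used in main().
import pymc3 as pm
import thejoker as tj

joker = tj.TheJoker(prior)  # same prior as used for sampling above
with joker.prior.model:
    trace = pm.load_trace(this_cache_path)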