Example #1
def make_run(name):
    run_path = os.path.join(HQ_CACHE_PATH, name)

    if os.path.exists(run_path):
        logger.info(
            f"Run already exists at '{run_path}'! Delete that path and "
            "re-run if you want to create a new run with this name.")
        return

    logger.debug("Creating new run '{}' at path {}".format(name, run_path))
    os.makedirs(run_path)

    # Now copy in the template configuration file:
    HQ_ROOT_PATH = os.path.split(hq.__file__)[0]
    tmpl = os.path.join(HQ_ROOT_PATH, 'pkgdata', 'template_config.yml')
    with open(tmpl, 'r') as f:
        template_config = f.read()

    template_config = template_config.replace("name: null", f"name: {name}")

    new_config_path = os.path.join(run_path, 'config.yml')
    with open(new_config_path, 'w') as f:
        f.write(template_config)

    # Now copy template prior:
    prior_path = os.path.join(HQ_ROOT_PATH, 'pkgdata', 'template_prior.py')
    new_prior_path = os.path.join(run_path, 'prior.py')
    with open(prior_path, 'r') as f:
        with open(new_prior_path, 'w') as new_f:
            new_f.write(f.read())

    logger.info(
        f"Created an HQ run at: {run_path}\n"
        f"\tNow edit the configuration file at: {new_config_path}\n"
        f"\tAnd edit the prior specification file at: {new_prior_path}")
Example #2
def main(run_name, pool):
    c = Config.from_run_name(run_name)

    apogee_ids = sorted([
        x for x in os.listdir(os.path.join(c.run_path, 'mcmc'))
        if not x.startswith('.')
    ])

    tasks = batch_tasks(len(apogee_ids), pool.size, arr=apogee_ids, args=(c, ))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    sub_tbls = []
    all_samples = {}
    units = None
    for result in tqdm(pool.map(worker, tasks), total=len(tasks)):
        if result is not None:
            sub_tbls.append(result['tbl'])
            all_samples.update(result['samples'])
            units = result['units']  # every worker returns the same mapping

    # Write the MCMC metadata table
    tbl = vstack(sub_tbls)
    for k, unit in units.items():
        tbl[k].unit = unit
    tbl = QTable(tbl)
    tbl.write(c.metadata_mcmc_path, overwrite=True)

    # Now write out all of the individual samplings:
    with h5py.File(c.mcmc_results_path, 'a') as results_f:
        for apogee_id, samples in all_samples.items():
            if apogee_id in results_f:
                del results_f[apogee_id]
            g = results_f.create_group(apogee_id)
            samples.write(g)
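main() relies on an implicit contract with worker: each call returns either
None or a dict carrying a metadata sub-table, the per-star samplings, and the
column units. Here is an illustrative skeleton of that contract; the task
unpacking and the placeholder values are assumptions, not the original
implementation.

import astropy.units as u
from astropy.table import Table

# Illustrative worker skeleton satisfying the result contract consumed
# above. The task layout (ID slice first, config last) is an assumption.
def worker(task):
    apogee_ids, c = task[0], task[-1]
    rows = []
    samples = {}
    for apogee_id in apogee_ids:
        # ... load and summarize the MCMC chains for this star ...
        rows.append({'APOGEE_ID': apogee_id, 'MAP_P': 123.4})
        samples[apogee_id] = None  # placeholder for a samples object
    if not rows:
        return None
    return {'tbl': Table(rows),         # stacked into the metadata table
            'samples': samples,         # written to the HDF5 results file
            'units': {'MAP_P': u.day}}  # attached to the metadata columns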
Example #3
def main(run_name, pool, overwrite=False):
    c = Config.from_run_name(run_name)

    if os.path.exists(c.tasks_path) and not overwrite:
        logger.info(
            f"File {c.tasks_path} already exists: Use -o/--overwrite if"
            " needed")
        return

    # Load the full allvisit file, but only some columns:
    allstar, _allvisit = c.load_alldata()
    allvisit = Table()
    for k in ['APOGEE_ID', 'JD', 'VHELIO', 'VRELERR', 'SNR']:
        allvisit[k] = _allvisit[k]

    logger.debug("Loading data and preparing tasks...")
    apogee_ids = np.unique(allstar['APOGEE_ID'])
    with h5py.File(c.tasks_path, 'w') as f:
        for apogee_id in tqdm(apogee_ids):
            visits = allvisit[allvisit['APOGEE_ID'] == apogee_id]
            data = get_rvdata(visits)

            g = f.create_group(apogee_id)
            data.to_timeseries().write(g, format='hdf5', serialize_meta=True)

    logger.info(f'Done preparing tasks: {len(apogee_ids)} stars in process '
                'queue')
Example #4
def main(run_name, pool, overwrite=False, seed=None, limit=None):
    c = Config.from_run_name(run_name)

    if not os.path.exists(c.prior_cache_file):
        raise IOError(f"Prior cache file {c.prior_cache_file} does not exist! "
                      "Did you run make_prior_cache.py?")

    if not os.path.exists(c.tasks_path):
        raise IOError(f"Tasks file '{c.tasks_path}' does not exist! Did you "
                      "run make_tasks.py?")

    # Make directory for temp. files, one per worker:
    tmpdir = os.path.join(c.run_path, 'thejoker')
    if os.path.exists(tmpdir):
        logger.warning(f"Stale temp. file directory found at {tmpdir}: "
                       "combining files first...")
        tmpdir_combine(tmpdir, c.joker_results_path)

    # ensure the results file exists
    logger.debug("Loading past results...")
    with h5py.File(c.joker_results_path, 'a') as f:
        done_apogee_ids = list(f.keys())
    if overwrite:
        done_apogee_ids = list()

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    allstar = allstar[~np.isin(allstar['APOGEE_ID'], done_apogee_ids)]

    # Create TheJoker sampler instance with the specified random seed and pool
    rnd = np.random.RandomState(seed=seed)
    logger.debug(f"Processing pool has size = {pool.size}")

    apogee_ids = np.unique(allstar['APOGEE_ID'])
    if limit is not None:
        apogee_ids = apogee_ids[:limit]

    if done_apogee_ids:
        logger.info(f"{len(done_apogee_ids)} already completed - "
                    f"{len(apogee_ids)} left to process")

    # Load the prior:
    logger.debug("Creating JokerPrior instance...")
    prior = c.get_prior()

    os.makedirs(tmpdir)
    atexit.register(tmpdir_combine, tmpdir, c.joker_results_path)

    logger.debug("Preparing tasks...")
    if len(apogee_ids) > 10 * pool.size:
        n_tasks = min(16 * pool.size, len(apogee_ids))
    else:
        n_tasks = pool.size
    tasks = batch_tasks(len(apogee_ids), n_tasks, arr=apogee_ids,
                        args=(c, prior, tmpdir, rnd))

    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')
    for r in pool.map(worker, tasks, callback=callback):
        pass
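The n_tasks heuristic above oversubscribes the pool (up to 16 chunks per
worker) when there are many stars, so a single slow star cannot stall a whole
worker for long; with few stars it falls back to one chunk per worker. A
minimal sketch of a batch_tasks-style splitter follows; the real helper's
task layout is not shown here, so this packing is an assumption.

import numpy as np

# Minimal batch_tasks-like sketch: split `arr` into n_batches roughly
# equal chunks and append the shared extra args to each task. The real
# batch_tasks may pack its tasks differently.
def batch_tasks_sketch(n, n_batches, arr, args=()):
    edges = np.linspace(0, n, n_batches + 1).astype(int)
    return [(arr[lo:hi],) + tuple(args)
            for lo, hi in zip(edges[:-1], edges[1:]) if hi > lo]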
Example #5
def main(run_name, pool):
    c = Config.from_run_name(run_name)

    # Files that must exist before this step can run:
    for required_path in [c.joker_results_path, c.tasks_path]:
        if not os.path.exists(required_path):
            raise IOError(f"File {required_path} does not exist! Did you run "
                          "the preceding pipeline steps?")

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    apogee_ids = np.unique(allstar['APOGEE_ID'])
    tasks = batch_tasks(len(apogee_ids), pool.size, arr=apogee_ids, args=(c, ))

    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    sub_tbls = []
    for tbl in tqdm(pool.map(worker, tasks), total=len(tasks)):
        if tbl is not None:
            sub_tbls.append(tbl)

    tbl = vstack(sub_tbls)

    # load results from running run_fit_constant.py:
    constant_path = os.path.join(c.run_path, 'constant.fits')
    constant_tbl = QTable.read(constant_path)
    tbl = join(tbl, constant_tbl, keys='APOGEE_ID')

    tbl.write(c.metadata_joker_path, overwrite=True)
Example #6
def main(run_name, overwrite=False):
    c = Config.from_run_name(run_name)

    if os.path.exists(c.metadata_path) and not overwrite:
        logger.info(f"metadata file already exists {c.metadata_path}")
        return

    meta = Table.read(c.metadata_joker_path)
    mcmc_meta = Table.read(c.metadata_mcmc_path)

    final_colnames = [
        'APOGEE_ID', 'n_visits', 'MAP_P', 'MAP_P_err', 'MAP_e', 'MAP_e_err',
        'MAP_omega', 'MAP_omega_err', 'MAP_M0', 'MAP_M0_err', 'MAP_K',
        'MAP_K_err', 'MAP_v0', 'MAP_v0_err', 'MAP_s', 'MAP_s_err', 't0_bmjd',
        'baseline', 'MAP_ln_likelihood', 'MAP_ln_prior',
        'max_unmarginalized_ln_likelihood', 'max_phase_gap', 'periods_spanned',
        'phase_coverage', 'phase_coverage_per_period', 'unimodal',
        'joker_completed', 'mcmc_completed', 'mcmc_success',
        'gelman_rubin_max', 'constant_ln_likelihood',
        'robust_constant_ln_likelihood', 'constant_ln_evidence',
        'kepler_ln_evidence'
    ]

    master = join(meta,
                  mcmc_meta,
                  keys='APOGEE_ID',
                  join_type='left',
                  uniq_col_name="{table_name}{col_name}",
                  table_names=["", "mcmc_"])

    master['mcmc_completed'] = master['mcmc_completed'].filled(False)
    master['mcmc_success'] = master['mcmc_success'].filled(False)
    master['joker_completed'] = master['joker_completed'].filled(False)

    for colname in mcmc_meta.colnames:
        if colname == 'APOGEE_ID':
            continue

        mcmc_colname = f'mcmc_{colname}'
        if mcmc_colname not in master.colnames:
            mcmc_colname = colname

        print(f"Filling {colname} with {mcmc_colname}")
        master[colname][master['mcmc_success']] = master[mcmc_colname][
            master['mcmc_success']]

    master = master[final_colnames]
    master = QTable(master)

    for col in master.colnames:
        if col.endswith('_err'):
            master[col][~master['mcmc_completed']] = np.nan

    master.write(c.metadata_path, overwrite=True)
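The join above leans on astropy's column-collision controls: with
uniq_col_name="{table_name}{col_name}" and table_names=["", "mcmc_"],
colliding columns from the Joker table keep their bare names while the MCMC
versions gain an mcmc_ prefix, which is exactly what the fill loop keys on.
A small self-contained demo of that naming scheme:

from astropy.table import Table, join

# Demo of the column-naming scheme used above: the left table's
# colliding columns keep their bare names, the right table's gain the
# 'mcmc_' prefix, so 'MAP_P' and 'mcmc_MAP_P' coexist after the join.
t1 = Table({'APOGEE_ID': ['a', 'b'], 'MAP_P': [1.0, 2.0]})
t2 = Table({'APOGEE_ID': ['a'], 'MAP_P': [1.5]})
out = join(t1, t2, keys='APOGEE_ID', join_type='left',
           uniq_col_name="{table_name}{col_name}", table_names=["", "mcmc_"])
print(out.colnames)  # ['APOGEE_ID', 'MAP_P', 'mcmc_MAP_P']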
Example #7
def main(run_name, pool, overwrite=False, seed=None):
    c = Config.from_run_name(run_name)

    # Get paths to files needed to run:
    results_path = os.path.join(c.run_path, 'constant.fits')

    if os.path.exists(results_path) and not overwrite:
        logger.info(f"Results file {results_path} already exists. "
                    "Use --overwrite if needed")
        return

    if not os.path.exists(c.tasks_path):
        raise IOError(f"Tasks file '{c.tasks_path}' does not exist! Did you "
                      "run make_tasks.py?")

    # Get data files out of config file:
    logger.debug("Loading data...")
    allstar, _ = c.load_alldata()
    apogee_ids = np.unique(allstar['APOGEE_ID'])

    # Make batches of APOGEE_IDs
    logger.debug("Preparing tasks...")
    if len(apogee_ids) > 10 * pool.size:
        n_tasks = min(16 * pool.size, len(apogee_ids))
    else:
        n_tasks = pool.size
    tasks = batch_tasks(len(apogee_ids), n_tasks, arr=apogee_ids, args=(c, ))
    logger.info(f'Done preparing tasks: split into {len(tasks)} task chunks')

    results = []
    for r in tqdm(pool.map(worker, tasks), total=len(tasks)):
        results.append(r)

    tbl = Table([item for sublist in results for item in sublist])
    tbl.write(results_path, overwrite=True)
Example #8
def main(run_name, pool, overwrite=False, seed=None):
    run_path = join(HQ_CACHE_PATH, run_name)
    with open(join(run_path, 'config.yml'), 'r') as f:
        config = yaml.safe_load(f)
    logger.debug("Config file loaded")

    # Get paths to files needed to run
    params = config_to_jokerparams(config)
    prior_cache_path = config_to_prior_cache(config, params)
    results_path = join(HQ_CACHE_PATH, run_name, 'thejoker-injected.hdf5')
    logger.debug(f"Caching to {results_path}")

    if not exists(prior_cache_path):
        raise IOError(f"Prior cache file '{prior_cache_path}' does not "
                      "exist! Did you run make_prior_cache.py?")

    with h5py.File(results_path, 'a') as f:  # ensure the file exists
        pass

    # Get data files out of config file:
    logger.debug("Loading APOGEE data...")
    allstar, allvisit = config_to_alldata(config)
    logger.debug("Loading APOGEE data...finished")

    # Read metadata file:
    logger.debug("Loading metadata from previous HQ run...")
    meta_file = join(HQ_CACHE_PATH, run_name, 'metadata-master.fits')
    meta = at.Table.read(meta_file)
    logger.debug("Joining with APOGEE data...")
    master = at.join(meta, allstar, keys='APOGEE_ID')
    logger.debug("Joining with APOGEE data...finished")

    # n_control = len(allstar) // 10
    n_control = 4  # TODO: remove this when running in production
    idx = np.random.choice(len(master), size=n_control, replace=False)
    master = master[idx]
    allvisit = allvisit[np.isin(allvisit['APOGEE_ID'], master['APOGEE_ID'])]

    # Create TheJoker sampler instance with the specified random seed and pool
    rnd = np.random.RandomState(seed=seed)

    logger.debug("Creating TheJoker instance with {0}".format(rnd))
    joker = TheJoker(params, random_state=rnd)
    logger.debug("Processing pool has size = {0}".format(pool.size))
    logger.debug("{0} stars left to process for run '{1}'".format(
        len(master), run_name))

    # generate companion orbital parameters
    companion_samples = get_orbit_samples(n_control, joker)

    tasks = []
    logger.debug("Loading data and preparing tasks...")
    for i, star in enumerate(tqdm(master)):
        visits = allvisit[allvisit['APOGEE_ID'] == star['APOGEE_ID']]
        data = get_rvdata(visits)

        # Get MAP orbit from metadata file:
        orbit = extract_MAP_orbit(star)
        logger.debug(f"MAP orbit elements: {orbit.elements}")
        flat_rv = data.rv - orbit.radial_velocity(data.t)
        base_rv = np.median(data.rv) + flat_rv

        # Inject a companion signal:
        orbit = companion_samples.get_orbit(i)
        new_rv = base_rv + orbit.radial_velocity(data.t)

        # Observe the companion signal:
        new_rv = rnd.normal(new_rv.value, data.stddev.value) * data.rv.unit
        new_data = RVData(rv=new_rv, t=data.t, stddev=data.stddev)

        tasks.append(
            [joker, star['APOGEE_ID'], new_data, config, results_path])

    logger.info('Done preparing tasks: {0} stars in process queue'.format(
        len(tasks)))

    for r in tqdm(pool.starmap(worker, tasks, callback=callback),
                  total=len(tasks)):
        pass
Example #9
def main(c, prior, metadata_row, overwrite=False):
    mcmc_cache_path = os.path.join(c.run_path, 'mcmc')
    os.makedirs(mcmc_cache_path, exist_ok=True)

    apogee_id = metadata_row['APOGEE_ID']

    this_cache_path = os.path.join(mcmc_cache_path, apogee_id)
    if os.path.exists(this_cache_path) and not overwrite:
        logger.info(f"{apogee_id} already done!")
        # Assume it's already done
        return

    # Set up The Joker:
    joker = tj.TheJoker(prior)

    # Load the data:
    logger.debug(f"{apogee_id}: Loading all data")
    allstar, allvisit = c.load_alldata()
    allstar = allstar[np.isin(allstar['APOGEE_ID'].astype(str), apogee_id)]
    allvisit = allvisit[np.isin(allvisit['APOGEE_ID'].astype(str),
                                allstar['APOGEE_ID'].astype(str))]
    visits = allvisit[allvisit['APOGEE_ID'] == apogee_id]
    data = get_rvdata(visits)

    t0 = time.time()

    # Read MAP sample:
    MAP_sample = extract_MAP_sample(metadata_row)
    logger.log(1, f"{apogee_id}: MAP sample loaded")

    # Run MCMC:
    with joker.prior.model as model:
        logger.log(1, f"{apogee_id}: Setting up MCMC...")
        mcmc_init = joker.setup_mcmc(data, MAP_sample)
        logger.log(1, f"{apogee_id}: ...setup complete")

        if 'ln_prior' not in model.named_vars:
            ln_prior_var = None
            for k in joker.prior._nonlinear_equiv_units:
                var = model.named_vars[k]
                try:
                    if ln_prior_var is None:
                        ln_prior_var = var.distribution.logp(var)
                    else:
                        ln_prior_var = ln_prior_var + var.distribution.logp(
                            var)
                except Exception as e:
                    logger.warning("Cannot auto-compute log-prior value for "
                                   f"parameter {var}: {e}")
                    continue

            if ln_prior_var is not None:
                pm.Deterministic('ln_prior', ln_prior_var)
                logger.log(1, f"{apogee_id}: set up ln_prior in pymc3 model")

        if 'logp' not in model.named_vars:
            pm.Deterministic('logp', model.logpt)
            logger.log(1, f"{apogee_id}: setting up logp in pymc3 model")

        logger.debug(f"{apogee_id}: Starting MCMC sampling")
        trace = pm.sample(start=mcmc_init,
                          chains=4,
                          cores=1,
                          step=xo.get_dense_nuts_step(target_accept=0.95),
                          tune=c.tune,
                          draws=c.draws)

    pm.save_trace(trace, directory=this_cache_path, overwrite=True)
    logger.debug(f"{apogee_id}: Finished MCMC sampling "
                 f"({time.time() - t0:.2f} seconds)")
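The block above injects ln_prior and logp Deterministics so that per-sample
prior and total log-probability values are recorded in the trace. A toy
pymc3 model showing the same pattern in isolation (the model and variable
here are illustrative, not from the original pipeline):

import pymc3 as pm

# Toy pymc3 model demonstrating the Deterministic bookkeeping pattern:
# expose a free variable's log-prior and the total model log-probability
# so both are recorded in the trace alongside the sampled parameters.
with pm.Model() as model:
    P = pm.Lognormal('P', mu=0.0, sigma=2.0)
    pm.Deterministic('ln_prior', P.distribution.logp(P))
    pm.Deterministic('logp', model.logpt)
    trace = pm.sample(draws=100, tune=100, chains=1, cores=1)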