Example No. 1
def estimate_stellar_labels(pks, processes=32):
    """Estimate stellar labels for the task instances with the given primary keys."""

    pks = deserialize_pks(pks, flatten=True)

    in_parallel = (processes != 1)
    if in_parallel:
        # Fan the work out across a multiprocessing pool.
        with mp.Pool(processes=processes) as pool:
            pool.map(_estimate_stellar_labels, pks)

    else:
        # Run serially in this process.
        for pk in pks:
            _estimate_stellar_labels(pk)
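
Note that `mp.Pool.map` resolves its worker function by name inside the child processes, so `_estimate_stellar_labels` must be defined at module level. A minimal, self-contained sketch of the same fan-out pattern (the worker here is a hypothetical stand-in):

import multiprocessing as mp

def _square(pk):
    # Hypothetical stand-in for _estimate_stellar_labels; defined at module
    # level so that pool workers can import it by name.
    return pk * pk

if __name__ == "__main__":
    with mp.Pool(processes=4) as pool:
        print(pool.map(_square, [1, 2, 3]))  # [1, 4, 9]
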
Example No. 2
def add_meta_to_task_instances(pks):
    """Add metadata to each task instance with the given primary keys."""
    pks = deserialize_pks(pks, flatten=True)
    for pk in pks:
        add_meta_to_task_instance(pk)
    return pks
Example No. 3
def estimate_stellar_labels(pks,
                            default_num_uncertainty_draws=100,
                            default_large_error=1e10):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.
         
    :param default_num_uncertainty_draws: [optional]
        The number of random draws to make of the flux uncertainties, which will be
        propagated into the estimate of the stellar parameter uncertainties (default: 100).
    
    :param default_large_error: [optional]
        An arbitrarily large error value to assign to bad pixels (default: 1e10).
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    log.info(f"Running APOGEENet on device {device} with:")
    log.info(f"\tpks: {pks}")

    log.debug(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")

    log.debug(f"Using torch version {torch.__version__} in {torch.__path__}")

    models = {}

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None: continue

        model_path = instance.parameters["model_path"]

        # Load the model, caching models across task instances by path.
        try:
            model = models[model_path]
        except KeyError:
            models[model_path] = model = Model(model_path, device)
            log.info(f"Loaded model from {model_path}")

        N, P = spectrum.flux.shape

        # Build metadata array.
        metadata_keys, metadata, metadata_norm = get_metadata(spectrum)

        flux = np.nan_to_num(spectrum.flux.value).astype(np.float32).reshape(
            (N, 1, P))
        meta = np.tile(metadata_norm, N).reshape((N, -1))

        flux = torch.from_numpy(flux).to(device)
        meta = torch.from_numpy(meta).to(device)

        with torch.set_grad_enabled(False):
            predictions = model.predict_spectra(flux, meta)
        # Move the predictions back to the host as a numpy array.
        predictions = predictions.cpu().data.numpy()

        # Replace non-finite predictions with NaN.
        predictions[~np.isfinite(predictions)] = np.nan

        # Create results array.
        log_g, log_teff, fe_h = predictions.T
        teff = 10**log_teff
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            logg=log_g.tolist(),
            fe_h=fe_h.tolist(),
        )

        num_uncertainty_draws = int(
            instance.parameters.get("num_uncertainty_draws",
                                    default_num_uncertainty_draws))

        if num_uncertainty_draws > 0:
            large_error = float(
                instance.parameters.get("large_error", default_large_error))

            flux_error = np.nan_to_num(
                spectrum.uncertainty.array**-0.5).astype(np.float32).reshape(
                    (N, 1, P))
            # Clamp bad or extreme pixels at five times the median error.
            median_error = 5 * np.median(flux_error, axis=(1, 2))

            for j, value in enumerate(median_error):
                bad_pixel = (flux_error[j] == large_error) \
                            | (flux_error[j] >= value)
                flux_error[j][bad_pixel] = value

            flux_error = torch.from_numpy(flux_error).to(device)

            inputs = torch.randn((num_uncertainty_draws, N, 1, P),
                                 device=device) * flux_error + flux
            inputs = inputs.reshape((num_uncertainty_draws * N, 1, P))

            meta_error = meta.repeat(num_uncertainty_draws, 1)
            with torch.set_grad_enabled(False):
                draws = model.predict_spectra(inputs, meta_error)
            draws = draws.cpu().data.numpy()

            draws = draws.reshape((num_uncertainty_draws, N, -1))

            # Convert the log(teff) draws to teff before taking the median and std. dev.
            draws[:, :, 1] = 10**draws[:, :, 1]

            median_draw_predictions = np.nanmedian(draws, axis=0)
            std_draw_predictions = np.nanstd(draws, axis=0)

            log_g_median, teff_median, fe_h_median = median_draw_predictions.T
            log_g_std, teff_std, fe_h_std = std_draw_predictions.T

            result.update(_teff_median=teff_median.tolist(),
                          _logg_median=log_g_median.tolist(),
                          _fe_h_median=fe_h_median.tolist(),
                          u_teff=teff_std.tolist(),
                          u_logg=log_g_std.tolist(),
                          u_fe_h=fe_h_std.tolist())

        else:
            median_draw_predictions, std_draw_predictions = (None, None)

        # Add the bitmask flag.
        bitmask_flag = create_bitmask(
            predictions,
            median_draw_predictions=median_draw_predictions,
            std_draw_predictions=std_draw_predictions)

        result.update(bitmask_flag=bitmask_flag.tolist())

        # Write the result to the database.
        create_task_output(instance, astradb.ApogeeNet, **result)

    log.info(f"Completed processing of {total} primary keys")
Example No. 4
File: utils.py Project: sdss/astra
def prepare_data(pks):
    """
    Return the task instance, data model path, and spectrum for each given primary key,
    and apply any spectrum callbacks to the spectrum as it is loaded.

    :param pks:
        Primary keys of task instances to load data products for.

    :returns:
        Yields a three-length tuple containing the task instance, the path to the data
        product, and the spectrum after any spectrum callbacks have been executed. If
        the spectrum could not be loaded, the spectrum will be `None`.
    """

    trees = {}

    for pk in deserialize_pks(pks, flatten=True):
        q = session.query(
            astradb.TaskInstance).filter(astradb.TaskInstance.pk == pk)
        instance = q.one_or_none()

        if instance is None:
            log.warning(f"No task instance found for primary key {pk}")
            path = spectrum = None

        else:
            release = instance.parameters["release"]
            tree = trees.get(release, None)
            if tree is None:
                trees[release] = tree = SDSSPath(release=release)

            # Monkey-patch BOSS Spec paths.
            try:
                path = tree.full(**instance.parameters)
            except Exception:
                if instance.parameters["filetype"] == "spec":
                    from astra.utils import monkey_patch_get_boss_spec_path
                    path = monkey_patch_get_boss_spec_path(
                        **instance.parameters)
                else:
                    raise

            try:
                spectrum = Spectrum1D.read(path)
            except Exception:
                log.exception(
                    f"Unable to load Spectrum1D from path {path} on task instance {instance}"
                )
                spectrum = None
            else:
                # Are there any spectrum callbacks?
                spectrum_callback = instance.parameters.get(
                    "spectrum_callback", None)
                if spectrum_callback is not None:
                    spectrum_callback_kwargs = instance.parameters.get(
                        "spectrum_callback_kwargs", "{}")
                    try:
                        spectrum_callback_kwargs = literal_eval(
                            spectrum_callback_kwargs)
                    except Exception:
                        log.exception(
                            f"Unable to literally evaluate spectrum callback kwargs for {instance}: {spectrum_callback_kwargs}"
                        )
                        raise

                    try:
                        func = string_to_callable(spectrum_callback)

                        spectrum = func(spectrum=spectrum,
                                        path=path,
                                        instance=instance,
                                        **spectrum_callback_kwargs)

                    except Exception:
                        log.exception(
                            f"Unable to execute spectrum callback '{spectrum_callback}' on {instance}"
                        )
                        raise

        yield (instance, path, spectrum)
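
As a concrete illustration of the callback mechanism, here is a hypothetical `spectrum_callback` matching the `(spectrum, path, instance, **kwargs)` call signature used above. Note that `spectrum_callback_kwargs` arrives as a string and is parsed with `literal_eval` before the call:

from ast import literal_eval

def clip_negative_flux(spectrum, path, instance, floor=0.0, **kwargs):
    # Hypothetical callback: clip flux below `floor` and return the spectrum.
    spectrum.flux.value[spectrum.flux.value < floor] = floor
    return spectrum

# e.g. instance.parameters["spectrum_callback_kwargs"] = "{'floor': 0.0}"
literal_eval("{'floor': 0.0}")  # -> {'floor': 0.0}
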
Example No. 5
def classify_apstar(pks, dag, task, run_id, **kwargs):
    """
    Classify observations of APOGEE (ApStar) sources, given the existing classifications of the
    individual visits.

    :param pks:
        The primary keys of task instances where visits have been classified. These primary
        keys will be used to work out which stars need classifying, before new task instances
        are created for each star.
    """

    pks = deserialize_pks(pks, flatten=True)

    # For each unique apStar object, we need to find all the visits that have been classified.
    distinct_apogee_drp_star_pk = session.query(
        distinct(astradb.TaskInstanceMeta.apogee_drp_star_pk)).filter(
            astradb.TaskInstance.pk.in_(pks),
            astradb.TaskInstanceMeta.ti_pk == astradb.TaskInstance.pk).all()

    # We need to make sure that we will only retrieve results on apVisit objects, and not on apStar objects.
    parameter_pk, = session.query(astradb.Parameter.pk).filter(
        astradb.Parameter.parameter_name == "filetype",
        astradb.Parameter.parameter_value == "apVisit").one()

    for star_pk in distinct_apogee_drp_star_pk:

        results = session.query(
            astradb.TaskInstance, astradb.TaskInstanceMeta,
            astradb.Classification
        ).filter(
            astradb.Classification.output_pk == astradb.TaskInstance.output_pk,
            astradb.TaskInstance.pk == astradb.TaskInstanceMeta.ti_pk,
            astradb.TaskInstanceMeta.apogee_drp_star_pk == star_pk,
            astradb.TaskInstanceParameter.ti_pk == astradb.TaskInstance.pk,
            astradb.TaskInstanceParameter.parameter_pk == parameter_pk).all()

        def column_func(column_name):
            return column_name.startswith("lp_")

        lps = {}
        for j, (ti, meta, classification) in enumerate(results):
            if j == 0:
                for column_name in classification.__table__.columns.keys():
                    if column_func(column_name):
                        lps[column_name] = []

            for column_name in lps.keys():
                values = getattr(classification, column_name)
                if values is None: continue
                assert len(values) == 1, \
                    "We are getting results from apStars and re-adding to apStars!"
                lps[column_name].append(values[0])

        # Calculate total log probabilities.
        joint_lps = np.array(
            [np.sum(lp) for lp in lps.values() if len(lp) > 0])
        keys = [key for key, lp in lps.items() if len(lp) > 0]

        # Calculate normalized probabilities.
        with np.errstate(under="ignore"):
            relative_log_probs = joint_lps - logsumexp(joint_lps)

        # Round for PostgreSQL 'real' type.
        # https://www.postgresql.org/docs/9.1/datatype-numeric.html
        # and
        # https://stackoverflow.com/questions/9556586/floating-point-numbers-of-python-float-and-postgresql-double-precision
        decimals = 3
        probs = np.round(np.exp(relative_log_probs), decimals)

        joint_result = {k: [float(lp)] for k, lp in zip(keys, joint_lps)}
        joint_result.update({k[1:]: [float(v)] for k, v in zip(keys, probs)})

        # Create a task for this classification.
        # To do that we need to construct the parameters for the task.
        columns = (
            apogee_drpdb.Star.apred_vers.label(
                "apred"),  # TODO: Raise with Nidever
            apogee_drpdb.Star.healpix,
            apogee_drpdb.Star.telescope,
            apogee_drpdb.Star.apogee_id.label(
                "obj"),  # TODO: Raise with Nidever
        )
        apred, healpix, telescope, obj = sdss_session.query(*columns).filter(
            apogee_drpdb.Star.pk == star_pk).one()
        parameters = dict(apred=apred,
                          healpix=healpix,
                          telescope=telescope,
                          obj=obj,
                          release="sdss5",
                          filetype="apStar",
                          apstar="stars")

        args = (dag.dag_id, task.task_id, run_id)

        instance = create_task_instance(*args, parameters)
        output = create_task_output(instance.pk, astradb.Classification,
                                    **joint_result)
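
The joint classification sums per-visit log probabilities and normalizes them with `logsumexp`, which avoids the underflow that exponentiating raw log probabilities directly would cause. A minimal numeric sketch with hypothetical class names:

import numpy as np
from scipy.special import logsumexp

# Per-visit log probabilities for two classes across three visits.
lps = {"lp_fgkm": [-1.2, -0.8, -1.0], "lp_hotstar": [-4.0, -3.5, -5.0]}

joint_lps = np.array([np.sum(lp) for lp in lps.values()])
relative_log_probs = joint_lps - logsumexp(joint_lps)
probs = np.round(np.exp(relative_log_probs), 3)  # normalized; sums to ~1
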
Example No. 6
def estimate_stellar_labels(pks,
                            model_path,
                            dwave_slam=10.,
                            p_slam=(1E-8, 1E-7),
                            q_slam=0.7,
                            ivar_block_slam=None,
                            eps_slam=1E-19,
                            rsv_frac_slam=2.,
                            n_jobs_slam=1,
                            verbose_slam=5):
    """
    Estimate the stellar parameters for APOGEE ApStar observations,
    where task instances have been created with the given primary keys (`pks`).

    :param pks:
        The primary keys of task instances that include information about what
        ApStar observation to load.
    
    :param model_path:
        The disk path of the pre-trained model.
        
    :param dwave_slam: float
        The bin width used for pseudo-continuum normalization.

    :param p_slam: tuple of 2 floats [optional]
        Smoothing parameters, each between 0 and 1 (default: (1E-8, 1E-7)):
        0 -> least-squares straight line;
        1 -> cubic spline interpolant.

    :param q_slam: float [optional]
        Quantile used in normalization, in the range [0, 1] (default: 0.7).

    :param ivar_block_slam: ndarray (n_pix, ) | None [optional]
        Inverse-variance array (default: None).

    :param eps_slam: float [optional]
        The inverse-variance threshold (default: 1E-19).

    :param rsv_frac_slam: float [optional]
        The fraction of pixels reserved in terms of standard deviation (default: 2).

    :param n_jobs_slam: int [optional]
        The number of processes launched by joblib (default: 1).

    :param verbose_slam: int or bool [optional]
        Verbosity level (default: 5).
    """

    # Load the pre-trained SLAM model and its wavelength grid.
    model = Slam.load_dump(model_path)
    wave_interp = model.wave

    log.info(f"Loaded model from {model_path}")

    pks = deserialize_pks(pks, flatten=True)
    total = len(pks)

    log.info(f"There are {total} primary keys to process: {pks}")

    for instance, path, spectrum in tqdm(prepare_data(pks), total=total):
        if spectrum is None: continue

        N, P = spectrum.flux.shape
        wave = spectrum.spectral_axis.value
        fluxes = spectrum.flux.value
        invars = spectrum.uncertainty.array

        # Resample each spectrum onto the model wavelength grid.
        fluxes_resamp, invars_resamp = [], []
        for i in tqdm(range(N)):
            fluxes_temp, invars_temp = resample(wave[i], fluxes[i], invars[i],
                                                wave_interp)
            fluxes_resamp += [fluxes_temp]
            invars_resamp += [invars_temp]
        fluxes_resamp, invars_resamp = np.array(fluxes_resamp), np.array(
            invars_resamp)

        # Pseudo-continuum normalize each spectrum.
        fluxes_norm, fluxes_cont = normalize_spectra_block(
            wave_interp,
            fluxes_resamp, (6147., 8910.),
            dwave_slam,
            p=p_slam,
            q=q_slam,
            ivar_block=ivar_block_slam,
            eps=eps_slam,
            rsv_frac=rsv_frac_slam,
            n_jobs=n_jobs_slam,
            verbose=verbose_slam)

        invars_norm = fluxes_cont**2 * invars_resamp

        # Initial estimation: get an initial estimate of the labels by chi2 best match.
        label_init = model.predict_labels_quick(fluxes_norm,
                                                invars_norm,
                                                n_jobs=1)

        # SLAM prediction: optimize the labels.
        results_pred = model.predict_labels_multi(label_init, fluxes_norm,
                                                  invars_norm)
        label_pred = np.array([label['x'] for label in results_pred])
        std_pred = np.array([label['pstd'] for label in results_pred])

        # Unpack the predicted labels and their standard deviations.
        teff = label_pred[:, 0]
        m_h = label_pred[:, 1]
        log_g = label_pred[:, 2]
        alpha_m = label_pred[:, 3]
        u_teff = std_pred[:, 0]
        u_m_h = std_pred[:, 1]
        u_log_g = std_pred[:, 2]
        u_alpha_m = std_pred[:, 3]
        result = dict(
            snr=spectrum.meta["snr"],
            teff=teff.tolist(),
            m_h=m_h.tolist(),
            logg=log_g.tolist(),
            alpha_m=alpha_m.tolist(),
            u_teff=u_teff.tolist(),
            u_m_h=u_m_h.tolist(),
            u_logg=u_log_g.tolist(),
            u_alpha_m=u_alpha_m.tolist(),
        )

        # Write the result to the database.
        create_task_output(instance, astradb.SLAM, **result)

    log.info(f"Completed processing of {total} primary keys")