Example #1
def fit_all_bands(jd, fid, magpsf, sigmapsf) -> np.ndarray:
    """ Perform a Bazin fit for all alerts and all bands.

    For a given set of parameters (a, b, ...), and a given
    set of bands (g, r, ...), the final feature vector must be of the form:

    features = [
        [ga, gb, ... ra, rb, ... ], # alert 0
        [ga, gb, ... ra, rb, ... ], # alert 1
        [ga, gb, ... ra, rb, ... ], # alert ...
    ]

    Parameters
    ----------
    jd, fid, magpsf, sigmapsf: 2D np.array (alerts, time-series)
        Arrays of Julian dates, filter IDs, PSF-fit magnitudes and 1-sigma
        errors, one row per alert (float)

    Returns
    ----------
    features: 2D np.array (alerts, features x bands)
        Array of feature vectors (all bands for each alert)
    """
    features = []
    unique_bands = [1, 2, 3]
    # Loop over all alerts
    for alert_data in zip(jd, fid, magpsf, sigmapsf):
        (ajd, afid, amagpsf, asigmapsf) = alert_data

        feature_alert = []
        # For each alert, estimate the parameters for each band
        for band in unique_bands:
            # Keep measurements in this band that are neither NaN nor None
            maskband = afid == band
            masknan = amagpsf == amagpsf
            masknone = amagpsf != None
            mask = maskband * masknan * masknone
            if ajd is None or len(ajd[mask]) < 5:
                # Not enough points to fit this band: pad with zeros
                feature_alert.extend(np.zeros(5, dtype=float))
            else:
                # Compute flux
                flux, sigmaflux = mag2fluxcal_snana(amagpsf[mask],
                                                    asigmapsf[mask])
                feature_alert.extend(fit_scipy(ajd[mask], flux))
        features.append(np.array(feature_alert))
    return np.array(features)
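
A minimal local sketch of how `fit_all_bands` could be called, assuming the surrounding module provides `mag2fluxcal_snana` (accepting array inputs here) and `fit_scipy` (returning the 5 Bazin parameters), and that the inputs are sequences of per-alert vectors; the synthetic values below are purely illustrative:

import numpy as np

# Two synthetic alerts with six g-band (fid=1) measurements each
jd = [np.linspace(0, 10, 6), np.linspace(5, 20, 6)]
fid = [np.ones(6), np.ones(6)]
magpsf = [19.0 - 0.1 * np.arange(6), 18.5 - 0.1 * np.arange(6)]
sigmapsf = [np.full(6, 0.05), np.full(6, 0.05)]

features = fit_all_bands(jd, fid, magpsf, sigmapsf)
# 5 parameters x 3 bands per alert; bands without enough points are zero-padded
print(features.shape)  # expected (2, 15)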
Example #2
def rfscore_pca(jd,
                fid,
                magpsf,
                sigmapsf,
                model=None,
                bands=None,
                num_pc_components=None,
                min_flux_threshold=None) -> pd.Series:
    """ Return the probability of an alert to be a kilonova using a Random
    Forest Classifier trained on principal component features.

    `jd`, `fid`, `magpsf` and `sigmapsf` are Spark DataFrame Columns of
    per-alert vectors; `model`, `bands`, `num_pc_components` and
    `min_flux_threshold` are optional Columns overriding the defaults set
    below.
    """
    if bands is None:
        bands = ['g', 'r']
    if num_pc_components is None:
        num_pc_components = 3
    if min_flux_threshold is None:
        min_flux_threshold = 200
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        # Fall back to the default model shipped with the module
        curdir = os.path.dirname(os.path.abspath(__file__))
        clf = load_scikit_model(curdir + '/models/pickle_model.pkl')

    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [
            mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])
        ]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf_id = [id] * len(flux)
        pdf = pd.DataFrame.from_dict({
            'SNID': [int(i) for i in pdf_id],
            'MJD': [int(i) for i in jd[id]],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })

        pdf = Table.from_pandas(pdf)
        data_obj = create_alert_data_obj(pdf, bands)

        # move to dataframe class
        pc = PredictLightCurve(data_obj, object_id=pdf['SNID'][0])
        coeff_dict, num_pts_dict = pc.predict_lc_coeff(
            current_date=None,
            num_pc_components=num_pc_components,
            decouple_pc_bands=False,
            decouple_prediction_bands=True,
            min_flux_threshold=min_flux_threshold,
            bands=bands,
            band_choice='u')

        # Per band: number of points, then the PC coefficients
        n_feat_per_band = num_pc_components + 1
        features = np.zeros(n_feat_per_band * len(bands))
        for i, band in enumerate(bands):
            features[i * n_feat_per_band] = num_pts_dict[band]
            for j in range(num_pc_components):
                features[i * n_feat_per_band + j + 1] = coeff_dict[band][j]

        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be KN
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[1]

    return pd.Series(to_return)
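
In the same spirit as the doctests in the other processors, a usage sketch for `rfscore_pca`, assuming it is exposed as a Spark pandas UDF and that `spark`, `ztf_alert_sample` and `concat_col` are available as in the examples that follow (the `pKN` column name is illustrative):

>>> from fink_science.utilities import concat_col
>>> from pyspark.sql import functions as F

>>> df = spark.read.load(ztf_alert_sample)

# Append temp columns with historical + current measurements
>>> for colname in ['jd', 'fid', 'magpsf', 'sigmapsf']:
...    df = concat_col(df, colname, prefix='c')

# Perform the fit + classification (default model)
>>> args = [F.col(i) for i in ['cjd', 'cfid', 'cmagpsf', 'csigmapsf']]
>>> df = df.withColumn('pKN', rfscore_pca(*args))

>>> df.agg({"pKN": "min"}).collect()[0][0] >= 0.0
True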
Example #3
def rfscore_sigmoid_full(jd,
                         fid,
                         magpsf,
                         sigmapsf,
                         cdsxmatch,
                         ndethist,
                         model=None) -> pd.Series:
    """ Return the probability of an alert to be a SNe Ia using a Random
    Forest Classifier (sigmoid fit).

    You need to run the SIMBAD crossmatch beforehand.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (vectors of floats)
    fid: Spark DataFrame Column
        Filter IDs (vectors of ints)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error (vectors of floats)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    ndethist: Spark DataFrame Column
        Column containing the number of detections by ZTF at 3 sigma (int)
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/default-model.obj` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> df = df.withColumn('pIa', rfscore_sigmoid_full(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    6

    >>> df.filter(df['pIa'] > 0.5).select(['rf_snia_vs_nonia', 'pIa']).show()
    +----------------+-----+
    |rf_snia_vs_nonia|  pIa|
    +----------------+-----+
    |           0.839|0.839|
    |           0.782|0.782|
    |           0.887|0.887|
    |           0.785|0.785|
    |            0.88| 0.88|
    |           0.777|0.777|
    +----------------+-----+
    <BLANKLINE>

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> args += [F.lit(model_path_sigmoid)]
    >>> df = df.withColumn('pIa', rfscore_sigmoid_full(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    6

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3

    mask *= (ndethist.astype(int) <= 20)

    list_of_sn_host = return_list_of_sn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_sn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [
        mag2fluxcal_snana(*args)
        for args in zip(magpsf[mask].explode(), sigmapsf[mask].explode())
    ]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/default-model_sigmoid.obj'
        clf = load_scikit_model(model)

    test_features = []
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        features = get_sigmoid_features_dev(pdf_sub)
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[0]

    return pd.Series(to_return)
Example #4
def extract_features_rf_snia(jd, fid, magpsf, sigmapsf) -> pd.Series:
    """ Return the features used by the RF classifier.

    There are 12 features. Order is:
    a_g,b_g,c_g,snratio_g,chisq_g,nrise_g,
    a_r,b_r,c_r,snratio_r,chisq_r,nrise_r

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error

    Returns
    ----------
    features: pd.Series of str
        Comma-separated feature values, one string per alert.

    Examples
    ----------
    >>> from pyspark.sql.functions import split
    >>> from pyspark.sql.types import FloatType
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('features', extract_features_rf_snia(*args))

    >>> for name in RF_FEATURE_NAMES:
    ...   index = RF_FEATURE_NAMES.index(name)
    ...   df = df.withColumn(name, split(df['features'], ',')[index].astype(FloatType()))

    # Trigger something
    >>> df.agg({RF_FEATURE_NAMES[0]: "min"}).collect()[0][0]
    -2663.2421875
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [
        mag2fluxcal_snana(*args)
        for args in zip(magpsf[mask].explode(), sigmapsf[mask].explode())
    ]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    test_features = []
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        features = get_sigmoid_features_dev(pdf_sub)
        test_features.append(features)

    to_return_features = np.zeros((len(jd), len(RF_FEATURE_NAMES)),
                                  dtype=float)
    to_return_features[mask] = test_features

    concatenated_features = [
        ','.join(np.array(i, dtype=str)) for i in to_return_features
    ]

    return pd.Series(concatenated_features)
Example #5
def rfscore_kn_pca(jd,
                   fid,
                   magpsf,
                   sigmapsf,
                   model=None,
                   num_pc_components=None,
                   min_flux_threshold=None) -> pd.Series:
    """ Return the probability of an alert to be a SNe Ia using a Random
    Forest Classifier (bazin fit).
    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/kilonova_model.pkl` is loaded.
    num_pc_components: int, optional
        Number of principal components to be considered for the fit.
        Default is None, in which case the default value of 3 is used.
    min_flux_threshold: int, optional
        Minimum amplitude of a band required for prediction.
        Default is None, in which case the default value of 200 is used.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-kilonova) and 1 (kilonova).


    Examples
    ----------

    >>> from fink_science.utilities import concat_col

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = ['cjd', 'cfid', 'cmagpsf', 'csigmapsf']

    >>> df = df.withColumn('pKN', rfscore_kn_pca(*args))

    >>> df.agg({"pKN": "min"}).collect()[0][0] >= 0.0
    True
    """

    if num_pc_components is None:
        num_pc_components = 3
    if min_flux_threshold is None:
        min_flux_threshold = 200

    if num_pc_components != 3:
        raise ValueError('num_pc_components is expected to be 3 for this model')
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])

    # set path
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/kilonova_model.pkl'
        clf = load_scikit_model(model)

    # Bands used by the classifier
    bands = ['g', 'r']
    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [
            mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])
        ]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf_id = [id] * len(flux)
        pdf = pd.DataFrame.from_dict({
            'SNID': [int(i) for i in pdf_id],
            'MJD': [int(i) for i in jd[id]],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })

        # move to dataframe class
        pc = PredictLightCurve(pdf, object_id=pdf['SNID'][0])
        features = pc.predict_lc_coeff(num_pc_components=num_pc_components,
                                       min_flux_threshold=min_flux_threshold,
                                       bands=bands,
                                       band_choice='u')

        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be KN
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[1]

    return pd.Series(to_return)
Example #6
def rfscore_sigmoid(jd, fid, magpsf, sigmapsf, model=None) -> pd.Series:
    """ Return the probability of an alert to be a SNe Ia using a Random
    Forest Classifier (sigmoid fit).

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/default-model.obj` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('pIa', rfscore_sigmoid(*args))

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in what_prefix] + [F.lit(model_path_sigmoid)]
    >>> df = df.withColumn('pIa', rfscore_sigmoid(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.agg({"pIa": "min"}).collect()[0][0]
    0.0

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/../data/models/default-model_sigmoid.obj'
        clf = load_scikit_model(model)

    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [
            mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])
        ]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf = pd.DataFrame.from_dict({
            'SNID': [id] * len(flux),
            'MJD': jd[id],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })

        features = get_sigmoid_features_dev(pdf)
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[0]

    return pd.Series(to_return)
Example #7
def snn_ia(candid,
           jd,
           fid,
           magpsf,
           sigmapsf,
           roid,
           cdsxmatch,
           jdstarthist,
           model_name,
           model_ext=None) -> pd.Series:
    """ Compute probabilities of alerts to be SN Ia using SuperNNova

    Parameters
    ----------
    candid: Spark DataFrame Column
        Candidate IDs (int64)
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    roid: Spark DataFrame Column
        Solar System Object flag from the asteroid catcher (int)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    jdstarthist: Spark DataFrame Column
        JD at which variability was first seen (float)
    model_name: Spark DataFrame Column
        SuperNNova pre-trained model. Currently available:
            * snn_snia_vs_nonia
            * snn_sn_vs_all
    model_ext: Spark DataFrame Column, optional
        Path to the trained model (overwrite `model`). Default is None

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.asteroids.processor import roid_catcher
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Add SIMBAD field
    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Add SSO field
    >>> args_roid = [
    ...    'cjd', 'cmagpsf',
    ...    'candidate.ndethist', 'candidate.sgscore1',
    ...    'candidate.ssdistnr', 'candidate.distpsnr1']
    >>> df = df.withColumn('roid', roid_catcher(*args_roid))

    # Perform the fit + classification (default model)
    >>> args = ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']
    >>> args += [F.col('roid'), F.col('cdsxmatch'), F.col('candidate.jdstarthist')]
    >>> args += [F.lit('snn_snia_vs_nonia')]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    7

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']]
    >>> args += [F.col('roid'), F.col('cdsxmatch'), F.col('candidate.jdstarthist')]
    >>> args += [F.lit(''), F.lit(model_path)]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    7
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1

    mask *= jd.apply(lambda x: float(x[-1])) - jdstarthist.astype(float) <= 90

    mask *= roid.astype(int) != 3

    list_of_sn_host = return_list_of_sn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_sn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    if model_ext is not None:
        # take the first element of the Series
        model = model_ext.values[0]
    else:
        # Load pre-trained model
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/snn_models/{}/model.pt'.format(
            model_name.values[0])

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': [str(i) for i in candid[mask].values]
    })

    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [
        mag2fluxcal_snana(*args)
        for args in zip(magpsf[mask].explode(), sigmapsf[mask].explode())
    ]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Compute predictions
    ids, pred_probs = classify_lcs(pdf, model, 'cpu')

    # Reformat and re-index
    preds_df = reformat_to_df(pred_probs, ids=ids)
    preds_df.index = preds_df.SNID

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    ia = preds_df.reindex([str(i) for i in candid[mask].values])
    to_return[mask] = ia.prob_class0.values

    # return probabilities to be Ia
    return pd.Series(to_return)
Example #8
def snn_ia(candid, jd, fid, magpsf, sigmapsf, model=None) -> pd.Series:
    """ Compute probabilities of alerts to be SN Ia using SuperNNova

    Parameters
    ----------
    candid: Spark DataFrame Column
        Candidate IDs (int64)
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/<vanilla_S_0_...>.pt` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']] + [F.lit(model_path)]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.agg({"pIa": "min"}).collect()[0][0] >= 0.0
    True

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Load pre-trained model
    if model is None:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/../data/models/vanilla_S_0_CLF_2_R_none_photometry_DF_1.0_N_global_lstm_32x2_0.05_128_True_mean_C.pt'
    else:
        # take the first element of the Series
        model = model.values[0]

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd,
        'SNID': [str(i) for i in candid.values]
    })

    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [
        mag2fluxcal_snana(*args)
        for args in zip(magpsf.explode(), sigmapsf.explode())
    ]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid.explode().replace({1: 'g', 2: 'r'})
    })

    # Compute predictions
    ids, pred_probs = classify_lcs(pdf, model, 'cpu')

    # Reformat and re-index
    preds_df = reformat_to_df(pred_probs, ids=ids)
    preds_df.index = preds_df.SNID
    ia = preds_df.reindex([str(i) for i in candid.values])

    # return probabilities to be Ia
    return ia.prob_class0
Example #9
def knscore(jd, fid, magpsf, sigmapsf, jdstarthist, cdsxmatch, ndethist, model_path=None, pcs_path=None, npcs=None) -> pd.Series:
    """ Return the probability of an alert to be a Kilonova using a Random
    Forest Classifier.

    You need to run the SIMBAD crossmatch beforehand.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (vectors of floats)
    fid: Spark DataFrame Column
        Filter IDs (vectors of ints)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error (vectors of floats)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    ndethist: Spark DataFrame Column
        Column containing the number of detections by ZTF at 3 sigma (int)
    jdstarthist: Spark DataFrame Column
        Column containing the time (JD) at which variability was first seen
    model_path: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/KN_model_2PC.pkl` is loaded.
    pcs_path: Spark DataFrame Column, optional
        Path to the Principal Component file. Default is None, in which case
        the `data/models/components.csv` is loaded.
    npcs: Spark DataFrame Column, optional
        Integer representing the number of Principal Components to use. It
        should be consistent with the training model used. Default is None,
        in which case 2 components are used (matching the default
        `model_path`, `KN_model_2PC.pkl`).

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-KNe) and 1 (KNe).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('candidate.jdstarthist'), F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> df = df.withColumn('pKNe', knscore(*args))

    >>> df.filter(df['pKNe'] > 0.5).count()
    0

    >>> df.filter(df['pKNe'] > 0.5).select(['rf_kn_vs_nonkn', 'pKNe']).show()
    +--------------+----+
    |rf_kn_vs_nonkn|pKNe|
    +--------------+----+
    +--------------+----+
    <BLANKLINE>

    # Note that we can also specify a model
    >>> extra_args = [F.col('candidate.jdstarthist'), F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> extra_args += [F.lit(model_path), F.lit(comp_path), F.lit(2)]
    >>> args = [F.col(i) for i in what_prefix] + extra_args
    >>> df = df.withColumn('pKNe', knscore(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.filter(df['pKNe'] > 0.5).count()
    0
    """
    epoch_lim = [-50, 50]
    time_bin = 0.25
    flux_lim = 0

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1

    mask *= (ndethist.astype(int) <= 20)

    mask *= jd.apply(lambda x: float(x[-1])) - jdstarthist.astype(float) < 20

    list_of_kn_host = return_list_of_kn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_kn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict(
        {
            'jd': jd[mask],
            'SNID': range(len(jd[mask]))
        }
    )
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pre-trained model `clf`
    if model_path is not None:
        model = load_scikit_model(model_path.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model_path = curdir + '/data/models/KN_model_2PC.pkl'
        model = load_scikit_model(model_path)

    # Load pcs
    if npcs is not None:
        npcs = int(npcs.values[0])
    else:
        npcs = 2
    if pcs_path is not None:
        pcs_path_ = pcs_path.values[0]
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        pcs_path_ = curdir + '/data/models/components.csv'
    pcs = load_pcs(pcs_path_, npcs=npcs)

    test_features = []
    filters = ['g', 'r']

    # extract features (all filters) for each ID
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        # keep only measurements with finite flux (drop NaN)
        pdf_sub = pdf_sub[pdf_sub['FLUXCAL'] == pdf_sub['FLUXCAL']]
        features = extract_all_filters_fink(
            epoch_lim=epoch_lim, pcs=pcs,
            time_bin=time_bin, filters=filters,
            lc=pdf_sub, flux_lim=flux_lim)
        test_features.append(features)

    # Remove pathological values
    names_root = [
        'npoints_',
        'residuo_'
    ] + [
        'coeff' + str(i + 1) + '_' for i in range(len(pcs.keys()))
    ] + ['maxflux_']

    columns = [i + j for j in ['g', 'r'] for i in names_root]

    matrix = pd.DataFrame(test_features, columns=columns)

    zeros = np.logical_or(
        matrix['coeff1_g'].values == 0,
        matrix['coeff1_r'].values == 0
    )

    matrix_clean = matrix[~zeros]

    # If all alerts are flagged as bad
    if np.shape(matrix_clean) == (0, len(get_features_name(npcs))):
        to_return = np.zeros(len(jd), dtype=float)
        return pd.Series(to_return)

    # Otherwise make predictions
    probabilities = model.predict_proba(matrix_clean.values)
    probabilities_notkne = np.zeros(len(test_features))
    probabilities_kne = np.zeros(len(test_features))

    probabilities_notkne[~zeros] = probabilities.T[0]
    probabilities_kne[~zeros] = probabilities.T[1]
    probabilities_ = np.array([probabilities_notkne, probabilities_kne]).T

    # Take only probabilities to be kilonova
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities_.T[1]

    return pd.Series(to_return)
Example #10
def extract_features_knscore(jd, fid, magpsf, sigmapsf, pcs_path=None, npcs=None) -> pd.Series:
    """ Extract features used by the Kilonova classifier (using a Random
    Forest Classifier).

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    pcs_path: Spark DataFrame Column, optional
        Path to the Principal Component file. Default is None, in which case
        the `data/models/components.csv` is loaded.
    npcs: Spark DataFrame Column, optional
        Integer representing the number of Principal Components to use. It
        should be consistent with the training model used. Default is None,
        in which case 2 components are used.

    Returns
    ----------
    out: pd.Series of str
        Comma-separated features, one string per alert

    Examples
    ----------
    >>> from pyspark.sql.functions import split
    >>> from pyspark.sql.types import FloatType
    >>> from fink_science.utilities import concat_col
    >>> from fink_science.kilonova.lib_kn import get_features_name
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...    df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('features', extract_features_knscore(*args))

    >>> KN_FEATURE_NAMES_2PC = get_features_name(2)
    >>> for name in KN_FEATURE_NAMES_2PC:
    ...   index = KN_FEATURE_NAMES_2PC.index(name)
    ...   df = df.withColumn(name, split(df['features'], ',')[index].astype(FloatType()))

    # Trigger something
    >>> df.agg({KN_FEATURE_NAMES_2PC[0]: "min"}).collect()[0][0]
    0.0
    """
    epoch_lim = [-50, 50]
    time_bin = 0.25
    flux_lim = 0

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict(
        {
            'jd': jd[mask],
            'SNID': range(len(jd[mask]))
        }
    )
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pcs
    if npcs is not None:
        npcs = int(npcs.values[0])
    else:
        npcs = 2
    if pcs_path is not None:
        pcs_path_ = pcs_path.values[0]
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        pcs_path_ = curdir + '/data/models/components.csv'
    pcs = load_pcs(pcs_path_, npcs=npcs)

    test_features = []
    filters = ['g', 'r']

    # extract features (all filters) for each ID
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        # keep only measurements with finite flux (drop NaN)
        pdf_sub = pdf_sub[pdf_sub['FLUXCAL'] == pdf_sub['FLUXCAL']]
        features = extract_all_filters_fink(
            epoch_lim=epoch_lim, pcs=pcs,
            time_bin=time_bin, filters=filters,
            lc=pdf_sub, flux_lim=flux_lim)
        test_features.append(features)

    to_return_features = np.zeros(
        (len(jd), len(get_features_name(npcs))),
        dtype=float
    )
    to_return_features[mask] = test_features

    concatenated_features = [
        ','.join(np.array(i, dtype=str)) for i in to_return_features
    ]

    return pd.Series(concatenated_features)