def fit_all_bands(jd, fid, magpsf, sigmapsf) -> np.ndarray:
    """ Perform a Bazin fit for all alerts and all bands.

    For a given set of parameters (a, b, ...), and a given set of bands
    (g, r, ...), the final feature vector must be of the form:

    features = [
        [ga, gb, ... ra, rb, ... ],  # alert 0
        [ga, gb, ... ra, rb, ... ],  # alert 1
        [ga, gb, ... ra, rb, ... ],  # alert ...
    ]

    Parameters
    ----------
    jd, fid, magpsf, sigmapsf: 2D np.array (alerts, time-series)
        Arrays of JD times, filter IDs, PSF-fit magnitudes, and
        1-sigma errors (floats), one row per alert.

    Returns
    ----------
    features: 2D np.array (alerts, features x bands)
        Array of feature vectors (all bands for each alert)
    """
    features = []
    unique_bands = [1, 2, 3]

    # Loop over all alerts
    for alert_data in zip(jd, fid, magpsf, sigmapsf):
        (ajd, afid, amagpsf, asigmapsf) = alert_data

        feature_alert = []
        # For each alert, estimate the parameters for each band
        for band in unique_bands:
            maskband = afid == band
            masknan = amagpsf == amagpsf
            masknone = np.array([i is not None for i in amagpsf])
            mask = maskband & masknan & masknone

            if ajd is None or len(ajd[mask]) < 5:
                # Not enough valid measurements in this band:
                # fill the 5 Bazin parameters with zeros
                feature_alert.extend(np.zeros(5, dtype=float))
            else:
                # Compute flux from magnitude, and fit the light curve
                flux, sigmaflux = mag2fluxcal_snana(amagpsf[mask], asigmapsf[mask])
                feature_alert.extend(fit_scipy(ajd[mask], flux))

        features.append(np.array(feature_alert))

    return np.array(features)
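# The block below is a minimal, hypothetical usage sketch for `fit_all_bands`
# and is not part of the original module. It builds two toy alerts with too
# few epochs per band, so only the zero-fill branch is exercised and the
# output shape (5 parameters x 3 bands) can be checked without fitting.
def _example_fit_all_bands():
    """ Hedged sketch: call `fit_all_bands` on two fake alerts. """
    import numpy as np

    # Two alerts, each with 4 epochs, all taken in band 1 (g)
    jd = np.array([np.linspace(0., 10., 4), np.linspace(0., 10., 4)])
    fid = np.ones_like(jd, dtype=int)
    magpsf = np.full_like(jd, 18.0)
    sigmapsf = np.full_like(jd, 0.1)

    feats = fit_all_bands(jd, fid, magpsf, sigmapsf)

    # One row per alert, 5 parameters per band, 3 bands probed
    assert feats.shape == (2, 15)
    return feats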
def rfscore_pca(jd, fid, magpsf, sigmapsf, model=None, bands=None, num_pc_components=None, min_flux_threshold=None) -> pd.Series:
    """ Return the probability of an alert to be a kilonova using a Random
    Forest Classifier trained on PCA coefficients.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (vectors of floats)
    fid: Spark DataFrame Column
        Filter IDs (vectors of ints)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error (vectors of floats)
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `models/pickle_model.pkl` is loaded.
    bands: list of str, optional
        Bands to consider. Default is None, in which case ['g', 'r'] is used.
    num_pc_components: int, optional
        Number of principal components to be considered for the fit.
        Default is None, in which case 3 is used.
    min_flux_threshold: int, optional
        Minimum value of amplitude of a band for prediction.
        Default is None, in which case 200 is used.

    Returns
    ----------
    probabilities: pd.Series of float
        Probability between 0 (non-kilonova) and 1 (kilonova).
    """
    if bands is None:
        bands = ['g', 'r']
    if num_pc_components is None:
        num_pc_components = 3
    if min_flux_threshold is None:
        min_flux_threshold = 200

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/models/pickle_model.pkl'
        clf = load_scikit_model(model)

    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf_id = [id] * len(flux)
        pdf = pd.DataFrame.from_dict({
            'SNID': [int(i) for i in pdf_id],
            'MJD': [int(i) for i in jd[id]],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })
        pdf = Table.from_pandas(pdf)
        data_obj = create_alert_data_obj(pdf, bands)

        # Extract PCA coefficients and number of points per band
        pc = PredictLightCurve(data_obj, object_id=pdf['SNID'][0])
        coeff_dict, num_pts_dict = pc.predict_lc_coeff(
            current_date=None,
            num_pc_components=num_pc_components,
            decouple_pc_bands=False,
            decouple_prediction_bands=True,
            min_flux_threshold=min_flux_threshold,
            bands=bands,
            band_choice='u')

        # Per band: [npoints, coeff_1, ..., coeff_npc]
        features = np.zeros((num_pc_components + 1) * len(bands))
        for i, band in enumerate(bands):
            features[i * (num_pc_components + 1)] = num_pts_dict[band]
            for j in range(num_pc_components):
                features[i * (num_pc_components + 1) + j + 1] = coeff_dict[band][j]
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be KN
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[1]

    return pd.Series(to_return)
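# Hedged usage sketch for `rfscore_pca` (not part of the original module):
# it assumes `rfscore_pca` is registered as a Spark pandas UDF, as done for
# the other scorers in this package, and follows the column conventions used
# by their doctests (`concat_col`, 'c' prefix). The `df` argument is any
# Spark DataFrame of ZTF alerts.
def _example_rfscore_pca(df):
    """ Apply `rfscore_pca` on a Spark alert DataFrame (assumption-based sketch). """
    from pyspark.sql import functions as F
    from fink_science.utilities import concat_col

    # Append temp columns with historical + current measurements
    what = ['jd', 'fid', 'magpsf', 'sigmapsf']
    for colname in what:
        df = concat_col(df, colname, prefix='c')

    # Score each alert with the default PCA-based kilonova model
    args = [F.col('c' + i) for i in what]
    return df.withColumn('pKN', rfscore_pca(*args))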
def rfscore_sigmoid_full(jd, fid, magpsf, sigmapsf, cdsxmatch, ndethist, model=None) -> pd.Series:
    """ Return the probability of an alert to be a SNe Ia using a
    Random Forest Classifier (sigmoid fit).

    You need to run the SIMBAD crossmatch before.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (vectors of floats)
    fid: Spark DataFrame Column
        Filter IDs (vectors of ints)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error (vectors of floats)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    ndethist: Spark DataFrame Column
        Column containing the number of detection by ZTF at 3 sigma (int)
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/default-model.obj` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> df = df.withColumn('pIa', rfscore_sigmoid_full(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    6

    >>> df.filter(df['pIa'] > 0.5).select(['rf_snia_vs_nonia', 'pIa']).show()
    +----------------+-----+
    |rf_snia_vs_nonia|  pIa|
    +----------------+-----+
    |           0.839|0.839|
    |           0.782|0.782|
    |           0.887|0.887|
    |           0.785|0.785|
    |            0.88| 0.88|
    |           0.777|0.777|
    +----------------+-----+
    <BLANKLINE>

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> args += [F.lit(model_path_sigmoid)]
    >>> df = df.withColumn('pIa', rfscore_sigmoid_full(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    6

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    mask *= (ndethist.astype(int) <= 20)

    list_of_sn_host = return_list_of_sn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_sn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/default-model_sigmoid.obj'
        clf = load_scikit_model(model)

    test_features = []
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        features = get_sigmoid_features_dev(pdf_sub)
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[0]

    return pd.Series(to_return)
def extract_features_rf_snia(jd, fid, magpsf, sigmapsf) -> pd.Series:
    """ Return the features used by the RF classifier.

    There are 12 features. Order is:
    a_g, b_g, c_g, snratio_g, chisq_g, nrise_g,
    a_r, b_r, c_r, snratio_r, chisq_r, nrise_r

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error

    Returns
    ----------
    features: pd.Series of str
        One comma-separated string of features per alert.

    Examples
    ----------
    >>> from pyspark.sql.functions import split
    >>> from pyspark.sql.types import FloatType
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + feature extraction
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('features', extract_features_rf_snia(*args))

    >>> for name in RF_FEATURE_NAMES:
    ...     index = RF_FEATURE_NAMES.index(name)
    ...     df = df.withColumn(name, split(df['features'], ',')[index].astype(FloatType()))

    # Trigger the computation and check one feature value
    >>> df.agg({RF_FEATURE_NAMES[0]: "min"}).collect()[0][0]
    -2663.2421875
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    test_features = []
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        features = get_sigmoid_features_dev(pdf_sub)
        test_features.append(features)

    to_return_features = np.zeros(
        (len(jd), len(RF_FEATURE_NAMES)), dtype=float)
    to_return_features[mask] = test_features

    concatenated_features = [
        ','.join(np.array(i, dtype=str)) for i in to_return_features
    ]

    return pd.Series(concatenated_features)
def rfscore_kn_pca(jd, fid, magpsf, sigmapsf, model=None, num_pc_components=None, min_flux_threshold=None) -> pd.Series:
    """ Return the probability of an alert to be a kilonova using a Random
    Forest Classifier trained on PCA coefficients.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/kilonova_model.pkl` is loaded.
    num_pc_components: int, optional
        Number of principal components to be considered for the fit.
        Default is None, in which case the default value of 3 is assigned.
    min_flux_threshold: int, optional
        Minimum value of amplitude of a band for prediction.
        Default is None, in which case the default value of 200 is assigned.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-kilonova) and 1 (kilonova).

    Examples
    ----------
    >>> from fink_science.utilities import concat_col

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = ['cjd', 'cfid', 'cmagpsf', 'csigmapsf']
    >>> df = df.withColumn('pKN', rfscore_kn_pca(*args))

    >>> df.agg({"pKN": "min"}).collect()[0][0] >= 0.0
    True
    """
    if num_pc_components is None:
        num_pc_components = 3
    if min_flux_threshold is None:
        min_flux_threshold = 200

    # The default model was trained with 3 principal components
    if num_pc_components != 3:
        raise ValueError(
            'num_pc_components must be 3 for the default kilonova model, '
            'got {}'.format(num_pc_components))

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/kilonova_model.pkl'
        clf = load_scikit_model(model)

    bands = ['g', 'r']
    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf_id = [id] * len(flux)
        pdf = pd.DataFrame.from_dict({
            'SNID': [int(i) for i in pdf_id],
            'MJD': [int(i) for i in jd[id]],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })

        # Extract PCA coefficients for each band
        pc = PredictLightCurve(pdf, object_id=pdf['SNID'][0])
        features = pc.predict_lc_coeff(
            num_pc_components=num_pc_components,
            min_flux_threshold=min_flux_threshold,
            bands=bands,
            band_choice='u')
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be KN
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[1]

    return pd.Series(to_return)
def rfscore_sigmoid(jd, fid, magpsf, sigmapsf, model=None) -> pd.Series:
    """ Return the probability of an alert to be a SNe Ia using a
    Random Forest Classifier (sigmoid fit).

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/default-model.obj` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('pIa', rfscore_sigmoid(*args))

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in what_prefix] + [F.lit(model_path_sigmoid)]
    >>> df = df.withColumn('pIa', rfscore_sigmoid(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.agg({"pIa": "min"}).collect()[0][0]
    0.0

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 3
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # Load pre-trained model `clf`
    if model is not None:
        clf = load_scikit_model(model.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/../data/models/default-model_sigmoid.obj'
        clf = load_scikit_model(model)

    test_features = []
    ids = pd.Series(range(len(jd)))
    for id in ids[mask]:
        # compute flux and flux error
        data = [mag2fluxcal_snana(*args) for args in zip(magpsf[id], sigmapsf[id])]
        flux, error = np.transpose(data)

        # make a Pandas DataFrame with exploded series
        pdf = pd.DataFrame.from_dict({
            'SNID': [id] * len(flux),
            'MJD': jd[id],
            'FLUXCAL': flux,
            'FLUXCALERR': error,
            'FLT': pd.Series(fid[id]).replace({1: 'g', 2: 'r'})
        })

        features = get_sigmoid_features_dev(pdf)
        test_features.append(features)

    # Make predictions
    probabilities = clf.predict_proba(test_features)

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities.T[0]

    return pd.Series(to_return)
def snn_ia(candid, jd, fid, magpsf, sigmapsf, roid, cdsxmatch, jdstarthist, model_name, model_ext=None) -> pd.Series:
    """ Compute probabilities of alerts to be SN Ia using SuperNNova

    Parameters
    ----------
    candid: Spark DataFrame Column
        Candidate IDs (int64)
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    roid: Spark DataFrame Column
        Flag from the asteroid catcher module (int)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    jdstarthist: Spark DataFrame Column
        Column containing the first time variability has been seen (float)
    model_name: Spark DataFrame Column
        SuperNNova pre-trained model. Currently available:
            * snn_snia_vs_nonia
            * snn_sn_vs_all
    model_ext: Spark DataFrame Column, optional
        Path to the trained model (overwrite `model`). Default is None

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.asteroids.processor import roid_catcher
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Add SIMBAD field
    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Add SSO field
    >>> args_roid = [
    ...     'cjd', 'cmagpsf',
    ...     'candidate.ndethist', 'candidate.sgscore1',
    ...     'candidate.ssdistnr', 'candidate.distpsnr1']
    >>> df = df.withColumn('roid', roid_catcher(*args_roid))

    # Perform the fit + classification (default model)
    >>> args = ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']
    >>> args += [F.col('roid'), F.col('cdsxmatch'), F.col('candidate.jdstarthist')]
    >>> args += [F.lit('snn_snia_vs_nonia')]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    7

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']]
    >>> args += [F.col('roid'), F.col('cdsxmatch'), F.col('candidate.jdstarthist')]
    >>> args += [F.lit(''), F.lit(model_path)]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    >>> df.filter(df['pIa'] > 0.5).count()
    7
    """
    # Keep only alerts with at least 2 valid measurements, a first detection
    # within the last 90 days, not flagged by the asteroid catcher (roid == 3),
    # and with a Simbad type compatible with a supernova host
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1
    mask *= jd.apply(lambda x: float(x[-1])) - jdstarthist.astype(float) <= 90
    mask *= roid.astype(int) != 3

    list_of_sn_host = return_list_of_sn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_sn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    if model_ext is not None:
        # take the first element of the Series
        model = model_ext.values[0]
    else:
        # Load pre-trained model
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/data/models/snn_models/{}/model.pt'.format(
            model_name.values[0])

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': [str(i) for i in candid[mask].values]
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Compute predictions
    ids, pred_probs = classify_lcs(pdf, model, 'cpu')

    # Reformat and re-index
    preds_df = reformat_to_df(pred_probs, ids=ids)
    preds_df.index = preds_df.SNID

    # Take only probabilities to be Ia
    to_return = np.zeros(len(jd), dtype=float)
    ia = preds_df.reindex([str(i) for i in candid[mask].values])
    to_return[mask] = ia.prob_class0.values

    # return probabilities to be Ia
    return pd.Series(to_return)
def snn_ia(candid, jd, fid, magpsf, sigmapsf, model=None) -> pd.Series:
    """ Compute probabilities of alerts to be SN Ia using SuperNNova

    Parameters
    ----------
    candid: Spark DataFrame Column
        Candidate IDs (int64)
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    model: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/<vanilla_S_0_...>.pt` is loaded.

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-Ia) and 1 (Ia).

    Examples
    ----------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    # Note that we can also specify a model
    >>> args = [F.col(i) for i in ['candid', 'cjd', 'cfid', 'cmagpsf', 'csigmapsf']] + [F.lit(model_path)]
    >>> df = df.withColumn('pIa', snn_ia(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.agg({"pIa": "min"}).collect()[0][0] >= 0.0
    True

    >>> df.agg({"pIa": "max"}).collect()[0][0] < 1.0
    True
    """
    # Load pre-trained model
    if model is None:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model = curdir + '/../data/models/vanilla_S_0_CLF_2_R_none_photometry_DF_1.0_N_global_lstm_32x2_0.05_128_True_mean_C.pt'
    else:
        # take the first element of the Series
        model = model[0]

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd,
        'SNID': [str(i) for i in candid.values]
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf.explode(),
        sigmapsf.explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid.explode().replace({1: 'g', 2: 'r'})
    })

    # Compute predictions
    ids, pred_probs = classify_lcs(pdf, model, 'cpu')

    # Reformat and re-index
    preds_df = reformat_to_df(pred_probs, ids=ids)
    preds_df.index = preds_df.SNID

    ia = preds_df.reindex([str(i) for i in candid.values])

    # return probabilities to be Ia
    return ia.prob_class0
def knscore(jd, fid, magpsf, sigmapsf, jdstarthist, cdsxmatch, ndethist, model_path=None, pcs_path=None, npcs=None) -> pd.Series:
    """ Return the probability of an alert to be a kilonova using a
    Random Forest Classifier.

    You need to run the SIMBAD crossmatch before.

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (vectors of floats)
    fid: Spark DataFrame Column
        Filter IDs (vectors of ints)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error (vectors of floats)
    jdstarthist: Spark DataFrame Column
        Column containing the first time variability has been seen (float)
    cdsxmatch: Spark DataFrame Column
        Type of object found in Simbad (string)
    ndethist: Spark DataFrame Column
        Column containing the number of detection by ZTF at 3 sigma (int)
    model_path: Spark DataFrame Column, optional
        Path to the trained model. Default is None, in which case the default
        model `data/models/KN_model_2PC.pkl` is loaded.
    pcs_path: Spark DataFrame Column, optional
        Path to the Principal Component file. Default is None, in which case
        `data/models/components.csv` is loaded.
    npcs: Spark DataFrame Column, optional
        Integer representing the number of Principal Components to use. It
        should be consistent with the training model used. Default is None
        (i.e. the default npcs for the default `model_path`, that is 2).

    Returns
    ----------
    probabilities: 1D np.array of float
        Probability between 0 (non-KNe) and 1 (KNe).

    Examples
    ----------
    >>> from fink_science.xmatch.processor import cdsxmatch
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    >>> colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    >>> df = df.withColumn('cdsxmatch', cdsxmatch(*colnames))

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification (default model)
    >>> args = [F.col(i) for i in what_prefix]
    >>> args += [F.col('candidate.jdstarthist'), F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> df = df.withColumn('pKNe', knscore(*args))

    >>> df.filter(df['pKNe'] > 0.5).count()
    0

    >>> df.filter(df['pKNe'] > 0.5).select(['rf_kn_vs_nonkn', 'pKNe']).show()
    +--------------+----+
    |rf_kn_vs_nonkn|pKNe|
    +--------------+----+
    +--------------+----+
    <BLANKLINE>

    # Note that we can also specify a model
    >>> extra_args = [F.col('candidate.jdstarthist'), F.col('cdsxmatch'), F.col('candidate.ndethist')]
    >>> extra_args += [F.lit(model_path), F.lit(comp_path), F.lit(2)]
    >>> args = [F.col(i) for i in what_prefix] + extra_args
    >>> df = df.withColumn('pKNe', knscore(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.filter(df['pKNe'] > 0.5).count()
    0
    """
    epoch_lim = [-50, 50]
    time_bin = 0.25
    flux_lim = 0

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1
    mask *= (ndethist.astype(int) <= 20)
    mask *= jd.apply(lambda x: float(x[-1])) - jdstarthist.astype(float) < 20

    list_of_kn_host = return_list_of_kn_host()
    mask *= cdsxmatch.apply(lambda x: x in list_of_kn_host)

    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pre-trained model `clf`
    if model_path is not None:
        model = load_scikit_model(model_path.values[0])
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        model_path = curdir + '/data/models/KN_model_2PC.pkl'
        model = load_scikit_model(model_path)

    # Load pcs
    if npcs is not None:
        npcs = int(npcs.values[0])
    else:
        npcs = 2
    if pcs_path is not None:
        pcs_path_ = pcs_path.values[0]
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        pcs_path_ = curdir + '/data/models/components.csv'
    pcs = load_pcs(pcs_path_, npcs=npcs)

    test_features = []
    filters = ['g', 'r']

    # extract features (all filters) for each ID
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        pdf_sub = pdf_sub[pdf_sub['FLUXCAL'] == pdf_sub['FLUXCAL']]
        features = extract_all_filters_fink(
            epoch_lim=epoch_lim, pcs=pcs,
            time_bin=time_bin, filters=filters,
            lc=pdf_sub, flux_lim=flux_lim)
        test_features.append(features)

    # Remove pathological values
    names_root = [
        'npoints_', 'residuo_'
    ] + [
        'coeff' + str(i + 1) + '_' for i in range(len(pcs.keys()))
    ] + ['maxflux_']

    columns = [i + j for j in ['g', 'r'] for i in names_root]

    matrix = pd.DataFrame(test_features, columns=columns)

    zeros = np.logical_or(
        matrix['coeff1_g'].values == 0,
        matrix['coeff1_r'].values == 0
    )

    matrix_clean = matrix[~zeros]

    # If all alerts are flagged as bad
    if np.shape(matrix_clean) == (0, len(get_features_name(npcs))):
        to_return = np.zeros(len(jd), dtype=float)
        return pd.Series(to_return)

    # Otherwise make predictions
    probabilities = model.predict_proba(matrix_clean.values)
    probabilities_notkne = np.zeros(len(test_features))
    probabilities_kne = np.zeros(len(test_features))

    probabilities_notkne[~zeros] = probabilities.T[0]
    probabilities_kne[~zeros] = probabilities.T[1]

    probabilities_ = np.array([probabilities_notkne, probabilities_kne]).T

    # Take only probabilities to be KNe
    to_return = np.zeros(len(jd), dtype=float)
    to_return[mask] = probabilities_.T[1]

    return pd.Series(to_return)
def extract_features_knscore(jd, fid, magpsf, sigmapsf, pcs_path=None, npcs=None) -> pd.Series:
    """ Extract features used by the Kilonova classifier (using a Random
    Forest Classifier).

    Parameters
    ----------
    jd: Spark DataFrame Column
        JD times (float)
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    pcs_path: Spark DataFrame Column, optional
        Path to the Principal Component file. Default is None, in which case
        `data/models/components.csv` is loaded.
    npcs: Spark DataFrame Column, optional
        Integer representing the number of Principal Components to use. It
        should be consistent with the training model used. Default is None,
        in which case 2 components are used (matching the default model).

    Returns
    ----------
    out: pd.Series of str
        One comma-separated string of features per alert.

    Examples
    ----------
    >>> from pyspark.sql.functions import split
    >>> from pyspark.sql.types import FloatType
    >>> from fink_science.utilities import concat_col
    >>> from fink_science.kilonova.lib_kn import get_features_name
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = ['jd', 'fid', 'magpsf', 'sigmapsf']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + feature extraction
    >>> args = [F.col(i) for i in what_prefix]
    >>> df = df.withColumn('features', extract_features_knscore(*args))

    >>> KN_FEATURE_NAMES_2PC = get_features_name(2)
    >>> for name in KN_FEATURE_NAMES_2PC:
    ...     index = KN_FEATURE_NAMES_2PC.index(name)
    ...     df = df.withColumn(name, split(df['features'], ',')[index].astype(FloatType()))

    # Trigger the computation and check one feature value
    >>> df.agg({KN_FEATURE_NAMES_2PC[0]: "min"}).collect()[0][0]
    0.0
    """
    epoch_lim = [-50, 50]
    time_bin = 0.25
    flux_lim = 0

    # Flag empty alerts
    mask = magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) > 1
    if len(jd[mask]) == 0:
        return pd.Series(np.zeros(len(jd), dtype=float))

    # add an exploded column with SNID
    df_tmp = pd.DataFrame.from_dict({
        'jd': jd[mask],
        'SNID': range(len(jd[mask]))
    })
    df_tmp = df_tmp.explode('jd')

    # compute flux and flux error
    data = [mag2fluxcal_snana(*args) for args in zip(
        magpsf[mask].explode(),
        sigmapsf[mask].explode())]
    flux, error = np.transpose(data)

    # make a Pandas DataFrame with exploded series
    pdf = pd.DataFrame.from_dict({
        'SNID': df_tmp['SNID'],
        'MJD': df_tmp['jd'],
        'FLUXCAL': flux,
        'FLUXCALERR': error,
        'FLT': fid[mask].explode().replace({1: 'g', 2: 'r'})
    })

    # Load pcs
    if npcs is not None:
        npcs = int(npcs.values[0])
    else:
        npcs = 2
    if pcs_path is not None:
        pcs_path_ = pcs_path.values[0]
    else:
        curdir = os.path.dirname(os.path.abspath(__file__))
        pcs_path_ = curdir + '/data/models/components.csv'
    pcs = load_pcs(pcs_path_, npcs=npcs)

    test_features = []
    filters = ['g', 'r']

    # extract features (all filters) for each ID
    for id in np.unique(pdf['SNID']):
        pdf_sub = pdf[pdf['SNID'] == id]
        pdf_sub = pdf_sub[pdf_sub['FLUXCAL'] == pdf_sub['FLUXCAL']]
        features = extract_all_filters_fink(
            epoch_lim=epoch_lim, pcs=pcs,
            time_bin=time_bin, filters=filters,
            lc=pdf_sub, flux_lim=flux_lim)
        test_features.append(features)

    to_return_features = np.zeros(
        (len(jd), len(get_features_name(npcs))), dtype=float)
    to_return_features[mask] = test_features

    concatenated_features = [
        ','.join(np.array(i, dtype=str)) for i in to_return_features
    ]

    return pd.Series(concatenated_features)