def url_checker(url_str):
    """
    Verifies that a URL answers a GET request successfully.

    Parameters
    -----------
    url_str : `str`
        URL of the website to evaluate.

    Raises
    ----------
    LSSUtils_Error : `Exception`
        If `url_str` is not a string, or if the GET request to `url_str`
        returns a status code other than 200.
    """
    prog_msg = fd.Program_Msg(__file__)
    # Only strings are accepted as URLs
    if not isinstance(url_str, str):
        raise LSSUtils_Error(
            '{0} `url_str` ({1}) is not a STRING!'.format(
                prog_msg, type(url_str)))
    # Anything other than HTTP 200 is treated as a missing website
    status_code = requests.get(url_str).status_code
    if status_code == 200:
        return
    raise LSSUtils_Error(
        '{0} `url_str` ({1}) does not exist!'.format(prog_msg, url_str))
def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads content of HDF5 file and converts it to a Pandas DataFrame

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` under which the pandas DataFrame is
        stored. It is passed unchanged to `pandas.read_hdf`.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `pandas.read_hdf` fails to read `hdf5_file`.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        # Raise the formatted message (which names the offending file)
        # rather than only the program prefix.
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        raise LSSUtils_Error(msg)

    return df
def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Converts an HDF5 file in pandas format into a plain HDF5 file that
    holds the table as a single structured (record) dataset.

    Parameters
    ----------
    df_file : str
        Path to the file containing the pandas DataFrame to be converted.

    hdf5_file : str
        Path to the output HDF5 file.

    key : str or NoneType, optional
        Key or path used both to read the DataFrame from `df_file` and to
        name the dataset in `hdf5_file`. If it is empty or `None`, the key
        returned by `read_pandas_hdf5(..., ret=True)` is used.

    mode : str, optional
        Mode used to open `hdf5_file` with `h5py.File`. This variable is
        set to `w` by default.
    """
    file_msg = fd.Program_Msg(__file__)
    # The file that must already exist is the input one
    fd.File_Exists(df_file)
    # Reading in DataFrame
    if not key:
        data, key = read_pandas_hdf5(df_file, key=None, ret=True)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    # Rearranging data: one (column name, dtype string) field per column.
    # A real list is required, `np.dtype` cannot consume a `zip` iterator.
    col_dtypes = data.dtypes
    data_dtypes = np.dtype([
        (col_name, col_dtype.str)
        for col_name, col_dtype in zip(col_dtypes.index.values,
                                       col_dtypes.values)
    ])
    dataset = np.recarray((len(data), ), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name]
    # Saving file to HDF5 format; the context manager closes the file
    # even when `create_dataset` fails.
    with h5py.File(hdf5_file, mode=mode) as hdf5_obj:
        hdf5_obj.create_dataset(key, data=dataset)
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
def pandas_df_to_hdf5_file(df, hdf5_file, key=None, mode='w', complevel=8):
    """
    Saves a `pandas.DataFrame` into a `pandas` HDF5 file.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to be converted and saved into a HDF5 file.

    hdf5_file : str
        Path to the output HDF5 file

    key : str or NoneType, optional
        Key or path, under which `df` will be saved in the `hdf5_file`.

    mode : {'w','a'}, optional
        Mode to handle `hdf5_file`. This value is set to `w` by default,
        which stands for `write`.

    complevel : int, optional
        Level of compression for `hdf5_file`, in the range 0-9.
        This is set to a default of 8.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `df.to_hdf` fails to write `hdf5_file`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Saving DataFrame to `hdf5_file`
    try:
        # `key` is passed by keyword; newer pandas versions deprecate
        # passing it positionally.
        df.to_hdf(hdf5_file, key=key, mode=mode, complevel=complevel)
    except Exception:
        msg = '{0} Could not create HDF5 file'.format(file_msg)
        raise LSSUtils_Error(msg)
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
def Bins_array_create(arr, base=10):
    """
    Builds evenly-spaced bin edges that enclose the values of an array.

    Parameters
    ----------
    arr : array_like
        One-dimensional collection of numbers.

    base : int or float, optional
        Spacing between consecutive elements of the output array.
        This variable is set to `10` by default.

    Returns
    ----------
    bins_arr : `numpy.ndarray`
        Values separated by `base`, starting at `myfloor(min(arr))` and
        produced by `numpy.arange` with a stop of
        `myceil(max(arr)) + 0.5 * base`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `arr` is not one-dimensional.
    """
    prog_msg = fd.Program_Msg(__file__)
    # Working copies of the inputs
    width = float(base)
    values = np.asarray(arr)
    # Only 1-dimensional input is supported
    if values.ndim != 1:
        raise LSSUtils_Error(
            '{0} The input array is not of dimension 1, but of `{1}`'.format(
                prog_msg, values.ndim))
    # Outer limits, rounded to multiples of `width` by the helpers
    lower = myfloor(values.min(), base=width)
    upper = myceil(values.max(), base=width)
    # Half a bin of padding on the stop value
    return np.arange(lower, upper + 0.5 * width, width)
def IDL_read_file(idl_file):
    """
    Reads an IDL file and converts it to a Python dictionary

    Parameters
    ----------
    idl_file : string
        Path to the filename being used

    Returns
    ----------
    idl_dict : python dictionary
        Dictionary with the data from `idl_file`, as returned by
        `readsav(idl_file, python_dict=True)`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `readsav` fails to read `idl_file`.
    """
    # Checking that file exists
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        # `{1}` (not a second `{0}`) so the message shows the file path
        msg = '{0} `idl_file` {1} is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict
def cookiecutter_paths(path='./'):
    """
    Collects (and creates) the main folders of a project that follows
    the `Data Science` cookiecutter template, described at:

    - https://drivendata.github.io/cookiecutter-data-science/

    Parameters
    ----------
    path : str, optional
        Path to a file within the `.git` repository.
        This variable is set to `./` by default.

    Returns
    ----------
    param_dict : python dictionary
        Dictionary with the keys `base_dir`, `plot_dir`, `src_dir`, and
        `data_dir`, each one pointing to the corresponding folder.

    Raises
    ----------
    LSSUtils_Error : exception
        If the directory returned by `git_root_dir` does not exist.
    """
    # Root of the repository, with a trailing slash
    root = git_root_dir(path) + '/'
    if not os.path.exists(root):
        raise LSSUtils_Error(
            '{0} `base_dir` ({1}) is not a Git directory! Exiting'.format(
                fd.Program_Msg(__file__), root))
    # Figures, source, and data folders
    figures = os.path.join(root, 'reports', 'figures/')
    source = os.path.join(root, 'src', 'data')
    datasets = os.path.join(root, 'data/')
    # Each folder is handed to `fd.Path_Folder`
    for folder in (figures, source, datasets):
        fd.Path_Folder(folder)

    return {
        'base_dir': root,
        'plot_dir': figures,
        'src_dir': source,
        'data_dir': datasets,
    }
def luminosity_to_absolute_mag(lum, filter_opt,
                               system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the absolute magnitude of object through the `filter_opt`
    filter.

    Parameters
    -----------
    lum : float, int, array_like
        Luminosity of 1 or more objects. In units of `solar luminosities`.

    filter_opt : str
        Magnitude filter to use. It is passed to `get_sun_mag`, which
        decides whether it is valid for the chosen `system`.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    abs_mag : float or `numpy.ndarray`
        Absolute magnitude of one or multiple objects, computed as
        ``abs_mag_sun - 2.5 * log10(lum)``.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # The value being validated is the input luminosity `lum`
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(lum, valid_types)):
        msg = '{0} `lum` ({1}) is not a valid type!'.format(
            file_msg, type(lum))
        raise LSSUtils_Error(msg)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Absolute magnitude calculation
    lum_sun = 1.0  # In units of solar luminosities
    # `np.asarray` lets plain lists take part in the division
    abs_mag = abs_mag_sun - 2.5 * np.log10(np.asarray(lum) / lum_sun)

    return abs_mag
def absolute_magnitude_to_luminosity(abs_mag, filter_opt,
                                     system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the luminosity of the object through `filter_opt` filter.

    Parameters
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude of one or multiple objects.

    filter_opt : str
        Magnitude filter to use. It is passed to `get_sun_mag`, which
        decides whether it is valid for the chosen `system`.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    log_L : float or array_like
        Logarithmic value of the luminosity in the `filter_opt` band,
        computed as ``0.4 * (abs_mag_sun - abs_mag)``.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(abs_mag, valid_types)):
        msg = '{0} `abs_mag` ({1}) is not a valid type!'.format(
            file_msg, abs_mag)
        raise LSSUtils_Error(msg)
    # Lists are accepted above, but cannot be subtracted from a float
    if isinstance(abs_mag, list):
        abs_mag = np.asarray(abs_mag)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Luminosity calculations
    log_L = (abs_mag_sun - abs_mag) * 0.4

    return log_L
def extract_catls(catl_kind='data', catl_type='mr', sample_s='19',
                  datatype='.hdf5', catl_info='members', halotype='fof',
                  clf_method=3, hod_n=0, clf_seed=1235, perf_opt=False,
                  return_len=False, print_filedir=True):
    """
    Lists the catalogue files that match a set of input parameters.

    Parameters
    ------------
    catl_kind : {'data', 'mocks'} str, optional
        Kind of catalogue. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Abundance matching method used for the CLF when assigning halo
        masses. This variable is set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample. This variable is set to '19' by default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    datatype : {'.hdf5'} str, optional
        Extension of the files to be indexed in the folder.
        This variable is set to '.hdf5' by default.

    catl_info : {'members', 'groups'} str, optional
        Which kind of catalogues to list.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    halotype : {'fof', 'so'} str, optional
        Type of dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assignment of (g-r) color, sersic, and
              log(ssfr)
            - `2` : (g-r) decides active/passive designation and draws
              values independently.
            - `3` : (g-r) decides active/passive designations, and assigns
              other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only relevant when `catl_kind == mocks`.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to
        `1235` by default.

    perf_opt : boolean, optional
        If True, the `perfect` set of synthetic catalogues is used.
        This variable is set to `False` by default.

    return_len : boolean, optional
        If True, the number of matching files is returned as well.

    print_filedir : boolean, optional
        If True, the output directory is printed onto the screen.

    Returns
    ------------
    catl_arr : `numpy.ndarray`
        Sorted array of the files with extension `datatype` found in the
        catalogue directory.

    n_catls : int
        Number of elements in `catl_arr`. Only returned if `return_len`
        is True.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        If an input parameter is not accepted, or if no files are found.
    """
    prog_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # (message template, value, accepted values), in evaluation order
    choice_checks = (
        ('{0} `catl_kind` ({1}) is not a valid input!',
            catl_kind, ['data', 'mocks']),
        ('{0} `catl_type` ({1}) is not a valid input!',
            catl_type, ['mr', 'mstar']),
        ('{0} `sample_s` ({1}) is not a valid input!',
            sample_s, ['19', '20', '21']),
        ('{0} `catl_info` ({1}) is not a valid input!',
            catl_info, ['members', 'groups']),
        ('{0} `halotype` ({1}) is not a valid input!',
            halotype, ['fof', 'so']),
        ('{0} `clf_method` ({1}) is not a valid input!',
            clf_method, [1, 2, 3]),
        ('{0} `hod_n` ({1}) is not a valid input!',
            hod_n, [0, 1]),
    )
    for template, value, accepted in choice_checks:
        if value not in accepted:
            raise LSSUtils_Error(template.format(prog_msg, value))
    # (message template, value, required type), in evaluation order
    type_checks = (
        ('{0} `perf_opt` ({1}) is not a valid type!', perf_opt, bool),
        ('{0} `print_filedir` ({1}) is not a valid type!',
            print_filedir, bool),
        ('{0} `return_len` ({1}) is not a valid type!', return_len, bool),
        ('{0} `datatype` ({1}) is not a valid type!', datatype, str),
    )
    for template, value, required in type_checks:
        if not isinstance(value, required):
            raise LSSUtils_Error(template.format(prog_msg, type(value)))
    ##
    ## Directory that holds the catalogues
    catl_dir = catl_sdss_dir(catl_kind=catl_kind,
                             catl_type=catl_type,
                             sample_s=sample_s,
                             catl_info=catl_info,
                             halotype=halotype,
                             clf_method=clf_method,
                             hod_n=hod_n,
                             clf_seed=clf_seed,
                             perf_opt=perf_opt,
                             print_filedir=print_filedir)
    ##
    ## Sorted array of the matching files
    catl_arr = np.sort(fd.Index(catl_dir, datatype))
    n_catls = len(catl_arr)
    if n_catls == 0:
        raise LSSUtils_Error(
            '{0} `catl_arr` contains 0 entries!'.format(prog_msg))
    ##
    ## Returning elements
    if return_len:
        return catl_arr, n_catls

    return catl_arr
def sdss_catl_clean(catl_pd, catl_kind, catl_info='members', reindex=True):
    """
    Drops the rows of a catalogue that carry `failed` values.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str
        Kind of catalogue.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Which kind of catalogue `catl_pd` is.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    reindex : boolean, optional
        If True, the index of the output catalogue is reset.

    Returns
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Rows of `catl_pd` left after removing `failed` values. For `data`
        both the sSFR and the stellar-mass columns are checked; for
        `mocks` only the sSFR column is checked.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    prog_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    if not isinstance(catl_pd, pd.DataFrame):
        raise LSSUtils_Error(
            '{0} `catl_pd` ({1}) is not a valid type!'.format(
                prog_msg, catl_pd))
    if catl_kind not in ['data', 'mocks']:
        raise LSSUtils_Error(
            '{0} `catl_kind` ({1}) is not a valid input!'.format(
                prog_msg, catl_kind))
    if catl_info not in ['members', 'groups']:
        raise LSSUtils_Error(
            '{0} `catl_info` ({1}) is not a valid input!'.format(
                prog_msg, catl_info))
    if not isinstance(reindex, bool):
        raise LSSUtils_Error(
            '{0} `reindex` ({1}) is not a valid type!'.format(
                prog_msg, type(reindex)))
    ##
    ## Values that flag a failed measurement
    bad_ssfr = [0, -99, -999, np.nan]
    bad_mstar = [-1, 0, np.nan]
    ##
    ## Column names for this kind of catalogue
    ssfr_col, mstar_col = catl_keys_prop(catl_kind=catl_kind,
                                         catl_info=catl_info,
                                         return_type='list')
    ##
    ## Rows to keep: sSFR is always checked, stellar mass only for `data`
    keep_mask = ~catl_pd[ssfr_col].isin(bad_ssfr)
    if catl_kind == 'data':
        keep_mask = keep_mask & ~catl_pd[mstar_col].isin(bad_mstar)
    cleaned = catl_pd[keep_mask]
    ##
    ## Reindexing
    if reindex:
        cleaned.reset_index(inplace=True, drop=True)

    return cleaned
def Behroozi_relation(log_mstar, z=0., return_mhalo_h0=False,
                      mstar_h0=False):
    """
    Returns the halo mass of a central galaxy as a function of its stellar
    mass.

    Parameters
    -----------
    log_mstar : `float` ,`np.ndarray`, or array-like
        Value or array of values of base-10 logarithm of stellar mass
        in h=1 solar mass units.

    z : int, float, `np.ndarray` or array-like
        Redshift of the halo hosting the galaxy. If passing an array,
        it must be of the same length as the input `log_mstar`.

    return_mhalo_h0 : `bool`, optional
        If True, the halo mass is multiplied by ``h = 0.7`` before its
        logarithm is returned (conversion back to ``h=1`` units).
        This variable is set to False by default.

    mstar_h0 : `bool`, optional
        If True, the stellar mass in `log_mstar` is divided by
        ``h**2`` with ``h = 0.7``, i.e. converted from ``h=1`` units
        to ``h=0.7`` units. This variable is set to False by default.

    Returns
    -----------
    log_halo_mass : float or `np.ndarray`
        Array or float containing 10-base logarithm of halo mass. It is
        converted to ``h=1`` solar mass units only when
        `return_mhalo_h0` is True.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted.

    Note
    ----------
    The parameter values in Behroozi+10 were fit to data assuming
    ``h=0.7``, which is why the two optional unit conversions use
    ``h = 0.7``.
    """
    file_msg = fd.Program_Msg(__file__)
    little_h = 0.7
    ## Checking input parameters
    numeric_valid_types = (int, float, np.ndarray, list)
    # `log_mstar`
    if not (isinstance(log_mstar, numeric_valid_types)):
        msg = '{0} `log_mstar` ({1}) is not a valid type!'.format(
            file_msg, type(log_mstar))
        raise LSSUtils_Error(msg)
    # `z`
    if not (isinstance(z, numeric_valid_types)):
        msg = '{0} `z` ({1}) is not a valid type!'.format(
            file_msg, type(z))
        raise LSSUtils_Error(msg)
    # `return_mhalo_h0`
    if not (isinstance(return_mhalo_h0, bool)):
        msg = '{0} `return_mhalo_h0` ({1}) is not a valid type!'.format(
            file_msg, type(return_mhalo_h0))
        raise LSSUtils_Error(msg)
    # `mstar_h0`
    if not (isinstance(mstar_h0, bool)):
        msg = '{0} `mstar_h0` ({1}) is not a valid type!'.format(
            file_msg, type(mstar_h0))
        raise LSSUtils_Error(msg)
    ##
    ## Lists are accepted above but do not support arithmetic, so they
    ## are turned into float arrays before any calculation.
    if isinstance(log_mstar, list):
        log_mstar = np.asarray(log_mstar, dtype=float)
    if isinstance(z, list):
        z = np.asarray(z, dtype=float)
    ##
    ## Behroozi dictionary
    param_dict = _retrieve_Behroozi_default_dict()
    ## Converting from different `h` units
    # A float base keeps integer input from using integer powers
    mstar = 10.**(log_mstar)
    if mstar_h0:
        mstar = mstar / (little_h**2)
    # Scale factor
    a = 1./(1. + z)
    ##
    ## Behroozi function
    logm0 = param_dict['smhm_m0_0'] + param_dict['smhm_m0_a']*(a - 1)
    m0 = 10.**logm0
    logm1 = param_dict['smhm_m1_0'] + param_dict['smhm_m1_a']*(a - 1)
    beta = param_dict['smhm_beta_0'] + param_dict['smhm_beta_a']*(a - 1)
    delta = param_dict['smhm_delta_0'] + param_dict['smhm_delta_a']*(a - 1)
    gamma = param_dict['smhm_gamma_0'] + param_dict['smhm_gamma_a']*(a - 1)
    #
    stellar_mass_by_m0 = mstar/m0
    term3_numerator = (stellar_mass_by_m0)**delta
    term3_denominator = 1. + (stellar_mass_by_m0)**(-gamma)

    log_halo_mass = logm1 + beta*np.log10(stellar_mass_by_m0)
    log_halo_mass += (term3_numerator/term3_denominator) - 0.5

    # convert back from h=0.7 to h=1 and return the result
    if return_mhalo_h0:
        return np.log10((10.**log_halo_mass)*little_h)

    return log_halo_mass
def sdss_catl_clean_nmin(catl_pd, catl_kind, catl_info='members', nmin=1,
                         perf_opt=False):
    """
    Cleans the catalogue removing `failed` values, and only includes
    galaxies that are in groups/halos above a `nmin` threshold.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str
        Type of catalogue to use.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    nmin : int, optional
        Minimum group richness to have in the (galaxy) group catalogue.
        This variable is set to `1` by default.

    perf_opt : boolean, optional
        Option for using a `perfect` mock catalogue.

    Returns
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Cleaned version of `catl_pd` after having removed `failed` values,
        and having chosen only galaxies within groups with a richness of
        at least `nmin`. Its index is reset.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted, or if a
        `groups` catalogue has no `ngals` column.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_info_valid = ['members', 'groups']
    # `catl_pd`
    if not (isinstance(catl_pd, pd.DataFrame)):
        msg = '{0} `catl_pd` ({1}) is not a valid type!'.format(
            file_msg, catl_pd)
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `nmin` - the type is tested first, so that a non-numeric value ends
    # in `LSSUtils_Error` rather than in a failed comparison.
    if not (isinstance(nmin, int) and (nmin > 0)):
        msg = '{0} `nmin` must be an integer and have a value above `0`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    ##
    ## Value of the galaxy-type column that is selected as `centrals`
    cens = 1
    ##
    ## Getting keys for catalogue (the first key is not needed here)
    (_, id_key, galtype_key) = catl_keys(catl_kind,
                                         return_type='list',
                                         perf_opt=perf_opt)
    ##
    ## Cleaning catalogue entries
    catl_pd_clean_all = sdss_catl_clean(catl_pd,
                                        catl_kind=catl_kind,
                                        catl_info=catl_info,
                                        reindex=True)
    ## Choosing only galaxies in groups of richness >= `nmin`
    if catl_info == 'members':
        # Keeping only groups that still contain their central galaxy
        cens_ids = catl_pd_clean_all.loc[
            catl_pd_clean_all[galtype_key] == cens, id_key]
        catl_pd_cl = catl_pd_clean_all[
            catl_pd_clean_all[id_key].isin(cens_ids)]
        # Number of member galaxies per group
        group_counts = Counter(catl_pd_cl[id_key])
        rich_ids = [
            group_id for group_id, ngals in group_counts.items()
            if ngals >= nmin
        ]
        catl_pd_clean = catl_pd_cl[catl_pd_cl[id_key].isin(rich_ids)]
    else:
        # Group catalogue: richness is read from the `ngals` column
        if 'ngals' not in catl_pd_clean_all.columns.tolist():
            msg = '{0} Key `ngals` not found in DataFrame ... Exiting!'
            msg = msg.format(file_msg)
            raise LSSUtils_Error(msg)
        catl_pd_clean = catl_pd_clean_all.loc[
            catl_pd_clean_all['ngals'] >= nmin]

    # A new frame is returned instead of resetting a slice in place
    return catl_pd_clean.reset_index(drop=True)
def _stack_bin_members(chunks):
    """Packs per-bin arrays into one array, even if their sizes differ."""
    try:
        return np.array(chunks)
    except ValueError:
        # Recent NumPy versions refuse ragged input; in that case every
        # bin is stored as one element of an object array.
        packed = np.empty(len(chunks), dtype=object)
        for idx, chunk in enumerate(chunks):
            packed[idx] = chunk
        return packed


def Stats_one_arr(x, y, base=1., arr_len=0, arr_digit='n', weights=None,
                  statfunc=np.nanmean, bin_statval='average',
                  return_perc=False, failval=np.nan):
    """
    Calculates statistics of `y` in bins of `x`.

    Parameters
    ----------
    x, y : array_like, shape(N,)
        Sets of elements for the 1st and 2nd observable

    base : float, optional
        Bin width in units of `x`. This variable is set to 1. by default.

    arr_len : int, optional
        Minimum number of elements in each bin of `x`. Bins with fewer
        elements are left out of every output.

    arr_digit : {'n', 'y', 'o'} str, optional
        Option for which elements to return.

        Options:
            - 'n' : Returns `x_stat`, `y_stat`, `y_std`, `y_std_err`
            - 'y' : Returns `x_stat`, `y_stat`, `y_std`, `y_std_err`,
              `x_bins_data`, `y_bins_data`
            - 'o' : Returns `x_bins_data`, `y_bins_data`

    weights : array_like or NoneType, optional
        Currently not used by this function.

    statfunc : {`numpy.nanmean`, `numpy.nanmedian`} function, optional
        Function applied to the data of each bin. By default, this
        variable is set to `numpy.nanmean`

    bin_statval : {'average', 'left', 'right'} str, optional
        Value of `x` reported for each bin: `statfunc` of the `x` values
        in the bin (`average`), or the left or right edge of the bin.
        This variable is set to `average` by default.

    return_perc : `bool`, optional
        If True, the output of `sigma_calcs` is appended as the last
        item of the returned list. This variable is set to False by
        default.

    failval : int, float, NoneType, or NaN, optional
        Value used when a bin does not hold enough data.
        This is set to `numpy.nan` by default

    Returns
    ----------
    x_stat, y_stat : array_like
        Binned values of `x` and `y`.

    y_std : array_like
        Standard deviation (`numpy.nanstd`) of `y` in each bin.

    y_std_err : array_like
        `y_std` divided by the square root of the number of elements in
        the bin, multiplied by 1.253 if `statfunc` is `numpy.nanmedian`.

    x_bins_data : array_like, optional
        Elements of `x` in each bin with spacing of `base`. Only returned
        if `arr_digit` == 'y' or 'o'

    y_bins_data : array_like, optional
        Elements of `y` in each bin with spacing of `base`. Only returned
        if `arr_digit` == 'y' or 'o'

    perc_lims : optional
        Output of `sigma_calcs`. Only returned if `return_perc` is True.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Verifying input values
    # `arr_digit`
    if arr_digit not in ('y', 'n', 'o'):
        msg = '{0} `arr_digit` ({1}) is not a valid input. Exiting'.format(
            file_msg, arr_digit)
        raise LSSUtils_Error(msg)
    # Array dimensions
    if not ((len(x) > 0) and (len(y) > 0)):
        msg = '{0} The arrays `x` and `y` must have at least one value'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    if not ((np.asarray(x).ndim == 1) and (np.asarray(y).ndim == 1)):
        msg = '{0} The arrays `x` and `y` must have dimension of `1`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `y` is selected with masks built from `x`
    if len(x) != len(y):
        msg = '{0} The arrays `x` and `y` must have the same length'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `arr_len`
    if not (arr_len >= 0):
        msg = '{0} `arr_len` ({1}) must be greater or equal than zero!'.format(
            file_msg, arr_len)
        raise LSSUtils_Error(msg)
    # `bin_statval`
    if not (bin_statval in ['average', 'left', 'right']):
        msg = '{0} `bin_statval` ({1}) is not a valid input! Exiting'.format(
            file_msg, bin_statval)
        raise LSSUtils_Error(msg)
    ##
    ## Converting arrays to numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)
    # Bins are compared with `>` below, hence the shift by one
    arr_len = int(arr_len - 1.) if arr_len != 0 else int(arr_len)
    ##
    ## Statistics calculations
    x_bins = Bins_array_create(x, base=base)
    x_digits = np.digitize(x, x_bins)
    ##
    ## Bins that meet the criteria of `arr_len`. Bin `ii` spans
    ## `x_bins[ii - 1]` to `x_bins[ii]`.
    x_digits_bins = [
        ii for ii in range(1, len(x_bins))
        if np.count_nonzero(x_digits == ii) > arr_len
    ]
    ## Elements in each bin
    x_chunks = [x[x_digits == ii] for ii in x_digits_bins]
    y_chunks = [y[x_digits == ii] for ii in x_digits_bins]
    x_bins_data = _stack_bin_members(x_chunks)
    y_bins_data = _stack_bin_members(y_chunks)

    def _per_bin(func, chunks):
        # `func` of every bin, or `failval` for an underpopulated bin
        return np.array([
            func(members) if len(members) > arr_len else failval
            for members in chunks
        ])

    ##
    ## Selecting data in bins
    if bin_statval == 'average':
        x_stat = _per_bin(statfunc, x_chunks)
    elif bin_statval == 'left':
        # Edges are looked up by bin number, so they stay aligned with
        # the data when some bins were left out.
        x_stat = np.array([x_bins[ii - 1] for ii in x_digits_bins])
    else:
        x_stat = np.array([x_bins[ii] for ii in x_digits_bins])
    ##
    ## Determining the values in `y`
    y_stat = _per_bin(statfunc, y_chunks)
    y_std = _per_bin(np.nanstd, y_chunks)
    y_std_err = _per_bin(
        lambda members: np.nanstd(members) / math.sqrt(len(members)),
        y_chunks)
    ##
    ## Correcting error if `statfunc` == `numpy.nanmedian`
    if statfunc == np.nanmedian:
        y_std_err *= 1.253
    ##
    ## Returning values
    if arr_digit == 'n':
        return_val = [x_stat, y_stat, y_std, y_std_err]
    elif arr_digit == 'y':
        return_val = [
            x_stat, y_stat, y_std, y_std_err, x_bins_data, y_bins_data
        ]
    else:
        return_val = [x_bins_data, y_bins_data]
    ## Percentiles go last
    if return_perc:
        # NOTE(review): `sigma_calcs` receives the binned statistic
        # `y_stat`, not the per-bin data - confirm this is intended.
        return_val.append(sigma_calcs(y_stat))

    return return_val
def get_sun_mag(filter_opt, system='SDSS_Blanton_2003_z0.1'):
    """
    Get solar absolute magnitude for a filter in a system.
    Taken from Duncan Campbell, and later modified.

    Parameters
    ----------
    filter_opt : str
        Magnitude filter to use. It must be a filter of the chosen
        `system`:

            - 'Binney_and_Merrifield_1998' : 'U', 'B', 'V', 'R', 'I',
              'J', 'H', 'K'
            - 'SDSS_Blanton_2003_z0.1' : 'u', 'g', 'r', 'i', 'z'

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    ----------
    abs_mag_sun : float
        Solar absolute magnitude in `filter_opt` using `system` parameters.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted

    Examples
    ----------
    >>> get_sun_mag('R', 'Binney_and_Merrifield_1998')
    4.42

    >>> get_sun_mag('V', 'Binney_and_Merrifield_1998')
    4.83

    >>> get_sun_mag('g', 'SDSS_Blanton_2003_z0.1')
    5.45
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    filter_arr = [
        'U', 'B', 'V', 'R', 'I', 'J', 'H', 'K', 'u', 'g', 'r', 'i', 'z'
    ]
    system_arr = ['Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1']
    # Input filter
    if not (filter_opt in filter_arr):
        msg = '{0} `filter_opt` ({1}) is not a valid option!'.format(
            file_msg, filter_opt)
        raise LSSUtils_Error(msg)
    # Input system
    if not (system in system_arr):
        msg = '{0} `system` ({1}) is not a valid option!'.format(
            file_msg, system)
        raise LSSUtils_Error(msg)
    ##
    ## Solar absolute magnitudes for each system
    abs_mag_sun_dict = {
        'Binney_and_Merrifield_1998': {
            'U': 5.61,
            'B': 5.48,
            'V': 4.83,
            'R': 4.42,
            'I': 4.08,
            'J': 3.64,
            'H': 3.32,
            'K': 3.28
        },
        'SDSS_Blanton_2003_z0.1': {
            'u': 6.80,
            'g': 5.45,
            'r': 4.76,
            'i': 4.58,
            'z': 4.51
        }
    }
    ## A filter can be valid in general and still not belong to `system`
    if filter_opt not in abs_mag_sun_dict[system]:
        msg = '{0} `filter_opt` ({1}) is not a proper key of `system` ({2})'
        msg = msg.format(file_msg, filter_opt, system)
        raise LSSUtils_Error(msg)

    return abs_mag_sun_dict[system][filter_opt]
def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files

    filetype : str, optional
        File format of the files in `directory` to be read.
        This is set to `hdf5` by default.

    foutput : str or NoneType
        If it is a string, it is the basename of the output file, which
        is saved in `directory` with the extension `filetype`.

    outonly : boolean, optional
        If True, it returns the concatenated `pandas.DataFrame`.
        If False, nothing is returned.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`. Only returned if `outonly` is True.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `directory` does not exist, or if no files with extension
        `filetype` are found in it.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Sorted list of the files to concatenate.
    # NOTE(review): assumes `fd.Index(path, extension)` is the intended
    # indexing helper, as used by `extract_catls` - confirm.
    files_arr = np.sort(fd.Index(directory, '.' + filetype))
    print('{0} Found `{1}` files'.format(file_msg, files_arr.size))
    if len(files_arr) == 0:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(
            file_msg, directory, filetype)
        raise LSSUtils_Error(msg)
    # Reading every (pandas) HDF5 file and concatenating them
    df_conc = pd.concat([read_pandas_hdf5(file_ii) for file_ii in files_arr],
                        ignore_index=True)
    # Saving the result only when an output basename was given
    if isinstance(foutput, str):
        foutput_file = os.path.join(directory,
                                    '{0}.{1}'.format(foutput, filetype))
        pandas_df_to_hdf5_file(df_conc, foutput_file, key='/Main')
        # Checking file exists
        fd.File_Exists(foutput_file)
        print('{0} Output file saved in: {1}'.format(
            file_msg, foutput_file))
    # If only outputting concatenated DataFrame
    if outonly:
        return df_conc
def scoring_methods(feat_arr, truth_arr, model=None, pred_arr=None,
        score_method='perc', threshold=0.1, perc=0.9):
    """
    Determines the overall score for given arrays, i.e. the `predicted`
    array and the `truth` array

    Parameters
    -----------
    feat_arr : `np.ndarray` or array-like, shape (n_samples, n_features)
        Array of features. It is passed to `model.predict` and
        `model.score` whenever `model` is used.

    truth_arr : `np.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `true` values for the `n_samples`
        observations.

    model : scikit-learn model object or `NoneType`
        Model used to compute the predictions when `pred_arr` is `None`,
        and to compute the score if ``score_method == 'model_score'``.
        This variable is set to `None` by default.

    pred_arr : `np.ndarray`, array-like, or `NoneType`
        Array of predicted values. If `None`, the predictions are computed
        with ``model.predict(feat_arr)``. If given, it is used as-is (it is
        ignored for ``score_method == 'model_score'``). If ``model`` is
        `None`, this variable must be a list or `np.ndarray`.
        This variable is set to `None` by default.

    score_method : {'perc', 'threshold', 'model_score', 'r2'} `str`, optional
        Type of scoring to use when determining how well an algorithm
        is performing.

        Options:
            - 'perc' : Value of the absolute error at the `perc` percentile
            - 'threshold' : Fraction of absolute errors that are `threshold` or less
            - 'model_score' : Uses ``model.score(feat_arr, truth_arr)``
            - 'r2': R-squared statistic from `skmetrics.r2_score`

    threshold : float, optional
        Value to use when calculating the error within `threshold` value
        from the truth. Must be non-negative.
        This variable is set to `0.1` by default.

    perc : float, optional
        Percentile, expressed as a fraction in [0, 1], used when
        ``score_method == 'perc'``. This variable is set to `0.9` by
        default.

    Returns
    -----------
    method_score : float
        Overall score from `pred_arr` to predict `truth_arr`.

    Raises
    -----------
    LSSUtils_Error : Exception
        If any of the input parameters has an invalid type or value, if
        both `model` and `pred_arr` are `None`, or if
        ``score_method == 'model_score'`` and `model` is `None`.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/model_evaluation.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `truth_arr`
    truth_arr_type_valid = (list, np.ndarray)
    if not (isinstance(truth_arr, truth_arr_type_valid)):
        msg = '{0} `truth_arr` ({1}) is not a valid input type'.format(
            file_msg, type(truth_arr))
        raise LSSUtils_Error(msg)
    # `score_method` - Type
    if not (isinstance(score_method, str)):
        msg = '{0} `score_method` ({1}) is not a valid input type'.format(
            file_msg, type(score_method))
        raise LSSUtils_Error(msg)
    # `score_method` - Value
    score_method_valid = ['perc', 'threshold', 'model_score', 'r2']
    if not (score_method in score_method_valid):
        msg = '{0} `score_method` ({1}) is not a valid input!'.format(
            file_msg, score_method)
        # The original raised with the bare `score_method` value instead of
        # the formatted message.
        raise LSSUtils_Error(msg)
    # `threshold` - Type
    threshold_valid = (float, int)
    if not (isinstance(threshold, threshold_valid)):
        msg = '{0} `threshold` ({1}) is not a valid input type'.format(
            file_msg, type(threshold))
        raise LSSUtils_Error(msg)
    # `threshold` - Value
    if not (threshold >= 0.):
        msg = '{0} `threshold` ({1}) must be larger than 0!'.format(
            file_msg, threshold)
        raise LSSUtils_Error(msg)
    ##
    ## Checking for `model` and `pred_arr`
    # Identity checks (`is None`) are required here: `array == None` is an
    # element-wise comparison and cannot be used as a truth value.
    if (model is None) and (pred_arr is None):
        msg = '{0} `model` and `pred_arr` cannot be both `None`. '
        msg += 'Only one can be `None`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `pred_arr` - Type
    pred_arr_valid = (list, np.ndarray)
    if (model is None) and not (isinstance(pred_arr, pred_arr_valid)):
        msg = '{0} `pred_arr` ({1}) is not a valid input type!'.format(
            file_msg, type(pred_arr))
        raise LSSUtils_Error(msg)
    ##
    ## Model method - The only method that does not need `pred_arr`
    if (score_method == 'model_score'):
        if model is None:
            msg = '{0} `model` cannot be `None` when `score_method` is '
            msg += '`model_score`!'
            msg = msg.format(file_msg)
            raise LSSUtils_Error(msg)
        return model.score(feat_arr, truth_arr)
    ##
    ## Predictions and truth as arrays, shared by the remaining methods
    if pred_arr is None:
        pred_arr = model.predict(feat_arr)
    pred_arr = np.asarray(pred_arr)
    truth_arr = np.asarray(truth_arr)
    ##
    ## Choosing scoring method
    if (score_method == 'perc'):
        # Percentile method
        pred_err = np.abs(pred_arr - truth_arr)
        # The original referenced an undefined `perc_val` here.
        method_score = scipy.stats.scoreatpercentile(pred_err, 100. * perc)
    elif (score_method == 'threshold'):
        # Threshold method
        pred_err = np.abs(pred_arr - truth_arr)
        pred_thresh = len(pred_err[pred_err <= threshold])
        # Explicit float division, so that the fraction is not truncated
        method_score = pred_thresh / float(len(pred_arr))
    else:
        # R-squared method
        method_score = skmetrics.r2_score(truth_arr, pred_arr)

    return method_score
def train_test_dataset(pred_arr, feat_arr, pre_opt='min_max',
        shuffle_opt=True, random_state=0, test_size=0.25):
    """
    Function to create the training and testing datasets for a given set
    of features array and predicted array.

    Parameters
    -----------
    pred_arr : `np.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `predicted values` (i.e. the targets).
        Its length must match that of `feat_arr`.

    feat_arr : `np.ndarray` or array-like, shape (n_samples, n_features)
        Array consisting of the features. A 1-dimensional input is
        treated as a single feature, and reshaped to `(n_samples, 1)`.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`. It is passed on to
        `data_preprocessing`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses the `sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    shuffle_opt : `bool`, optional
        If True, the data is shuffled before splitting into testing and
        training datasets. This variable is set to True by default.

    random_state : int, optional
        Random state number used for when splitting into training and
        testing datasets. This variable is set to `0` by default.

    test_size : float, optional
        Fraction of the catalogue that represents the `test` size of
        the testing dataset. This variable must be between (0,1).
        This variable is set to `0.25` by default.

    Returns
    -----------
    train_dict : `dict`
        Dictionary containing the `training` data from the catalogue.
        Keys: `X_train`, `Y_train` (rescaled features), and `X_train_ns`,
        `Y_train_ns` (features that were not rescaled).

    test_dict : `dict`
        Dictionary containing the `testing` data from the catalogue.
        Keys: `X_test`, `Y_test` (rescaled features), and `X_test_ns`,
        `Y_test_ns` (features that were not rescaled).

    Raises
    -----------
    LSSUtils_Error : Exception
        If any of the input parameters has an invalid type or value, or if
        `pred_arr` and `feat_arr` have different lengths.

    See also
    -----------
    data_preprocessing : Function to preprocess a dataset.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `pred_arr`
    pred_arr_type_valid = (list, np.ndarray)
    if not (isinstance(pred_arr, pred_arr_type_valid)):
        msg = '{0} `pred_arr` ({1}) is not a valid input type'.format(
            file_msg, type(pred_arr))
        raise LSSUtils_Error(msg)
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if not (pre_opt in pre_opt_valid):
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    # `shuffle_opt`
    # The original tested `shuffle_opt in (bool)`, which raises a TypeError
    # because `bool` is a type and not a container.
    if not (isinstance(shuffle_opt, bool)):
        msg = '{0} `shuffle_opt` ({1}) is not a valid input'.format(
            file_msg, shuffle_opt)
        raise LSSUtils_Error(msg)
    # `random_state`
    if not (isinstance(random_state, int)):
        msg = '{0} `random_state` ({1}) is not a valid input'.format(
            file_msg, random_state)
        raise LSSUtils_Error(msg)
    # `test_size`
    if not ((test_size > 0) and (test_size < 1.)):
        msg = '{0} `test_size` ({1}) must be in range (0,1)'.format(
            file_msg, test_size)
        raise LSSUtils_Error(msg)
    ##
    ## Checking dimensions of `pred_arr` and `feat_arr`
    pred_arr = np.asarray(pred_arr)
    feat_arr = np.asarray(feat_arr)
    # A 1-dimensional `feat_arr` is a single feature. It is turned into a
    # column, i.e. shape (n_samples, 1). The original reshape to
    # `(len(feat_arr),)` left the array unchanged. `pred_arr` is kept as is.
    if (feat_arr.ndim) == 1:
        feat_arr = feat_arr.reshape(-1, 1)
    # Shape
    if (len(pred_arr) != len(feat_arr)):
        msg = '{0} The shape of `pred_arr` ({1}) and `feat_arr` ({2}) must '
        msg += 'have the same length'
        msg = msg.format(file_msg, len(pred_arr), len(feat_arr))
        raise LSSUtils_Error(msg)
    ##
    ## Rescaling Dataset
    feat_arr_scaled = data_preprocessing(feat_arr, pre_opt=pre_opt)
    ##
    ## Splitting into `Training` and `Testing` datasets.
    # Both calls use the same `test_size`, `shuffle` and `random_state`.
    split_kwargs = { 'test_size': test_size,
                     'shuffle': shuffle_opt,
                     'random_state': random_state}
    # Scaled
    X_train, X_test, Y_train, Y_test = skms.train_test_split(
        feat_arr_scaled, pred_arr, **split_kwargs)
    # Not-scaled
    X_train_ns, X_test_ns, Y_train_ns, Y_test_ns = skms.train_test_split(
        feat_arr, pred_arr, **split_kwargs)
    ##
    ## Assigning `training` and `testing` datasets to dictionaries
    train_dict = {  'X_train': X_train, 'Y_train': Y_train,
                    'X_train_ns': X_train_ns, 'Y_train_ns': Y_train_ns}
    test_dict  = {  'X_test': X_test, 'Y_test': Y_test,
                    'X_test_ns': X_test_ns, 'Y_test_ns': Y_test_ns}

    return train_dict, test_dict
def catl_keys_prop(catl_kind, catl_info='members', return_type='list'):
    """
    Names of the columns that hold the specific star formation rate and the
    stellar mass, for a given kind of catalogue.

    Parameters
    ------------
    catl_kind : {'data', 'mocks'} str
        Type of catalogue to use.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use. This variable is set
        to `members` by default.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    return_type : {'list', 'dict'} str, optional
        Type of output to the be returned. This variable is set to `list`
        by default.

        Options:
            - 'list' : Returns the values as part of a list
            - 'dict' : Returns the values as part of a python dictionary

    Return
    ------------
    catl_objs : python dictionary or list
        Keys for the catalogue(s). As a list, the order is
        1) `logssfr_key`, 2) `logmstar_key`. As a dictionary, those two
        names are the keys.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if `catl_kind`, `catl_info`, or `return_type`
        is not one of the accepted values.

    Examples
    ------------
    >>> catl_keys_prop('data')
    ['logssfr', 'logMstar_JHU']

    >>> catl_keys_prop('mocks', catl_info='groups', return_type='list')
    ['logssfr', 'logMstar']
    """
    file_msg = fd.Program_Msg(__file__)
    ## Rejecting invalid inputs, one argument at a time
    if catl_kind not in ('data', 'mocks'):
        raise LSSUtils_Error(
            '{0} `catl_kind` ({1}) is not a valid input!'.format(
                file_msg, catl_kind))
    if catl_info not in ('members', 'groups'):
        raise LSSUtils_Error(
            '{0} `catl_info` ({1}) is not a valid input!'.format(
                file_msg, catl_info))
    if return_type not in ('list', 'dict'):
        raise LSSUtils_Error(
            '{0} `return_type` ({1}) is not a valid input!'.format(
                file_msg, return_type))
    ##
    ## (SSFR, stellar mass) keys for every (`catl_kind`, `catl_info`) pair
    prop_keys = {
        ('data', 'members'): ('logssfr', 'logMstar_JHU'),
        ('data', 'groups'): ('logssfr_tot', 'logMstar_tot'),
        ('mocks', 'members'): ('logssfr', 'logMstar'),
        ('mocks', 'groups'): ('logssfr', 'logMstar'),
    }
    ssfr_name, mstar_name = prop_keys[(catl_kind, catl_info)]
    ##
    ## Packaging the output
    if return_type == 'dict':
        return {'logssfr_key': ssfr_name, 'logmstar_key': mstar_name}

    return [ssfr_name, mstar_name]
def absolute_to_apparent_magnitude(abs_mag, lum_dist, unit='mpc'):
    """
    Calculates the apparent magnitude using the luminosity distance and
    the absolute magnitude.

    Parameters
    -----------
    abs_mag : float, int, or array_like
        Array of absolute magnitude(s)

    lum_dist : float, int, or array_like
        Array of luminosity distance(s) to the object(s), in units of
        `unit`.

    unit : {'pc', 'kpc', 'mpc'} str, optional
        Unit to use for `lum_dist`. This variable is set to `mpc` by
        default. When `pc`, the units are in parsecs, `kpc` is for
        kilo-parsecs, and `mpc` is for distances in mega-parsecs.

    Returns
    -----------
    app_mag : array_like, or float
        Array of apparent magnitude(s). `app_mag` is a float if both
        `abs_mag` and `lum_dist` are floats or ints.

    Raises
    -----------
    LSSUtils_Error : Exception
        If `abs_mag` is not a float, int, list or `np.ndarray`, or if
        `unit` is not one of the accepted values.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, np.ndarray, list, int)
    # Type for `abs_mag`
    if not (isinstance(abs_mag, valid_types)):
        msg = '{0} `abs_mag` ({1}) is not a valid type!'.format(
            file_msg, type(abs_mag))
        raise LSSUtils_Error(msg)
    # Value of `unit`
    # The distance modulus follows:
    #   app_mag - abs_mag = 5 * (np.log10(lum_dist) + a - 1)
    #   where a = 0 when [d] = parsecs
    #   where a = 3 when [d] = kiloparsecs
    #   where a = 6 when [d] = megaparsecs
    # The original list of valid units left out `kpc`, so that documented
    # option was always rejected.
    unit_exponents = {'pc': 0, 'kpc': 3, 'mpc': 6}
    if not (unit in list(unit_exponents)):
        msg = '{0} `unit` ({1}) is not a valid input!'.format(file_msg, unit)
        raise LSSUtils_Error(msg)
    a = unit_exponents[unit]
    ## Converting to float or array-type
    # `abs_mag` object
    if isinstance(abs_mag, (float, int)):
        abs_mag = float(abs_mag)
    elif isinstance(abs_mag, (list, np.ndarray)):
        abs_mag = np.asarray(abs_mag)
    # `lum_dist` object
    if isinstance(lum_dist, (float, int)):
        lum_dist = float(lum_dist)
    elif isinstance(lum_dist, (list, np.ndarray)):
        lum_dist = np.asarray(lum_dist)
    ##
    ## Calculations
    app_mag = abs_mag + 5. * (np.log10(lum_dist) - 1 + a)

    return app_mag
def absolute_magnitude_lim(z, mag_lim, cosmo=None, H0=100., verbose=True):
    """
    Absolute magnitude limit of a flux-limited survey, as a function of
    the redshift `z`.

    Parameters
    -----------
    z : float, int, or array_like
        Maximum redshift for a given flux-limited survey.

    mag_lim : float or int
        Apparent magnitude limit of the flux-limited survey.

    cosmo : `astropy.cosmology` object, optional
        Cosmology object from Astropy, used to compute the luminosity
        distance. If it evaluates to False (e.g. `None`, the default), a
        `FlatLambdaCDM` cosmology with `H0` and ``Om0 = 0.316`` is used.

    H0 : float or int, optional
        Hubble parameter value of the default cosmology. It is only used
        when no `cosmo` is given. This variable is set to 100 km/s/Mpc
        by default.

    verbose : boolean, optional
        If True, a message is printed when the default cosmology is used.

    Returns
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude limit in units of `abs_mag` + 5*log10(h),
        where `h` is the little Hubble parameter. It is the output of
        `apparent_to_absolute_magnitude`.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if `z`, `mag_lim`, or `H0` is not of an
        accepted type.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Validating the types of the inputs
    scalar_types = (float, int)
    # Redshift - Scalars, lists and arrays are accepted
    if not isinstance(z, scalar_types + (list, np.ndarray)):
        raise LSSUtils_Error(
            '{0} `z` ({1}) is not a valid type!'.format(file_msg, type(z)))
    # Magnitude limit - Scalars only
    if not isinstance(mag_lim, scalar_types):
        raise LSSUtils_Error(
            '{0} `mag_lim` ({1}) is not a valid type!'.format(
                file_msg, type(mag_lim)))
    # Hubble parameter value - Scalars only
    if not isinstance(H0, scalar_types):
        raise LSSUtils_Error(
            '{0} `H0` ({1}) is not a valid type!'.format(file_msg, type(H0)))
    ##
    ## Falling back to the default cosmology when none is provided
    if not cosmo:
        from astropy.cosmology import FlatLambdaCDM
        cosmo = FlatLambdaCDM(H0=H0, Om0=0.316)
        if verbose:
            print(">> Warning: No cosmology was specified. Using default:",
                cosmo)
    ##
    ## Luminosity distance at `z`, stripped of its units
    distance = cosmo.luminosity_distance(z).value

    ## Absolute magnitude that corresponds to `mag_lim` at that distance
    return apparent_to_absolute_magnitude(mag_lim, distance)
def catl_keys(catl_kind, perf_opt=False, return_type='list'):
    """
    Names of the columns that hold the group mass, the group ID, and the
    galaxy type, for a given kind of catalogue.

    Parameters
    ----------
    catl_kind : {'data', 'mocks'} str
        Type of catalogue to use.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    perf_opt : boolean, optional
        Option for using a `perfect` mock catalogue. It is ignored when
        ``catl_kind == 'data'``. This variable is set to `False` by
        default.

    return_type : {'list', 'dict'} str, optional
        Type of output to the be returned. This variable is set to `list`
        by default.

        Options:
            - 'list' : Returns the values as part of a list
            - 'dict' : Returns the values as part of a python dictionary

    Returns
    ----------
    catl_objs : python dictionary or list
        Keys for the catalogue(s). As a list, the order is
        1) `gm_key`, 2) `id_key`, 3) `galtype_key`. As a dictionary,
        those three names are the keys.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if `catl_kind` or `return_type` is not one of
        the accepted values, or if `perf_opt` is not a boolean.

    Examples
    ----------
    >>> catl_keys('data', perf_opt=False, return_type='list')
    ['M_h', 'groupid', 'galtype']

    >>> catl_keys('mocks', perf_opt=True, return_type='list')
    ['M_h', 'haloid', 'galtype']
    """
    file_msg = fd.Program_Msg(__file__)
    ## Rejecting invalid inputs, one argument at a time
    if catl_kind not in ('data', 'mocks'):
        raise LSSUtils_Error(
            '{0} `catl_kind` ({1}) is not a valid input parameter!'.format(
                file_msg, catl_kind))
    if return_type not in ('list', 'dict'):
        raise LSSUtils_Error(
            '{0} `return_type` ({1}) is not a valid input parameter'.format(
                file_msg, return_type))
    if not isinstance(perf_opt, bool):
        raise LSSUtils_Error(
            '{0} `perf_opt` ({1}) must be a boolean object!'.format(
                file_msg, type(perf_opt)))
    ##
    ## Choosing the set of keys. `perf_opt` only matters for the mocks.
    if catl_kind == 'data':
        key_names = ('M_h', 'groupid', 'galtype')
    elif perf_opt:
        key_names = ('M_h', 'haloid', 'galtype')
    else:
        key_names = ('M_group', 'groupid', 'g_galtype')
    ##
    ## Packaging the output
    if return_type == 'dict':
        return dict(zip(('gm_key', 'id_key', 'galtype_key'), key_names))

    return list(key_names)
def data_preprocessing(feat_arr, pre_opt='min_max'):
    """
    Rescales the array of features, to make the data more suitable for
    the machine learning algorithms.

    Parameters
    -----------
    feat_arr : `numpy.ndarray` or list
        Array of feature values. This array is used for training a
        ML algorithm.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`. This variable is set
        to `min_max` by default.

        Options:
            - 'min_max' : Uses `~sklearn.preprocessing.MinMaxScaler` with a feature range of (0,1)
            - 'standard' : Uses the `~sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `~sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    Returns
    -----------
    feat_arr_scaled : `numpy.ndarray`
        Rescaled version of `feat_arr` based on the choice of `pre_opt`.
        When ``pre_opt == 'no'``, it is `feat_arr` itself.

    Raises
    -----------
    LSSUtils_Error : Exception
        If `feat_arr` is not a list or `numpy.ndarray`, or if `pre_opt`
        is not one of the accepted values.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/preprocessing.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Rejecting invalid inputs
    if not isinstance(feat_arr, (list, np.ndarray)):
        raise LSSUtils_Error(
            '{0} `feat_arr` ({1}) is not a valid input type'.format(
                file_msg, type(feat_arr)))
    if pre_opt not in ('min_max', 'standard', 'normalize', 'no'):
        raise LSSUtils_Error(
            '{0} `pre_opt` ({1}) is not a valid input'.format(
                file_msg, pre_opt))
    ##
    ## No preprocessing - The input is handed back untouched
    if pre_opt == 'no':
        return feat_arr
    ## Rescaling to the range (0,1)
    if pre_opt == 'min_max':
        return skpre.MinMaxScaler(feature_range=(0,1)).fit_transform(feat_arr)
    ## Standardizing or normalizing - Fitting first, transforming afterwards
    if pre_opt == 'standard':
        fitted_scaler = skpre.StandardScaler().fit(feat_arr)
    else:
        fitted_scaler = skpre.Normalizer().fit(feat_arr)

    return fitted_scaler.transform(feat_arr)
def catl_sdss_dir(catl_kind='data', catl_type='mr', sample_s='19',
        catl_info='members', halotype='fof', clf_method=3, hod_n=0,
        clf_seed=1235, perf_opt=False, print_filedir=True):
    """
    Builds the path to the directory of a set of SDSS catalogues, and
    checks that the directory exists.

    Parameters
    -----------
    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by
        default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use. This variable is set
        to `members` by default.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the
        synthetic catalogues. Only used in the path of the `mocks`.
        This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies. Only used
        in the path of the `mocks`. This variable is set to `3` by default.

        Options:
            - `1` : Independent assigment of (g-r) color, sersic, and log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw values
                    independently.
            - `3` : (g-r) decides active/passive designations, and
                    assigns other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only used in the path of the `mocks`.
        This variable is set to `0` by default.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. Only used in the path of the
        `mocks`. This variable is set to `1235` by default.

    perf_opt : boolean, optional
        If True, it chooses the `perfect` set of synthetic catalogues.
        It cannot be True when ``catl_kind == 'data'``.
        This variable is set to `False` by default.

    print_filedir : boolean, optional
        If True, the output directory is printed onto the screen.
        This variable is set to `True` by default.

    Returns
    -----------
    filedir : str
        Path to the desired set of catalogues.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if any input parameter is not accepted, if
        `perf_opt` is True for ``catl_kind == 'data'``, or if the
        resulting directory does not exist.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Rejecting invalid choices, one argument at a time
    if catl_kind not in ('data', 'mocks'):
        raise LSSUtils_Error(
            '{0} `catl_kind` ({1}) is not a valid input!'.format(
                file_msg, catl_kind))
    if catl_type not in ('mr', 'mstar'):
        raise LSSUtils_Error(
            '{0} `catl_type` ({1}) is not a valid input!'.format(
                file_msg, catl_type))
    if sample_s not in ('19', '20', '21'):
        raise LSSUtils_Error(
            '{0} `sample_s` ({1}) is not a valid input!'.format(
                file_msg, sample_s))
    if catl_info not in ('members', 'groups'):
        raise LSSUtils_Error(
            '{0} `catl_info` ({1}) is not a valid input!'.format(
                file_msg, catl_info))
    if halotype not in ('fof', 'so'):
        raise LSSUtils_Error(
            '{0} `halotype` ({1}) is not a valid input!'.format(
                file_msg, halotype))
    if clf_method not in (1, 2, 3):
        raise LSSUtils_Error(
            '{0} `clf_method` ({1}) is not a valid input!'.format(
                file_msg, clf_method))
    if hod_n not in (0, 1):
        raise LSSUtils_Error(
            '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n))
    ## Both flags must be true booleans
    if not isinstance(perf_opt, bool):
        raise LSSUtils_Error(
            '{0} `perf_opt` ({1}) is not a valid type!'.format(
                file_msg, type(perf_opt)))
    if not isinstance(print_filedir, bool):
        raise LSSUtils_Error(
            '{0} `print_filedir` ({1}) is not a valid type!'.format(
                file_msg, type(print_filedir)))
    ##
    ## Name of the innermost folder, based on the type of catalogue
    if catl_info == 'members':
        leaf_dir = 'member_galaxy_catalogues'
    else:
        leaf_dir = 'group_galaxy_catalogues'
    ##
    ## The `perfect` catalogues only exist for the mocks
    if perf_opt:
        if catl_kind == 'data':
            msg = '{0} Invalid `catl_kind` ({1}) for when `perf_opt == True'
            raise LSSUtils_Error(msg.format(file_msg, catl_kind))
        leaf_dir = 'perfect_{0}'.format(leaf_dir)
    ##
    ## Folders between the output path and the catalogues
    if catl_kind == 'data':
        subdirs = ['SDSS', catl_kind, catl_type, 'Mr' + sample_s, leaf_dir]
    else:
        subdirs = [ 'SDSS', catl_kind,
                    'halos_{0}'.format(halotype),
                    'hod_model_{0}'.format(hod_n),
                    'clf_seed_{0}'.format(clf_seed),
                    'clf_method_{0}'.format(clf_method),
                    catl_type, 'Mr' + sample_s, leaf_dir]
    filedir = os.path.join(wp.get_output_path(), *subdirs)
    ##
    ## Making sure `filedir` exists
    if not os.path.exists(filedir):
        msg = '{0} `filedir` ({1}) does NOT exist! Check input variables'
        raise LSSUtils_Error(msg.format(file_msg, filedir))
    ##
    ## Printing out paths
    if print_filedir:
        print('{0} `filedir`: {1}'.format(file_msg, filedir))

    return filedir
def sigma_calcs(data_arr, type_sigma='std', perc_arr=[68., 95., 99.7],
        return_mean_std=False):
    """
    Calculates the 1-, 2-, and 3-sigma ranges for `data_arr`

    Parameters
    -----------
    data_arr : `numpy.ndarray` or list
        Array of values, from which to calculate percentiles or St. Dev.
        The statistics are computed along axis 0 for a 1-dimensional
        array, and along axis 1 otherwise.

    type_sigma : {'perc', 'std'} string, optional (default = 'std')
        Option for calculating either `percentiles` or `standard deviations`

        Options:
            - 'perc': calculates percentiles
            - 'std' : uses standard deviations as 1-, 2-, and 3-sigmas

    perc_arr : array_like, optional (default = [68., 95., 99.7])
        Array of percentiles to calculate. For ``type_sigma == 'std'``,
        only its length is used, i.e. the i-th entry corresponds to
        ``(i + 1)`` standard deviations.

    return_mean_std : boolean, optional (default = False)
        Option for returning mean and St. Dev. along with `sigma_dict`

    Return
    ----------
    sigma_dict : python dictionary
        Dictionary containing the lower and upper ranges for `data_arr`.
        The keys are the indices of `perc_arr`, and each value is an
        array with the lower limits first, and the upper limits second.

    mark_mean : array_like
        Array of the mean value of `data_arr`.
        Only returned if `return_mean_std == True`

    mark_std : array_like
        Array of the St. Dev. value of `data_arr`.
        Only returned if `return_mean_std == True`

    Raises
    ----------
    LSSUtils_Error : Exception
        If `data_arr` or `type_sigma` has an invalid type, or if
        `type_sigma` is not one of the accepted values.
    """
    # NOTE: the list default of `perc_arr` is never mutated in here, so it
    # is kept as is, in order to leave the signature unchanged.
    file_msg = fd.Program_Msg(__file__)
    ## Checking input variables
    # `data_arr`
    data_arr_valid_types = (np.ndarray, list)
    if not (isinstance(data_arr, data_arr_valid_types)):
        msg = '{0} `data_arr` ({1}) is not a valid type!'.format(
            file_msg, type(data_arr))
        raise LSSUtils_Error(msg)
    data_arr = np.asarray(data_arr)
    # `type_sigma`
    type_sigma_valid = ['perc', 'std']
    if not (isinstance(type_sigma, str)):
        msg = '{0} `type_sigma` ({1}) is not a valid type!'.format(
            file_msg, type(type_sigma))
        raise LSSUtils_Error(msg)
    if not (type_sigma in type_sigma_valid):
        msg = '{0} `type_sigma` ({1}) is not a valid input choice!'.format(
            file_msg, type_sigma)
        # The original built this message, but never raised the error.
        raise LSSUtils_Error(msg)
    ## Determining the axis along which to compute the statistics
    is_1d = (data_arr.ndim == 1)
    axis = 0 if is_1d else 1
    ##
    ## Mean and St. Dev. of `data_arr`. These are also used by the `std`
    ## option, so they are only computed once.
    mark_mean = np.nanmean(data_arr, axis=axis)
    mark_std  = np.nanstd(data_arr, axis=axis)
    ##
    ## Lower and upper limits for each entry of `perc_arr`
    sigma_dict = {}
    for ii, perc_ii in enumerate(perc_arr):
        if type_sigma == 'perc':
            # Percentiles enclosing the central `perc_ii` percent
            mark_lower = np.nanpercentile(data_arr, 50. - (perc_ii / 2.),
                axis=axis)
            mark_upper = np.nanpercentile(data_arr, 50. + (perc_ii / 2.),
                axis=axis)
        else:
            # `(ii + 1)` standard deviations around the mean
            mark_lower = mark_mean - ((ii + 1) * mark_std)
            mark_upper = mark_mean + ((ii + 1) * mark_std)
        sigma_ii = np.column_stack((mark_lower, mark_upper)).T
        # For a 1-dimensional input, there is a single (lower, upper) pair
        if is_1d:
            sigma_ii = sigma_ii.flatten()
        # Saving to dictionary
        sigma_dict[ii] = sigma_ii

    if return_mean_std:
        return sigma_dict, mark_mean, mark_std
    else:
        return sigma_dict
def spherematch(ra1, dec1, ra2, dec2, tol=None, nnearest=1, nthreads=1):
    """
    Determines the matches between two catalogues of sources with
    <ra, dec> coordinates.

    Parameters
    ----------
    ra1, dec1 : array_like
        Right ascension and declination of the 1st catalogue.
        Units are in `degrees`. Both must have the same shape.

    ra2, dec2 : array_like
        Right ascension and declination of the 2nd catalogue.
        Units are in `degrees`. Both must have the same shape.

    tol : float or None, optional
        How close (in degrees) a match has to be to count as a match.
        If None, all nearest neighbors for the 1st catalogue will be
        returned. It must be given when `nnearest == 0`.

    nnearest : int, optional
        The nth neighbor to find. E.g. 1 for the nearest nearby, 2 for the
        second nearest neighbor, etc. Particularly useful if you want to
        get the nearest *non-self* neighbor of a catalogue.
        To do this use::

        ``spherematch(ra, dec, ra, dec, nnearest=2)``

        if `nnearest == 0`, all matches within `tol` are requested.
        It cannot be negative.

    nthreads : int, optional
        Number of threads to use for calculation. This variable is set to
        1 by default. Must be at least 1.

    Returns
    ----------
    idx1 : int `numpy.ndarray`
        Indices of the 1st catalogue of the matches. Will never be
        larger than `ra1`/`dec1`.

    idx2 : int `numpy.ndarray`
        Indices of the 2nd catalogue of the matches. Will never be
        larger than `ra1`/`dec1`.

    ds : float `numpy.ndarray`
        Distance (in degrees) between the matches.

    Raises
    ----------
    LSSUtils_Error : Exception
        If the coordinates are not lists or `numpy.ndarray` objects, if
        `nnearest` is negative, if `nthreads` is smaller than 1, if the
        shapes of `ra` and `dec` of a catalogue differ, or if
        `nnearest == 0` while `tol` is None.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input arguments
    valid_types = (list, np.ndarray)
    # `ra1`
    if not (isinstance(ra1, valid_types)):
        msg = '{0} `ra1` ({1}) is not a valid type!'.format(file_msg,
            type(ra1))
        raise LSSUtils_Error(msg)
    # `dec1`
    if not (isinstance(dec1, valid_types)):
        msg = '{0} `dec1` ({1}) is not a valid type!'.format(file_msg,
            type(dec1))
        raise LSSUtils_Error(msg)
    # `ra2`
    if not (isinstance(ra2, valid_types)):
        msg = '{0} `ra2` ({1}) is not a valid type!'.format(file_msg,
            type(ra2))
        raise LSSUtils_Error(msg)
    # `dec2`
    if not (isinstance(dec2, valid_types)):
        msg = '{0} `dec2` ({1}) is not a valid type!'.format(file_msg,
            type(dec2))
        raise LSSUtils_Error(msg)
    # `nnearest`
    if nnearest < 0:
        msg = '{0} `nnearest` ({1}) must be larger than `0`!'.format(file_msg,
            nnearest)
        raise LSSUtils_Error(msg)
    # `threads`
    if nthreads < 1:
        msg = '{0} `nthreads` ({1}) must be larger than `1`!'.format(file_msg,
            nthreads)
        raise LSSUtils_Error(msg)
    ##
    ## Converting arguments into arrays for ease of use
    # `np.asarray` only copies when needed. The original
    # `np.array(..., copy=False)` raises a ValueError in NumPy >= 2.0
    # whenever a copy cannot be avoided, e.g. for list inputs.
    ra1  = np.asarray(ra1)
    dec1 = np.asarray(dec1)
    ra2  = np.asarray(ra2)
    dec2 = np.asarray(dec2)
    ## Checking shape
    # 1st catalogue
    if ra1.shape != dec1.shape:
        msg = '{0} The shape of `ra1` ({1}) does not mathc that of `dec1` ({2}).'
        msg = msg.format(file_msg, ra1.shape, dec1.shape)
        raise LSSUtils_Error(msg)
    # 2nd catalogue
    if ra2.shape != dec2.shape:
        msg = '{0} The shape of `ra2` ({1}) does not mathc that of `dec2` ({2}).'
        msg = msg.format(file_msg, ra2.shape, dec2.shape)
        raise LSSUtils_Error(msg)
    ##
    ## Converting spherical coordinates into cartesian coordinates
    # 1st catalogue
    x1, y1, z1 = _spherical_to_cartesian_fast(
        ra1.ravel(), dec1.ravel(), nthreads)
    coords1 = np.empty((x1.size, 3))
    coords1[:, 0] = x1
    coords1[:, 1] = y1
    coords1[:, 2] = z1
    # 2nd catalogue
    x2, y2, z2 = _spherical_to_cartesian_fast(
        ra2.ravel(), dec2.ravel(), nthreads)
    coords2 = np.empty((x2.size, 3))
    coords2[:, 0] = x2
    coords2[:, 1] = y2
    coords2[:, 2] = z2
    ##
    ## Finding nearest neighbors
    # The tree is built from the 2nd catalogue, and queried with the 1st one
    kdt = KDT(coords2)
    # Finding neighbors
    if nnearest == 1:
        idx_s2 = kdt.query(coords1)[1]
    elif (nnearest == 0) and (tol is not None): # if you want ALL matches!
        # Cartesian distance between two points that are `tol` degrees
        # apart in declination. It is used as the search radius.
        p1_x, p1_y, p1_z = _spherical_to_cartesian_fast(90., 0  , nthreads)
        p2_x, p2_y, p2_z = _spherical_to_cartesian_fast(90., tol, nthreads)
        # Converting to floats
        p1_x = float(p1_x)
        p1_y = float(p1_y)
        p1_z = float(p1_z)
        p2_x = float(p2_x)
        p2_y = float(p2_y)
        p2_z = float(p2_z)
        r = np.sqrt((p2_x - p1_x)**2 + (p2_y - p1_y)**2 + (p2_z - p1_z)**2)
        # NOTE(review): the `[0]` keeps only the first element of the
        # result of `query_ball_point`, which presumably are the matches
        # of the 1st object of `coords1` only. The distance calculation
        # below then pairs those with all of `ra1`/`dec1`. This looks
        # unintended -- confirm against the callers before relying on
        # `nnearest == 0`.
        idx_s2 = kdt.query_ball_point(coords1, r)[0]
    elif nnearest > 1:
        # Only the `nnearest`-th neighbor of each object is kept
        idx_s2 = kdt.query(coords1, nnearest)[1][:, -1]
    else:
        # Reached when `nnearest == 0` and `tol` is None
        msg = '{0} Invalid `nnearest` ({1})!'.format(file_msg, nnearest)
        raise LSSUtils_Error(msg)
    ##
    ## Calculating distance between matches
    ds = _great_circle_distance_fast(
            ra1         ,
            dec1        ,
            ra2[idx_s2] ,
            dec2[idx_s2],
            nthreads    )
    ##
    ## If `tol` is None, then all objects will have a match.
    idx_s1 = np.arange(ra1.size)
    ##
    ## Remove matches that are `beyond` the tolerance separation
    if (tol is not None) and (nnearest != 0):
        mask   = ds < tol
        idx_s1 = idx_s1[mask]
        idx_s2 = idx_s2[mask]
        ds     = ds    [mask]

    return idx_s1, idx_s2, ds
def catl_sdss_merge(catl_pd_ii, catl_kind='data', catl_type='mr',
    sample_s='19', halotype='fof', clf_method=3, hod_n=0, clf_seed=1235,
    perf_opt=False, return_memb_group=False, print_filedir=False):
    """
    Merges the member and group catalogues for a given set of input
    parameters, and returns a modified version of the galaxy group
    catalogues with added info about the galaxy groups.

    Parameters
    ------------
    catl_pd_ii : int
        Index of the catalogue to match,
        from :func:`~cosmoutils.mock_catalogues.catls_utils.extract_catls`
        function. A `float` is also accepted and is truncated with `int()`.

    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by
        default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assigment of (g-r) color, sersic, and
              log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw
              values independently.
            - `3` : (g-r) decides active/passive designations, and
              assigns other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only relevant when `catl_kind == mocks`.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to `1235`
        by default.

    perf_opt : boolean, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_memb_group : `bool`, optional
        If True, the function returns the member and group catalogues,
        along with the merged catalogue.
        It returns ``<memb_group_pd, memb_pd, group_pd>``

    print_filedir : boolean, optional
        If True, the output directory is printed onto the screen.

    Returns
    ------------
    memb_group_pd : `pandas.DataFrame`
        Combined version of the i-th member and group catalogues.
        It contains both galaxy and group information. Columns coming from
        the group catalogue carry a ``GG`` prefix, except for the shared
        group-ID column used for the merge.

    memb_pd : `pandas.DataFrame`
        Catalogue of the member galaxies of the i-th catalogue, sorted by
        the group-ID column. Only returned if `return_memb_group` is True.

    group_pd : `pandas.DataFrame`
        Catalogue of the groups of the i-th catalogue, sorted by the
        group-ID column and with the ``GG``-prefixed column names.
        Only returned if `return_memb_group` is True.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if any input parameter is not accepted, if
        `catl_pd_ii` exceeds the number of available member catalogues,
        or if the member and group catalogues do not contain the same
        number of unique group IDs.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `catl_pd_ii`
    if not isinstance(catl_pd_ii, (float, int)):
        # Report the offending parameter itself (not `catl_kind`).
        msg = '{0} `catl_pd_ii` ({1}) is not a valid input!'.format(
            file_msg, type(catl_pd_ii))
        raise LSSUtils_Error(msg)
    catl_pd_ii = int(catl_pd_ii)
    # Parameters restricted to a fixed set of values.
    # Tuples (not sets) so that membership keeps using `==` comparisons.
    choice_checks = (
        ('catl_kind' , catl_kind , ('data', 'mocks')),
        ('catl_type' , catl_type , ('mr', 'mstar')),
        ('sample_s'  , sample_s  , ('19', '20', '21')),
        ('halotype'  , halotype  , ('fof', 'so')),
        ('clf_method', clf_method, (1, 2, 3)),
        ('hod_n'     , hod_n     , (0, 1)),
    )
    for arg_name, arg_val, arg_valid in choice_checks:
        if arg_val not in arg_valid:
            msg = '{0} `{1}` ({2}) is not a valid input!'.format(
                file_msg, arg_name, arg_val)
            raise LSSUtils_Error(msg)
    # Boolean flags
    bool_checks = (
        ('perf_opt'         , perf_opt),
        ('return_memb_group', return_memb_group),
        ('print_filedir'    , print_filedir),
    )
    for arg_name, arg_val in bool_checks:
        if not isinstance(arg_val, bool):
            msg = '{0} `{1}` ({2}) is not a valid type!'.format(
                file_msg, arg_name, type(arg_val))
            raise LSSUtils_Error(msg)
    ##
    ## Extracting catalogues given input parameters
    # Keyword arguments shared by the member- and group-catalogue lookups
    catl_kwargs = dict(catl_kind=catl_kind,
                       catl_type=catl_type,
                       sample_s=sample_s,
                       halotype=halotype,
                       clf_method=clf_method,
                       hod_n=hod_n,
                       clf_seed=clf_seed,
                       perf_opt=perf_opt,
                       print_filedir=print_filedir)
    (memb_arr, memb_len) = extract_catls(catl_info='members',
                                         return_len=True,
                                         **catl_kwargs)
    # Checking number of catalogues
    # NOTE(review): only the upper bound is checked; a negative
    # `catl_pd_ii` is not rejected here.
    if catl_pd_ii > (memb_len - 1):
        msg = '{0} `catl_pd_ii` ({1}) is OUT of range ({2})!'.format(
            file_msg, catl_pd_ii, memb_len)
        raise LSSUtils_Error(msg)
    ##
    ## Extracting group catalogue
    # i-th Galaxy catalogue
    memb_path = memb_arr[catl_pd_ii]
    # Directory of the i-th Galaxy Group catalogue
    group_path = catl_sdss_dir(catl_info='groups', **catl_kwargs)
    ##
    ## Paths to catalogue
    # The group file name is derived from the member file name.
    memb_basename = os.path.basename(memb_path)
    if catl_kind == 'mocks':
        group_path += memb_basename.replace('memb', 'group')
    elif catl_kind == 'data':
        group_path += memb_basename.replace('Gals', 'Group')
    # Checking that file exists
    fd.File_Exists(group_path)
    ##
    ## Reading in Catalogues
    memb_pd  = fr.read_hdf5_file_to_pandas_DF(memb_path)
    group_pd = fr.read_hdf5_file_to_pandas_DF(group_path)
    ## Keys for the catalogues
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              perf_opt=perf_opt,
                                              return_type='list')
    ## Both catalogues must contain the same number of unique group IDs
    n_ids_memb  = len(np.unique(memb_pd[id_key]))
    n_ids_group = len(np.unique(group_pd[id_key]))
    if n_ids_memb != n_ids_group:
        msg  = '{0} Lengths of the 2 DataFrames (`memb_pd`, `group_pd`) '
        msg += 'do not match!'
        msg  = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    ## Sorting both catalogues by `id_key`
    # Member catalogue
    memb_pd.sort_values(by=id_key, inplace=True)
    memb_pd.reset_index(inplace=True, drop=True)
    # Group catalogue
    group_pd.sort_values(by=id_key, inplace=True)
    group_pd.reset_index(inplace=True, drop=True)
    ## Renaming columns
    # Prefix every group column with `GG`, except the merge key, so that
    # group columns cannot collide with member columns after the merge.
    g_colnames_dict = {col: 'GG' + col for col in group_pd.columns.values
                       if col != id_key}
    group_pd.rename(columns=g_colnames_dict, inplace=True)
    ##
    ## Merging the 2 DataFrames
    memb_group_pd = pd.merge(left=memb_pd,
                             right=group_pd,
                             how='left',
                             left_on=id_key,
                             right_on=id_key)
    ##
    ## Returning DataFrames
    if return_memb_group:
        return (memb_group_pd, memb_pd, group_pd)
    return memb_group_pd