Example #1
def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads content of HDF5 file and converts it to a Pandas DataFrame

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` for the pandas DataFrame and the normal
        HDF5 file.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        raise LSSUtils_Error(msg)

    return df
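# Usage sketch (illustrative only): a small round trip through an HDF5 file.
# It assumes this module is importable and that `pandas` has the PyTables
# backend installed; the file name 'example.h5' is hypothetical.
import pandas as pd

df_in = pd.DataFrame({'x': [1, 2, 3], 'y': [4.0, 5.0, 6.0]})
df_in.to_hdf('example.h5', key='data', mode='w')
df_out = read_hdf5_file_to_pandas_DF('example.h5', key='data')
print(df_out.equals(df_in))  # True when the round trip preserves the data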
def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args : `argparse.Namespace`
        Input arguments to the script.
    """
    ## Define parser object
    description_msg = 'Description of Script'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Parsing Objects
    args = parser.parse_args()

    return args
Example #3
def pandas_df_to_hdf5_file(df, hdf5_file, key=None, mode='w', complevel=8):
    """
    Saves a `pandas.DataFrame` into a `pandas` HDF5 file.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to be converted and saved into a HDF5 file.

    hdf5_file : str
        Path to the output HDF5 file

    key : str or NoneType, optional
        Key or path, under which `df` will be saved in the `hdf5_file`.

    mode : {'w','a'}, optional
        Mode to handle `hdf5_file`. This value is set to `w` by default,
        which stands for `write`.

    complevel : int, optional
        Level of compression for `hdf5_file`. The value of `complevel` must
        be in the range 0-9. This is set to `8` by default.
    """
    file_msg = fd.Program_Msg(__file__)
    # Saving DataFrame to `hdf5_file`
    try:
        df.to_hdf(hdf5_file, key, mode=mode, complevel=complevel)
        msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
        print(msg)
    except Exception:
        msg = '{0} Could not create HDF5 file'.format(file_msg)
        raise LSSUtils_Error(msg)
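# Usage sketch (illustrative only): write a small DataFrame with the function
# above and read it back directly with `pandas`. The output path
# 'catalogue.h5' is hypothetical.
import pandas as pd

df_demo = pd.DataFrame({'mass': [10.2, 11.5], 'ssfr': [-10.1, -11.3]})
pandas_df_to_hdf5_file(df_demo, 'catalogue.h5', key='catl', complevel=8)
print(pd.read_hdf('catalogue.h5', key='catl'))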
Example #4
def url_checker(url_str):
    """
    Checks if the URL is valid or not.

    Parameters
    -----------
    url_str : `str`
        URL of the website to evaluate.

    Raises
    ----------
    LSSUtils_Error : `Exception`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    if not (isinstance(url_str, str)):
        msg = '{0} `url_str` ({1}) is not a STRING!'.format(
            file_msg, type(url_str))
        raise LSSUtils_Error(msg)
    ##
    ## Checking Website
    request_url = requests.get(url_str)
    if (request_url.status_code != 200):
        msg = '{0} `url_str` ({1}) does not exist!'.format(file_msg, url_str)
        raise LSSUtils_Error(msg)
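# Minimal sketch of the same check done with `requests` directly (the URL is
# only an example); note that `url_checker` raises an exception instead of
# returning a boolean.
import requests

response = requests.get('https://www.python.org')
print(response.status_code == 200)  # True when the page is reachable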
Example #5
def IDL_read_file(idl_file):
    """
    Reads an IDL file and converts it to a Python dictionary

    Parameters
    ----------
    idl_file : string
        Path to the filename being used

    Returns
    ----------
    idl_dict : python dictionary
        Dictionary with the data from `idl_file`
    """
    # Checking that file exists
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        msg = '{0} `idl_file` ({1}) is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict
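# Usage sketch (the '.sav' path is hypothetical): the returned object is a
# plain Python dictionary, so its contents can be inspected by key.
idl_data = IDL_read_file('observations.sav')
for name, value in idl_data.items():
    print(name, type(value))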
Example #6
def cookiecutter_paths(path='./'):
    """
    Paths to main folders in the `Data Science` cookiecutter template.
    This structure was taken from :
    - https://drivendata.github.io/cookiecutter-data-science/

    Parameters
    ----------
    path : str, optional
        Path to the file within the `.git` repository

    Returns
    ----------
    param_dict : python dictionary
        Dictionary with info of the project that uses the Data Science
        cookiecutter template.

    Raises
    ----------
    LSSUtils_Error : exception
        If `path` is not within a .git directory, it raises an error.
    """
    # Base Path
    base_dir = git_root_dir(path) + '/'
    # Checking that directory exists
    if os.path.exists(base_dir):
        # Plot Directory
        plot_dir = os.path.join(base_dir, 'reports', 'figures/')
        # Source directory
        src_dir = os.path.join(base_dir, 'src', 'data/')
        # Data path
        data_dir = os.path.join(base_dir, 'data/')
        # External path
        ext_dir = os.path.join(data_dir, 'external/')
        # Processed path
        proc_dir = os.path.join(data_dir, 'processed/')
        # Interim path
        int_dir = os.path.join(data_dir, 'interim/')
        # Raw path
        raw_dir = os.path.join(data_dir, 'raw/')
        # Creating folders
        for dir_ii in [plot_dir, src_dir, data_dir]:
            fd.Path_Folder(dir_ii)
        # Saving to dictionary
        param_dict = {}
        param_dict['base_dir'] = base_dir
        param_dict['plot_dir'] = plot_dir
        param_dict['src_dir'] = src_dir
        param_dict['data_dir'] = data_dir
        param_dict['ext_dir'] = ext_dir
        param_dict['proc_dir'] = proc_dir
        param_dict['int_dir'] = int_dir
        param_dict['raw_dir'] = raw_dir
    else:
        msg = '{0} `base_dir` ({1}) is not a Git directory! Exiting'.format(
            fd.Program_Msg(__file__), base_dir)
        raise LSSUtils_Error(msg)

    return param_dict
Example #7
def url_file_list(url, ext):
    """
    Lists the files from a URL that have a specific file extension.

    Parameters
    -----------
    url : `str`
        String of the URL

    ext : `str`
        File extension of the files in the URL.

    Returns
    -----------
    files_arr : `numpy.ndarray`, shape (N,)
        Array of the files in `url` that match the file extension `ext`.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking for file type
    # 'URL'
    if not isinstance(url, str):
        msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(url))
        raise TypeError(msg)
    # File extension
    if not isinstance(ext, str):
        msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(ext))
        raise TypeError(msg)
    ## Reformatting URL
    # Removing whitespaces
    url = url.strip()
    # Removing trailing slash
    if url.endswith('/'):
        url = url[:-1]
    # Checking if URL exists
    url_checker(url)
    # Reading in HTML from page
    page = requests.get(url).text
    # Converting to BeautifulSoup format
    soup = BeautifulSoup(page, 'html.parser')
    ## Obtaining list of files
    # Removing files that are NOT strings
    files_arr_pre = np.array([
        xx.get('href') for xx in soup.find_all('a')
        if isinstance(xx.get('href'), str)
    ])
    # Only those finishing with certain extension
    files_pre_ext = np.array([xx for xx in files_arr_pre if xx.endswith(ext)])
    # Prepending `url` to relative links (those without '//')
    files_pre_web = np.array([(url + '/' + xx) if not ('//' in xx) else xx
                              for xx in files_pre_ext])
    # Sorting out file array
    files_arr = np.sort(files_pre_web)

    return files_arr
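# Usage sketch (illustrative URL and extension): list every '.hdf5' file
# linked from a directory-style web page. This relies on `requests`,
# `beautifulsoup4`, and `numpy`, as used by the function above.
hdf5_links = url_file_list('https://www.example.com/catalogues', 'hdf5')
for link in hdf5_links:
    print(link)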
Example #8
def read_pandas_hdf5(hdf5_file, key=None, ret=False):
    """
    Reads a HDF5 file that contains one or many datasets.
    It converts it into a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file containing one or more pandas DataFrame(s).

    key : str or NoneType
        If provided, it will extract the `key` value as a pandas DataFrame.
        This value is set to `None` by default.

    ret : `bool`, optional
        If True, it returns the value of the `key`.
        By default, it is set to False.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from the `hdf5_file` with the data from the `key` directory
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that file exists
    fd.File_Exists(hdf5_file)
    # Checking number of keys
    hdf5_obj = pd.HDFStore(hdf5_file)
    hdf5_keys = [ii for ii in hdf5_obj.keys()]
    hdf5_obj.close()
    # Reading in HDF5 file
    if key is None:
        try:
            df = pd.read_hdf(hdf5_file)
            if ret:
                return df, hdf5_keys[0]
            else:
                return df
        except Exception:
            msg = '{0} Must specify which key to use:\n\t'.format(file_msg)
            msg += 'Possible keys: \n'
            print(msg)
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}:  {1}'.format(key_i, name))
    else:
        if key not in hdf5_keys:
            print('{0} Key not in the file: '.format(file_msg))
            print('Possible Keys:\n')
            for key_i, name in enumerate(hdf5_keys):
                print('\t Key {0}:  {1}'.format(key_i, name))
        else:
            df = pd.read_hdf(hdf5_file, key=key)
            if ret:
                return df, key
            else:
                return df
Example #9
def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args : `argparse.Namespace`
        Input arguments to the script.
    """
    ## Define parser object
    description_msg = 'Downloads the necessary catalogues from the web'
    parser = ArgumentParser(description=description_msg,
                            formatter_class=SortingHelpFormatter,)
    ## 
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    # Type of survey
    parser.add_argument('-survey',
                        dest='survey',
                        help='Type of survey to produce. Choices: A, B, ECO',
                        type=str,
                        choices=['A','B','ECO'],
                        default='ECO')
    ## CPU Counts
    parser.add_argument('-cpu',
                        dest='cpu_frac',
                        help='Fraction of total number of CPUs to use',
                        type=float,
                        default=0.75)
    ## Option for removing file
    parser.add_argument('-remove',
                        dest='remove_files',
                        help="""
                        Delete files from previous analyses with same
                        parameters
                        """,
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Verbose
    parser.add_argument('-v','--verbose',
                        dest='verbose',
                        help='Option to print out project parameters',
                        type=_str2bool,
                        default=False)
    ## Parsing Objects
    args = parser.parse_args()

    return args
def reversed_arrays(x, y):
    """
    Determines if arrays increase or decrease monotonically.

    Parameters
    -----------
    x : `numpy.ndarray`
        Array containing the 1st set of values

    y : `numpy.ndarray`
        Array containing the 2nd set of values.

    Returns
    -----------
    mono_opt : `bool`
        If True, both `x` and `y` increase overall (i.e. their summed
        differences are positive). If False, at least one of them decreases.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Testing input arguments
    # x-array
    valid_types = (list, np.ndarray)
    if not (isinstance(x, valid_types)):
        msg = '{0} `x` ({1}) is not a valid type!'.format(file_msg, type(x))
        raise LSSUtils_Error(msg)
    # y-array
    valid_types = (list, np.ndarray)
    if not (isinstance(y, valid_types)):
        msg = '{0} `y` ({1}) is not a valid type!'.format(file_msg, type(y))
        raise LSSUtils_Error(msg)
    # x- and y-array shapes
    x = np.asarray(x)
    y = np.asarray(y)
    #
    # Checking if arrays increase or decrease monotonically
    x_diff = np.diff(x).sum()
    y_diff = np.diff(y).sum()
    # Monotonically increasing or decreasing
    if (x_diff > 0) and (y_diff > 0):
        mono_opt = True
    else:
        mono_opt = False

    return mono_opt
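# Quick sketch of the expected behaviour: the check is based on the summed
# differences of each array, so two increasing inputs return True.
import numpy as np

x_inc = np.arange(10)
y_inc = 2.0 * np.arange(10)
y_dec = y_inc[::-1]
print(reversed_arrays(x_inc, y_inc))  # True  (both increase)
print(reversed_arrays(x_inc, y_dec))  # False (`y` decreases)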
Example #11
def luminosity_to_absolute_mag(lum,
                               filter_opt,
                               system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the absolute magnitude of object through the `filter_opt`
    filter.

    Parameters
    -----------
    lum : float, int, array_like
        Luminosity of 1 or more objects. In units of `solar luminosities`.

    filter_opt : {'U', 'B', 'V', 'R', 'I', 'J', 'H', 'K'} str
        Magnitude filter to use.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude of one or multiple objects. Same type as `lum`

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(lum, valid_types)):
        msg = '{0} `lum` ({1}) is not a valid type!'.format(
            file_msg, type(lum))
        raise LSSUtils_Error(msg)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Absolute magnitude calculation
    # In units of solar luminosities
    lum_sun = 1.0
    # Absolute magnitude of objects
    abs_mag = abs_mag_sun - 2.5 * np.log10(lum / lum_sun)

    return abs_mag
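# Worked example of the underlying relation, M = M_sun - 2.5 * log10(L / L_sun).
# The solar absolute magnitude used here (4.76, roughly the SDSS r-band value)
# is only indicative; the function obtains the exact value from `get_sun_mag`.
import numpy as np

abs_mag_sun_r = 4.76           # assumed value, for illustration only
lum = 1.0e10                   # luminosity in solar units
print(abs_mag_sun_r - 2.5 * np.log10(lum))  # roughly -20.24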
Example #12
def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args : `argparse.Namespace`
        Input arguments to the script.
    """
    ## Define parser object
    description_msg = 'Description of Script'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument('-namevar',
                        '--long-name',
                        dest='variable_name',
                        help='Description of variable',
                        type=float,
                        default=0)
    ##
    parser.add_argument('-namevar1',
                        '--long-name1',
                        dest='variable_name1',
                        help='Description of variable',
                        type=_check_pos_val,
                        default=0.1)
    ## `Perfect Catalogue` Option
    parser.add_argument('-namevar2',
                        '--long-name2',
                        dest='variable_name2',
                        help='Description of variable',
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Parsing Objects
    args = parser.parse_args()

    return args
Example #13
def Bins_array_create(arr, base=10, return_tuple=False):
    """
    Generates an evenly-spaced array between the minimum and maximum value
    of a given array.

    Parameters
    ----------
    arr : array_like
        Array of numbers or floats

    base : `int` or `float`, optional
        Interval used to create the evenly-spaced array of elements

    return_tuple : `bool`, optional
        If `True`, the function returns a set of tuples for each bin. This
        variable  is set to `False` by default.

    Returns
    ----------
    bins_arr : `numpy.ndarray`
        Array of elements separated in intervals of `base`
    """
    file_msg = fd.Program_Msg(__file__)
    # Transforming input data
    base = float(base)
    arr = np.asarray(arr)
    # Checking array dimensions
    if arr.ndim != 1:
        msg = '{0} The input array is not of dimension 1, but of `{1}`'.format(
            file_msg, arr.ndim)
        raise LSSUtils_Error(msg)
    # Creating evenly-spaced array
    arr_min = myfloor(arr.min(), base=base)
    arr_max = myceil(arr.max(), base=base)
    bins_arr = np.arange(arr_min, arr_max + 0.5 * base, base)
    # Creating tuple if necessary
    if return_tuple:
        bins_arr_mod = (np.array([[bins_arr[ii], bins_arr[ii + 1]]
                                  for ii in range(len(bins_arr) - 1)]))
        return_obj = bins_arr_mod
    else:
        return_obj = bins_arr

    return return_obj
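# Usage sketch: with `base=0.5` the bin edges are spaced every 0.5 between the
# rounded-down minimum and rounded-up maximum of the input (the exact rounding
# comes from the `myfloor` and `myceil` helpers).
import numpy as np

arr_demo = np.array([0.2, 1.3, 2.7, 3.1])
print(Bins_array_create(arr_demo, base=0.5))
# Expected edges: 0.0, 0.5, 1.0, ..., 3.5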
Example #14
def absolute_magnitude_to_luminosity(abs_mag,
                                     filter_opt,
                                     system='SDSS_Blanton_2003_z0.1'):
    """
    Calculates the luminosity of the object through `filter_opt` filter.

    Parameters
    -----------
    abs_mag : float, int, or array_like
        Absolute magnitude of one or multiple objects.

    filter_opt : {'U', 'B', 'V', 'R', 'I', 'J', 'H', 'K'} str
        Magnitude filter to use.

    system : {'Binney_and_Merrifield_1998', 'SDSS_Blanton_2003_z0.1'} str
        Kind of filter to use.

        Options:
            - 'Binney_and_Merrifield_1998' : See Binney and Merrifield, 1998
            - 'SDSS_Blanton_2003_z0.1' : See Blanton et al. (2003) Eqn. 14.

    Returns
    -----------
    log_L : float or array_like
        Logarithmic value of the luminosity in the `filter_opt` band.

    Raises
    ----------
    LSSUtils_Error : Exception
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    valid_types = (float, int, list, np.ndarray)
    if not (isinstance(abs_mag, valid_types)):
        msg = '{0} `abs_mag` ({1}) is not a valid type!'.format(
            file_msg, type(abs_mag))
        raise LSSUtils_Error(msg)
    ## Obtaining Sun's absolute magnitude
    abs_mag_sun = get_sun_mag(filter_opt, system=system)
    ## Luminosity calculations
    log_L = (abs_mag_sun - abs_mag) * 0.4

    return log_L
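# Worked example of the inverse relation, log10(L / L_sun) = 0.4 * (M_sun - M),
# using the same indicative solar magnitude as above.
abs_mag_sun_r = 4.76           # assumed value, for illustration only
abs_mag = -20.24
print(0.4 * (abs_mag_sun_r - abs_mag))  # roughly 10, i.e. L ~ 1e10 L_sun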
Example #15
def reshape_arr_1d(arr):
    """
    Transforms the array into a 1-dimensional array, if necessary.

    Parameters
    -----------
    arr : `numpy.ndarray` or array-like
        Array to be converted into 1-dimensional array.

    Returns
    -----------
    arr_new : `numpy.ndarray` or array-like
        Converted array into 1-dimensional array if needed.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    arr_valid_types = (list, np.ndarray)
    # `arr`
    if not (isinstance(arr, arr_valid_types)):
        msg = '{0} `arr` ({1}) is not a valid input type!'.format(file_msg,
            type(arr))
        raise TypeError(msg)
    # Dimensions
    if (isinstance(arr, arr_valid_types)):
        if not (np.asarray(arr).ndim in [1, 2]):
            msg = '{0} The shape of `arr` ({1}) can only have 1 or 2 '
            msg += 'dimensions'
            msg = msg.format(file_msg, np.asarray(arr).ndim)
            raise LSSUtils_Error(msg)
    # Converting to Numpy array
    arr = np.asarray(arr)
    # Trying to reshape it
    if (arr.ndim == 2):
        if (arr.shape[1] == 1):
            arr = arr.reshape(len(arr),)

    return arr
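# Quick sketch: a column vector of shape (N, 1) is flattened to shape (N,),
# while a 1-dimensional input is returned unchanged.
import numpy as np

col = np.array([[1], [2], [3]])
print(reshape_arr_1d(col).shape)        # (3,)
print(reshape_arr_1d([1, 2, 3]).shape)  # (3,)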
Example #16
    def __init__(self, argname, argvalues):
        """
        Initializes class object.

        Parameters
        ----------
        argname : `str`
            Key of the element to change in main dictionary.
            It can only contain 1 word at a time.

        argvalues : array-like
            List of argvalues for each of the `argnames`.
            This list will be used to loop over the values and
            replace them into the main dictionary.

        Notes
        ----------
        This function loops over the many different elements in `argvalues`.
        This function is meant to be used as a `decorator` for some
        function whose input is a dictionary.
        """
        file_msg = fd.Program_Msg(__file__)
        ## Check input parameters
        # `argname`
        if not (isinstance(argname, str)):
            msg = '{0} `argname` ({1}) must be a string'.format(
                file_msg, type(argname))
            raise TypeError(msg)
        # `argvalues`
        if not (isinstance(argvalues, (tuple, list))):
            msg = '{0} `argvalues` ({1}) must be a tuple or list'.format(
                file_msg, type(argvalues))
            raise TypeError(msg)
        ## Assigning to class variables
        self.argname = argname
        self.argvalues = argvalues
        self.file_msg = file_msg
Example #17
def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Converts an HDF5 file in `pandas` format to a normal HDF5 file.

    Parameters
    ----------
    df_file : str
        Path to the `df_file` containing the pandas DataFrame to be converted

    hdf5_file : str
        Path to the output HDF5 file containing arrays as keys

    key : str or NoneType, optional
        Key or path in HDF5 file for the `df_file` and `hdf5_file`
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(df_file)
    # Reading in DataFrame
    if not key:
        data, key = read_pandas_hdf5(df_file, key=None, ret=True)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    # Rearranging data
    arr_names = data.dtypes.index.values
    dtypes_arr = data.dtypes.values
    dtypes_arr = np.array([x.str for x in dtypes_arr])
    data_dtypes = np.dtype(list(zip(arr_names, dtypes_arr)))
    dataset = np.recarray((len(data), ), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name]
    # Saving file to HDF5 format
    hdf5_obj = h5py.File(hdf5_file, mode=mode)
    hdf5_obj.create_dataset(key, data=dataset)
    hdf5_obj.close()
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
Example #18
def array_insert(arr1, arr2, axis=1):
    """
    Joins the two arrays into a `single` multi-dimensional array.

    Parameters
    ------------
    arr1 : `numpy.ndarray`
        1st array to merge

    arr2 : `numpy.ndarray`
        2nd array to merge

    axis : int, optional
        Axis along which `arr2` is inserted. This is set to `1` by default.

    Returns
    ----------
    arr_merged : `numpy.ndarray`
        Merged array from `arr1` and `arr2`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    arr_valid_types = (list, np.ndarray)
    # `arr1`
    if not (isinstance(arr1, arr_valid_types)):
        msg = '{0} `arr1` ({1}) is not array-like!'.format(
            file_msg, type(arr1))
        raise ValueError(msg)
    # `arr2`
    if not (isinstance(arr2, arr_valid_types)):
        msg = '{0} `arr2` ({1}) is not array-like!'.format(
            file_msg, type(arr2))
        raise ValueError(msg)
    #
    # Merging arrays
    arr_merged = np.insert(arr1, len(arr1.T), arr2, axis=axis)

    return arr_merged
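# Usage sketch: append `arr2` as an extra column of `arr1` via `np.insert`.
import numpy as np

arr1_demo = np.array([[1, 2], [3, 4]])
arr2_demo = np.array([5, 6])
print(array_insert(arr1_demo, arr2_demo, axis=1))
# [[1 2 5]
#  [3 4 6]]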
Example #19
def sdss_catl_clean(catl_pd, catl_kind, catl_info='members', reindex=True):
    """
    Cleans the catalogue by removing `failed` values.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    reindex : `bool`, optional
        If True, the output catalogue is re-indexed.

    Returns
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Cleaned version of `catl_pd`, after having removed `failed` values.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_info_valid = ['members', 'groups']
    # `catl_pd`
    if not (isinstance(catl_pd, pd.DataFrame)):
        msg = '{0} `catl_pd` ({1}) is not a valid type!'.format(
            file_msg, catl_pd)
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `reindex`
    if not (isinstance(reindex, bool)):
        msg = '{0} `reindex` ({1}) is not a valid type!'.format(
            file_msg, type(reindex))
        raise LSSUtils_Error(msg)
    #
    # Defining `failed` values
    ssfr_fail_arr = [0, -99, -999, np.nan]
    mstar_fail_arr = [-1, 0, np.nan]
    #
    # Getting keys for catalogues
    (logssfr_key, logmstar_key) = catl_keys_prop(catl_kind=catl_kind,
                                                 catl_info=catl_info,
                                                 return_type='list')
    #
    # Cleaning catalogue entries
    #
    # Data
    if catl_kind == 'data':
        # Clean version
        catl_pd_clean = catl_pd[~catl_pd[logssfr_key].isin(ssfr_fail_arr) & \
                                ~catl_pd[logmstar_key].isin(mstar_fail_arr)]
    # Mocks
    if catl_kind == 'mocks':
        # Clean version
        catl_pd_clean = catl_pd[~catl_pd[logssfr_key].isin(ssfr_fail_arr)]
    #
    # Reindexing
    if reindex:
        catl_pd_clean.reset_index(inplace=True, drop=True)

    return catl_pd_clean
Example #20
def train_test_dataset(pred_arr,
                       feat_arr,
                       pre_opt='min_max',
                       shuffle_opt=True,
                       random_state=0,
                       test_size=0.25,
                       reshape=False,
                       return_idx=False):
    """
    Function to create the training and testing datasets for a given set
    of features array and predicted array.

    Parameters
    -----------
    pred_arr : `pandas.DataFrame` `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `predicted values`. The dimensions of
        `pred_arr` are `n_samples` by `n_outcomes`, where `n_samples` is the
        number of observations, and `n_outcomes` the number of predicted
        outcomes.

    feat_arr : `numpy.ndarray`, `pandas.DataFrame` or array-like, shape (n_samples, n_features)
        Array consisting of the `features`. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples`
        is the number of observations, and `n_features` the number of
        features used.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses `sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    shuffle_opt : `bool`, optional
        If True, the data is shuffled before splitting into testing and
        training datasets. This variable is set to True by default.

    random_state : int, optional
        Random state number used for when splitting into training and
        testing datasets. If set, it will always have the same seed
        `random_state`. This variable is set to `0` by default.

    test_size : float, optional
        Percentage of the catalogue that represents the `test` size of
        the testing dataset. This variable must be between (0,1).
        This variable is set to `0.25` by default.

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    return_idx : `bool`, optional
        If `True`, it returns the indices of the `training` and `testing`
        datasets. This variable is set to `False` by default.

    Returns
    -----------
    train_dict : `dict`
        Dictionary containing the `training` data from the catalogue.

    test_dict : `dict`
        Dictionary containing the `testing` data from the catalogue.

    See also
    -----------
    data_preprocessing : Function to preprocess a dataset.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `pred_arr`
    pred_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not (isinstance(pred_arr, pred_arr_type_valid)):
        msg = '{0} `pred_arr` ({1}) is not a valid input type'.format(
            file_msg, type(pred_arr))
        raise LSSUtils_Error(msg)
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if not (pre_opt in pre_opt_valid):
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    # `shuffle_opt`
    shuffle_opt_type_valid = (bool)
    if not (isinstance(shuffle_opt, shuffle_opt_type_valid)):
        msg = '{0} `shuffle_opt` ({1}) is not a valid input type'.format(
            file_msg, type(shuffle_opt))
        raise LSSUtils_Error(msg)
    # `random_state`
    random_state_type_valid = (int)
    if not (isinstance(random_state, random_state_type_valid)):
        msg = '{0} `random_state` ({1}) is not a valid input'.format(
            file_msg, random_state)
        raise LSSUtils_Error(msg)
    # `test_size`
    if not ((test_size > 0) and (test_size < 1.)):
        msg = '{0} `test_size` ({1}) must be in range (0,1)'.format(
            file_msg, test_size)
        raise LSSUtils_Error(msg)
    ##
    ## Checking indices of `pred_arr` and `feat_arr`
    if return_idx:
        # If object is a DataFrame
        if (isinstance(pred_arr, pd.DataFrame)
                and isinstance(feat_arr, pd.DataFrame)):
            pred_arr_idx = pred_arr.index.values
            feat_arr_idx = feat_arr.index.values
        else:
            pred_arr_idx = np.arange(len(pred_arr))
            feat_arr_idx = np.arange(len(feat_arr))
        # Reshaping if necessary
        if reshape:
            pred_arr_idx = gu.reshape_arr_1d(pred_arr_idx)
            feat_arr_idx = gu.reshape_arr_1d(feat_arr_idx)
    ##
    ## Checking dimensions of `pred_arr` and `feat_arr`
    pred_arr = np.asarray(pred_arr)
    feat_arr = np.asarray(feat_arr)
    # Dimensions
    if reshape:
        pred_arr = gu.reshape_arr_1d(pred_arr)
        feat_arr = gu.reshape_arr_1d(feat_arr)
    # Shape
    if (len(pred_arr) != len(feat_arr)):
        msg = '{0} The shape of `pred_arr` ({1}) and `feat_arr` ({2}) must '
        msg += 'have the same length'
        msg = msg.format(file_msg, len(pred_arr), len(feat_arr))
        raise LSSUtils_Error(msg)
    ##
    ## Rescaling Dataset
    feat_arr_scaled = data_preprocessing(feat_arr,
                                         pre_opt=pre_opt,
                                         reshape=reshape)
    ##
    ## Splitting into `Training` and `Testing` datasets.
    # Scaled
    (X_train, X_test, Y_train,
     Y_test) = skms.train_test_split(feat_arr_scaled,
                                     pred_arr,
                                     test_size=test_size,
                                     shuffle=shuffle_opt,
                                     random_state=random_state)
    # Not-scaled
    (X_train_ns, X_test_ns, Y_train_ns,
     Y_test_ns) = skms.train_test_split(feat_arr,
                                        pred_arr,
                                        test_size=test_size,
                                        shuffle=shuffle_opt,
                                        random_state=random_state)
    # Returning indices if necessary
    if return_idx:
        # Splitting to `training` and `testing`
        (X_train_idx, X_test_idx, Y_train_idx,
         Y_test_idx) = skms.train_test_split(feat_arr_idx,
                                             pred_arr_idx,
                                             test_size=test_size,
                                             shuffle=shuffle_opt,
                                             random_state=random_state)
        if not (np.array_equal(X_train_idx, Y_train_idx)
                and np.array_equal(X_test_idx, Y_test_idx)):
            msg = '{0} Index arrays are not equal to each other!'.format(
                file_msg)
            raise LSSUtils_Error(msg)
    ##
    ## Assigning `training` and `testing` datasets to dictionaries
    # Saving indices if necessary
    if return_idx:
        # Adding 'indices' to dictionaries
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns,
            'train_idx': X_train_idx
        }
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns,
            'test_idx': X_test_idx
        }
    else:
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns
        }
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns
        }

    return train_dict, test_dict
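# Usage sketch with random data (purely illustrative). It assumes `numpy` and
# `scikit-learn` are available, since the function relies on
# `sklearn.model_selection` and on the `data_preprocessing` helper.
import numpy as np

rng = np.random.RandomState(0)
feat_demo = rng.rand(100, 3)    # 100 samples, 3 features
pred_demo = rng.rand(100, 1)    # 100 samples, 1 outcome
train_dict, test_dict = train_test_dataset(pred_demo, feat_demo,
                                           test_size=0.25)
print(train_dict['X_train'].shape, test_dict['X_test'].shape)  # (75, 3) (25, 3)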
Example #21
def scoring_methods(truth_arr,
                    feat_arr=None,
                    pred_arr=None,
                    model=None,
                    score_method='perc',
                    threshold=0.1,
                    perc=0.68):
    """
    Determines the overall score for given arrays, i.e. the `predicted`
    array and the `truth` array

    Parameters
    -----------
    truth_arr : `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `true` values for the `n_samples`
        observations. The dimensions of `truth_arr` are
        `n_samples` by `n_outcomes`, where `n_samples` is the
        number of observations, and `n_outcomes` the number of predicted
        outcomes.

    feat_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_features)
        Array consisting of the `predicted values`. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples`
        is the number of observations, and `n_features` the number of
        features used. This variable is set to `None` by default.

    pred_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_outcomes)
        Array of predicted values from `feat_arr`. If ``model == None``,
        this variable must be an array-like object. If ``model != None``,
        this variable will not be used, and will be calculated using
        the `model` object. This variable is set to `None` by default.

    model : scikit-learn model object or `NoneType`
        Model used to estimate the score if ``score_method == 'model_score'``
        This variable is set to `None` by default.

    score_method : {'perc', 'threshold', 'model_score', 'r2'} `str`, optional
        Type of scoring to use when determining how well an algorithm
        is performing.

        Options:
            - 'perc' : Use percentage and rank-ordering of the values
            - 'threshold' : Score based on diffs of `threshold` or less from true value.
            - 'model_score' : Out-of-the-box method from `sklearn` to determine success.
            - 'r2': R-squared statistic for error calculation.

    threshold : float, optional
        Value to use when calculating the error within `threshold` value
        from the truth. This variable is set to `0.1` by default.

    perc : float, optional
        Value used when determining the score within some `perc` percentile,
        in the range [0, 1]. This variable is set to `0.68` by default.

    Returns
    -----------
    method_score : float
        Overall score from `pred_arr` to predict `truth_arr`.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/model_evaluation.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, type(None))
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `truth_arr`
    truth_arr_type_valid = (list, np.ndarray)
    if not (isinstance(truth_arr, truth_arr_type_valid)):
        msg = '{0} `truth_arr` ({1}) is not a valid input type'.format(
            file_msg, type(truth_arr))
        raise LSSUtils_Error(msg)
    # `score_method` - Type
    score_method_type_valid = (str)
    if not (isinstance(score_method, score_method_type_valid)):
        msg = '{0} `score_method` ({1}) is not a valid input type'.format(
            file_msg, type(score_method))
        raise LSSUtils_Error(msg)
    # `score_method` - Value
    score_method_valid = ['perc', 'threshold', 'model_score', 'r2']
    if not (score_method in score_method_valid):
        msg = '{0} `score_method` ({1}) is not a valid input!'.format(
            file_msg, score_method)
        raise LSSUtils_Error(msg)
    # `threshold` - Type
    threshold_valid = (float, int)
    if not (isinstance(threshold, threshold_valid)):
        msg = '{0} `threshold` ({1}) is not a valid input type'.format(
            file_msg, type(threshold))
        raise LSSUtils_Error(msg)
    # `threshold` - Value
    if not (threshold >= 0.):
        msg = '{0} `threshold` ({1}) must be larger than 0!'.format(
            file_msg, threshold)
        raise LSSUtils_Error(msg)
    ##
    ## Checking for `model`, `pred_arr` and `feat_arr`
    # If both are none
    if ((model is None) and (pred_arr is None)):
        msg = '{0} `model` and `pred_arr` cannot both be `None`. '
        msg += 'Only one can be `None`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # If `feat_arr` and `pred_arr` are `None`
    if ((feat_arr is None) and (pred_arr is None)):
        msg = '{0} `feat_arr` and `pred_arr` cannot both be `None`'.format(
            file_msg)
        raise TypeError(msg)
    # `pred_arr` - Type
    # If both are `None`
    pred_arr_valid = ((list, np.ndarray))
    if (model is None):
        if not (isinstance(pred_arr, pred_arr_valid)):
            msg = '{0} `pred_arr` ({1}) is not a valid input type!'.format(
                file_msg, type(pred_arr))
            raise LSSUtils_Error(msg)
    ##
    ## Choosing scoring method
    # Percentile method
    if (score_method == 'perc'):
        # Checking for `pred_arr`
        if (pred_arr is None):
            pred_arr = model.predict(feat_arr)
        # Checking for `model`
        if (model is None):
            pred_arr = np.asarray(pred_arr)
        # Error calculation
        pred_err = np.abs(pred_arr - truth_arr)
        method_score = scipy.stats.scoreatpercentile(pred_err, 100. * perc)
    # Threshold method
    if (score_method == 'threshold'):
        # Checking for `pred_arr`
        if (pred_arr is None):
            pred_arr = model.predict(feat_arr)
        # Checking for `model`
        if (model is None):
            pred_arr = np.asarray(pred_arr)
        # Error calculation
        pred_err = np.abs(pred_arr - truth_arr)
        pred_thresh = len(pred_err[pred_err <= threshold])
        method_score = pred_thresh / len(pred_arr)
    # R-squared method
    if (score_method == 'r2'):
        # Checking for `pred_arr`
        if (pred_arr is None):
            pred_arr = model.predict(feat_arr)
        # Checking for `model`
        if (model is None):
            pred_arr = np.asarray(pred_arr)
        # Error calculation
        method_score = skmetrics.r2_score(truth_arr, pred_arr)
    # Model method
    if (score_method == 'model_score'):
        method_score = model.score(feat_arr, truth_arr)

    return method_score
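# Sketch of the 'threshold' score computed by hand: the fraction of
# predictions that fall within `threshold` of the true values.
import numpy as np

truth = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.05, 2.5, 2.95, 4.2])
threshold = 0.1
print(np.mean(np.abs(pred - truth) <= threshold))  # 0.5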
def get_parser():
    """
    Get parser object for `eco_mocks_create.py` script.

    Returns
    -------
    args : `argparse.Namespace`
        Input arguments to the script.
    """
    ## Define parser object
    description_msg = 'Main analysis of the `Red Sequence` project.'
    parser = ArgumentParser(
        description=description_msg,
        formatter_class=SortingHelpFormatter,
    )
    ##
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    ## 1st Magnitude band
    parser.add_argument('-mband_1',
                        dest='mband_1',
                        help='First apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_g')
    ## 2nd Magnitude band
    parser.add_argument('-mband_2',
                        dest='mband_2',
                        help='Second apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_z')
    ## 3rd Magnitude band
    parser.add_argument('-mband_3',
                        dest='mband_3',
                        help='Third apparent magnitude band to analyze.',
                        type=str,
                        choices=[
                            'mag_auto_g', 'mag_auto_r', 'mag_auto_i',
                            'mag_auto_z', 'mag_auto_y'
                        ],
                        default='mag_auto_i')
    ## Maximum difference between `mband_1` and `mband_2`
    parser.add_argument('-mag_diff_tresh',
                        dest='mag_diff_tresh',
                        help="""
                        Maximum threshold of the difference between `mband_1`
                        and `mband_2`. It must be larger than `4`.
                        """,
                        type=_check_pos_val,
                        default=4.)
    ## Bottom magnitude limit for `mband_1` and `mband_2.`
    parser.add_argument('-mag_min',
                        dest='mag_min',
                        help="""
                        Bottom magnitude limit for `mband_1` and `mband_2`.
                        """,
                        type=float,
                        default=24.)
    ## Upper magnitude limit for `mband_1` and `mband_2.`
    parser.add_argument('-mag_max',
                        dest='mag_max',
                        help="""
                        Upper magnitude limit for `mband_1` and `mband_2`.
                        """,
                        type=float,
                        default=17.)
    ## Maximum number of elements to download
    parser.add_argument('-master_limit',
                        dest='master_limit',
                        help='Number of elements to use for the MASTER file',
                        type=int,
                        default=100000)
    ## Aperture radius in 'arcseconds'
    parser.add_argument('-radius_size',
                        dest='radius_size',
                        help='Size of radius on the Sky. In units of `arcsec`',
                        type=_check_pos_val,
                        default=5.)
    ## Cosmology Choice
    parser.add_argument(
        '-cosmo',
        dest='cosmo_choice',
        help='Choice of Cosmology',
        type=str,
        choices=['WMAP5', 'WMAP7', 'WMAP9', 'Planck15', 'custom'],
        default='WMAP7')
    ## Redshift bin size
    parser.add_argument('-z_binsize',
                        dest='z_binsize',
                        help='Size of bin for redshift `z`',
                        type=_check_pos_val,
                        default=0.0125)
    ## Minimum redshift value
    parser.add_argument('-z_min',
                        dest='z_min',
                        help='Minimum redshift to analyze.',
                        type=_check_pos_val,
                        default=0.4)
    ## Maximum redshift value
    parser.add_argument('-z_max',
                        dest='z_max',
                        help='Maximum redshift to analyze.',
                        type=_check_pos_val,
                        default=1.0)
    ## Choice of the input galaxy cluster location
    parser.add_argument('-input_catl_loc',
                        dest='input_catl_loc',
                        help='Choice of the input galaxy cluster location.',
                        type=str,
                        choices=['RedMapper', 'SDSS'],
                        default='RedMapper')
    ## Choice of binning
    parser.add_argument('-hist_nbins',
                        dest='hist_nbins',
                        help='Number of bins for x- and y-axis.',
                        type=_check_pos_val,
                        default=200)
    ## Option for removing file
    parser.add_argument('-remove',
                        dest='remove_files',
                        help="""
                        Delete files from previous analyses with same
                        parameters
                        """,
                        type=_str2bool,
                        default=False)
    ## Program message
    parser.add_argument('-progmsg',
                        dest='Prog_msg',
                        help='Program message to use throughout the script',
                        type=str,
                        default=cfutils.Program_Msg(__file__))
    ## Verbose
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        help='Option to print out project parameters',
                        type=_str2bool,
                        default=False)
    ## Parsing Objects
    args = parser.parse_args()

    return args
Example #23
def catl_sdss_merge(catl_pd_ii,
                    catl_kind='data',
                    catl_type='mr',
                    sample_s='19',
                    halotype='fof',
                    clf_method=3,
                    hod_n=0,
                    clf_seed=1235,
                    dv=1.0,
                    sigma_clf_c=0.1417,
                    perf_opt=False,
                    return_memb_group=False,
                    print_filedir=False):
    """
    Merges the member and group catalogues for a given set of input parameters,
    and returns a modified version of the galaxy group catalogues with added
    info about the galaxy groups.

    Parameters
    ------------
    catl_pd_ii : `int`
        Index of the catalogue to match,
        from :func:`~cosmo_utils.mock_catalogues.catls_utils.extract_catls`
        function.

    catl_kind : {'data', 'mocks'} `str`, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogue come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} `str`, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assigment of (g-r) color, sersic, and log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw values
                    independently.
            - `3` : (g-r) decides active/passive designations, and
                    assigns other galaxy properties for that given galaxy.

    hod_n : {0, 1} int, optional
        HOD model to use. Only relevant when `catl_kind == mocks`.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to `1235`
        by default.

    dv : float, optional
        Difference between galaxy and mass velocity profiles
        (v_g-v_c)/(v_m-v_c). This value is set to `1.0` by default.

    sigma_clf_c : `float`, optional
        Value of the scatter in log(L) for central galaxies in the CLF.
        This variable is set to ``0.1417`` by default.

    perf_opt : `bool`, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_memb_group :  `bool`, optional
        If True, the function returns the member and group catalogues,
        along with the merged catalogue.
        It returns ``<memb_group_pd, memb_pd, group_pd>``

    print_filedir : `bool`, optional
        If True, the output directory is printed onto the screen.

    Returns
    ------------
    memb_group_pd : `pandas.DataFrame`
        Combined version of the i-th member and group catalogues.
        It contains both galaxy and group information.

    memb_pd : `pandas.DataFrame`
        Catalogue of the member galaxies of the i-th catalogue.
        This catalogue contains information of the `member galaxies`.

    group_pd : `pandas.DataFrame`
        Catalogue of the groups of the i-th catalogue.
        This catalogue contains information of the `galaxy groups`.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    catl_pd_ii_valid = (float, int, np.int64, np.int32, np.float32, np.float64)
    catl_kind_valid = ['data', 'mocks']
    catl_type_valid = ['mr', 'mstar']
    sample_s_valid = ['19', '20', '21']
    halotype_valid = ['fof', 'so']
    clf_method_valid = [1, 2, 3]
    hod_n_valid = np.arange(0, 20)
    # `catl_pd_ii`
    if (isinstance(catl_pd_ii, catl_pd_ii_valid)):
        catl_pd_ii = int(catl_pd_ii)
    else:
        msg = '{0} `catl_pd_ii` ({1}) is not a valid input!'.format(
            file_msg, type(catl_pd_ii))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_type`
    if not (catl_type in catl_type_valid):
        msg = '{0} `catl_type` ({1}) is not a valid input!'.format(
            file_msg, catl_type)
        raise LSSUtils_Error(msg)
    # `sample_s`
    if not (sample_s in sample_s_valid):
        msg = '{0} `sample_s` ({1}) is not a valid input!'.format(
            file_msg, sample_s)
        raise LSSUtils_Error(msg)
    # `halotype`
    if not (halotype in halotype_valid):
        msg = '{0} `halotype` ({1}) is not a valid input!'.format(
            file_msg, halotype)
        raise LSSUtils_Error(msg)
    # `clf_method`
    if not (clf_method in clf_method_valid):
        msg = '{0} `clf_method` ({1}) is not a valid input!'.format(
            file_msg, clf_method)
        raise LSSUtils_Error(msg)
    # `dv`
    if not (dv > 0):
        msg = '{0} `dv` ({1}) must be larger than 0!'.format(file_msg, dv)
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Type
    if not (isinstance(sigma_clf_c, float)):
        msg = '{0} `sigma_clf_c` ({1}) is not a valid input type!'
        msg = msg.format(file_msg, type(sigma_clf_c))
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Value
    if not (sigma_clf_c >= 0.):
        msg = '{0} `sigma_clf_c` ({1}) must be at least 0!'
        msg = msg.format(file_msg, sigma_clf_c)
        raise LSSUtils_Error(msg)
    # `hod_n`
    if not (hod_n in hod_n_valid):
        msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    # `return_memb_group`
    if not (isinstance(return_memb_group, bool)):
        msg = '{0} `return_memb_group` ({1}) is not a valid type!'.format(
            file_msg, type(return_memb_group))
        raise LSSUtils_Error(msg)
    # `print_filedir`
    if not (isinstance(print_filedir, bool)):
        msg = '{0} `print_filedir` ({1}) is not a valid type!'.format(
            file_msg, type(print_filedir))
        raise LSSUtils_Error(msg)
    #
    # Extracting catalogues given input parameters
    (memb_arr, memb_len) = extract_catls(catl_kind=catl_kind,
                                         catl_type=catl_type,
                                         sample_s=sample_s,
                                         halotype=halotype,
                                         clf_method=clf_method,
                                         hod_n=hod_n,
                                         clf_seed=clf_seed,
                                         dv=dv,
                                         sigma_clf_c=sigma_clf_c,
                                         perf_opt=perf_opt,
                                         catl_info='members',
                                         return_len=True,
                                         print_filedir=print_filedir)
    # Checking number of catalogues
    if catl_pd_ii > (memb_len - 1):
        msg = '{0} `catl_pd_ii` ({1}) is OUT of range ({2})!'.format(
            file_msg, catl_pd_ii, memb_len)
        raise LSSUtils_Error(msg)
    #
    # Extracting group catalogue
    # i-th Galaxy catalogue
    memb_path = memb_arr[catl_pd_ii]
    # i-th Galaxy Group catalogue
    group_path = catl_sdss_dir(catl_kind=catl_kind,
                               catl_type=catl_type,
                               sample_s=sample_s,
                               halotype=halotype,
                               clf_method=clf_method,
                               dv=dv,
                               sigma_clf_c=sigma_clf_c,
                               hod_n=hod_n,
                               clf_seed=clf_seed,
                               perf_opt=perf_opt,
                               catl_info='groups',
                               print_filedir=print_filedir)
    #
    # Paths to catalogue
    # Mocks
    if catl_kind == 'mocks':
        group_path = os.path.join(
            group_path,
            os.path.basename(memb_path).replace('memb', 'group'))
    # Data
    if catl_kind == 'data':
        group_path = os.path.join(
            group_path,
            os.path.basename(memb_path).replace('Gals', 'Group'))
    # Checking that file exists
    fd.File_Exists(group_path)
    ##
    ## Reading in Catalogues
    memb_pd = fr.read_hdf5_file_to_pandas_DF(memb_path)
    group_pd = fr.read_hdf5_file_to_pandas_DF(group_path)
    ## Keys for the catalogues
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              perf_opt=perf_opt,
                                              return_type='list')
    ## Matching keys from Group catalogue
    if len(np.unique(memb_pd[id_key])) == len(np.unique(group_pd[id_key])):
        # Group column names
        group_colnames = np.sort(group_pd.columns.values)
        ## Sorting `memb_pd` by `id_key`
        # Member catalogue
        memb_pd.sort_values(by=id_key, inplace=True)
        memb_pd.reset_index(inplace=True, drop=True)
        # Group catalogue
        group_pd.sort_values(by=id_key, inplace=True)
        group_pd.reset_index(inplace=True, drop=True)
        ## Renaming columns
        g_colnames_dict = {ii: 'GG_' + ii for ii in group_colnames}
        group_pd.rename(columns=g_colnames_dict, inplace=True)
        group_pd.rename(columns={'GG_' + id_key: id_key}, inplace=True)
        ##
        ## Merging the 2 DataFrames
        memb_group_pd = pd.merge(left=memb_pd,
                                 right=group_pd,
                                 how='left',
                                 left_on=id_key,
                                 right_on=id_key)
    else:
        msg = '{0} Lengths of the 2 DataFrames (`memb_pd`, `group_pd`) '
        msg += 'do not match!'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    ##
    ## Returning DataFrames
    if return_memb_group:
        return_obj = (memb_group_pd, memb_pd, group_pd)
    else:
        return_obj = memb_group_pd

    return return_obj
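
# Minimal sketch of the prefix-and-merge step used above, with toy
# DataFrames. The column names (`groupid`, `mag_r`, `M_group`) are
# illustrative only and do not come from the actual catalogues.
import pandas as pd

memb_demo = pd.DataFrame({'groupid': [1, 1, 2], 'mag_r': [-19.5, -20.1, -21.0]})
group_demo = pd.DataFrame({'groupid': [1, 2], 'M_group': [12.3, 13.1]})
# Prefix every group-level column except the shared ID key
g_cols = {col: 'GG_' + col for col in group_demo.columns if col != 'groupid'}
group_demo = group_demo.rename(columns=g_cols)
# Left-merge so that each member galaxy carries its group's properties
memb_group_demo = pd.merge(memb_demo, group_demo, how='left', on='groupid')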
Example #24
def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files

    filetype : str, optional
        File format of the file in `directory` to be read
        This is set to `hdf5` by default.

    foutput : str or NoneType, optional
        If not `None`, it is the basename of the output file in HDF5 format

    outonly : `bool`, optional
        If True, it returns the concatenated pandas DataFrame.
        If False, it only saves the concatenated `pandas.DataFrame`
        (when `foutput` is given) and returns nothing.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`.

    Raises
    ----------
    LSSUtils_Error : Exception
        If no files are found in `directory`, it raises an error
        warning about this.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Concatenating files
    files_arr = fd.index(directory, '.' + filetype, sort=True)
    print('{0} Found `{1}` files'.format(file_msg, files_arr.size))
    if len(files_arr) > 0:
        # Initializing array that contains info
        df_arr = [[] for x in range(len(files_arr))]
        # Looping over HDF5 (pandas) files
        for ii, file_ii in enumerate(files_arr):
            df_arr[ii] = read_pandas_hdf5(file_ii)
        # Concatenating arrays
        df_conc = pd.concat(df_arr, ignore_index=True)
        # Deciding name of resulting output file
        if (foutput is not None) and isinstance(foutput, str):
            foutput_file = os.path.join(directory,
                                        '{0}.{1}'.format(foutput, filetype))
            # Saving resulting DataFrame
            pandas_df_to_hdf5_file(df_conc, foutput_file, key='/Main')
            # Checking file exists
            fd.File_Exists(foutput_file)
            print('{0} Output file saved in: {1}'.format(
                file_msg, foutput_file))
        # If only outputting concatenated DataFrame
        if outonly:
            return df_conc
    else:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(
            file_msg, directory, filetype)
        raise LSSUtils_Error(msg)
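
# Hypothetical usage of `concatenate_pd_df`: combine every `.hdf5` catalogue
# found under a made-up `./catalogues` directory into a single DataFrame,
# and also save the result as `combined.hdf5` in that same directory.
df_all = concatenate_pd_df('./catalogues', filetype='hdf5',
                           foutput='combined', outonly=True)
print(df_all.shape)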
Example #25
def url_files_download(url,
                       ext,
                       outdir,
                       check_exist=False,
                       create_dir=False,
                       remove_files=False,
                       bar_opt='tqdm'):
    """
    Downloads files from a URL to a local directory. Only the files that
    match the file extension `ext` are downloaded.

    Parameters
    -----------
    url : `str`
        String of the URL

    ext : `str`
        File extension of the files in the URL.

    outdir : `str`
        Path to the output directory. This is the directory, to which
        the files with extensions `ext` will be saved.

    check_exist : `bool`, optional
        If `True`, it checks for whether or not the file exists.
        This variable is set to `False` by default.

    create_dir : `bool`, optional
        If `True`, it creates the directory if it does not exist.
        This variable is set to `False` by default.

    remove_files : `bool`, optional
        If `True`, local files that are present that match the files at
        the URL will be replaced by the new versions. This variable is
        set to ``False`` by default.

    bar_opt : {'tqdm', 'native'} str, optional
        Option for which type of progress bar to use when downloading files.
        This variable is set to `tqdm` by default.
        Options:
            - 'tqdm' : Uses a tqdm-based progress bar
            - 'native': Uses the wget-based native progress bar.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking for file type
    # 'URL'
    if not isinstance(url, str):
        msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(url))
        raise TypeError(msg)
    # File extension
    if not isinstance(ext, str):
        msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(ext))
        raise TypeError(msg)
    # Output directory
    if not isinstance(outdir, str):
        msg = '{0} `outdir` ({1}) is not a valid type. It must be a STRING!'
        msg = msg.format(file_msg, type(outdir))
        raise TypeError(msg)
    # `check_exist`
    if not (isinstance(check_exist, bool)):
        msg = '`check_exist` ({0}) must be of `boolean` type!'.format(
            type(check_exist))
        raise TypeError(msg)
    # `create_dir`
    if not (isinstance(create_dir, bool)):
        msg = '`create_dir` ({0}) must be of `boolean` type!'.format(
            type(create_dir))
        raise TypeError(msg)
    # `bar` - Type
    if not (isinstance(bar_opt, str)):
        msg = '`bar_opt` ({0}) must be of `string` type!'.format(
            type(bar_opt))
        raise TypeError(msg)
    # Progress bar - Value
    if not (bar_opt in ['tqdm', 'native']):
        msg = '{0} `bar_opt` ({1}) is not a valid option! Exiting'
        msg = msg.format(file_msg, bar_opt)
        raise LSSUtils_Error(msg)
    ##
    ## List of files in the URL
    files_arr = url_file_list(url, ext)
    # Creating directory
    if create_dir:
        cfutils.Path_Folder(outdir)
    # Check for its existence
    if check_exist:
        if not (os.path.exists(outdir)):
            msg = '`outdir` ({0}) was not found!'.format(outdir)
            raise FileNotFoundError(msg)
    ##
    ## Downloading files to output directory
    if len(files_arr) > 0:
        if (bar_opt == 'tqdm'):
            tqdm_desc = 'Downloading files: '
            for file_ii in tqdm(files_arr, desc=tqdm_desc):
                # Local file
                file_ii_local = os.path.join(outdir, os.path.basename(file_ii))
                # Checking if local file exists
                if os.path.exists(file_ii_local):
                    if remove_files:
                        os.remove(file_ii_local)
                        wget_opt = True
                    else:
                        wget_opt = False
                else:
                    wget_opt = True
                ##
                ## Only downloading if necessary
                if wget_opt:
                    wget.download(file_ii, out=outdir, bar=None)
        elif (bar_opt == 'native'):
            for file_ii in files_arr:
                # Local file
                file_ii_local = os.path.join(outdir, os.path.basename(file_ii))
                # Checking if local file exists
                if os.path.exists(file_ii_local):
                    if remove_files:
                        os.remove(file_ii_local)
                        wget_opt = True
                    else:
                        wget_opt = False
                else:
                    wget_opt = True
                ##
                ## Only downloading if necessary
                if wget_opt:
                    wget.download(file_ii, out=outdir)
    else:
        msg = '{0} Number of files is ZERO!'.format(file_msg)
        print(msg)
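
# Hypothetical usage of `url_files_download`: fetch every `.hdf5` file from a
# made-up URL into `./downloads`, creating the directory if needed and using
# the tqdm progress bar. The URL and extension string are illustrative only.
url_files_download('http://example.com/mock_catalogues/', '.hdf5',
                   './downloads', check_exist=True, create_dir=True,
                   bar_opt='tqdm')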
def abundance_matching_f(dict1,
                         dict2,
                         volume1=1.,
                         volume2=1.,
                         reverse=True,
                         dens1_opt=False):
    """
    Abundance matching based on 2 quantities.
    It assigns values from `dict2` to elements in `dict1`

    Parameters
    -----------
    dict1 : python dictionary or `numpy.ndarray`
        Dictionary or array of 1st property.

        Keys :
            - `var` : 1st variable to be analysed
            - `dens` : Density array corresponding to `var` elements.
                        Only required if `dens1_opt` is True.

    dict2 : python dictionary
        dictionary or array of the 2nd property.

        Keys :
            - `var` : 2nd variable to be analyzed
            - `dens` : Density array corresponding to `var` elements.
                        This key is always required for `dict2`.

    volume1 : float, optional
        Volume corresponding to `dict1`. This variable is set to `1.` by
        default.

    volume2 : float, optional
        Volume corresponding to `dict2`. This variable is set to `1.` by
        default.

    reverse : `bool`, optional
        Determines the relation between `var1` and `var2`.

    dens1_opt : `bool`, optional
        Determines whether the density for `dict1` is already provided.

        Options :
            - `True` : Density is already provided as the `dens` key of `dict1`.
            - `False` : Density must be calculated from the `var` values and
                        `volume1`.

    Returns
    -----------
    var1_ab : `numpy.ndarray`
        Array of elements matching those of `dict1`, after matching with
        `dict2`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Check types of input parameters
    valid_types = (list, dict, np.ndarray)
    # `dict1`
    if not (isinstance(dict1, valid_types)):
        msg = '{0} `dict1` ({1}) is not a valid type!'.format(
            file_msg, type(dict1))
        raise LSSUtils_Error(msg)
    # `dict2`
    if not (isinstance(dict2, dict)):
        msg = '{0} `dict2` must be a dictionary. Its type is `{1}`'.format(
            file_msg, type(dict2))
        raise LSSUtils_Error(msg)
    # 2nd property
    var2 = np.asarray(dict2['var'])
    dens2 = np.asarray(dict2['dens'])
    #
    # `dens1_opt`
    if dens1_opt:
        # 1st Property
        var1 = np.asarray(dict1['var'])
        dens_1 = np.asarray(dict1['dens'])
    else:
        if (isinstance(dict1, dict)):
            var1 = dict1['var']
        elif (isinstance(dict1, (list, np.ndarray))):
            var1 = dict1.copy()
        #
        # Determining relation between `var1` and `var2`
        mono_opt_1 = reversed_arrays(var1, var2)
        # Monotonically increasing
        if mono_opt_1:
            counts_1 = np.array([np.where(var1 > x)[0].size for x in var1]) + 1
        else:
            counts_1 = np.array([np.where(var1 < x)[0].size for x in var1]) + 1
        #
        # Determining density of 1st property
        dens_1 = counts_1.astype(float) / volume1
    #
    # Interpolation for 2nd property
    var2_interp = interp1d(dens2, var2, bounds_error=True, assume_sorted=False)
    # Assigning values to property 1
    var1_ab = np.asarray([var2_interp(xx) for xx in dens_1])

    return var1_ab
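
# Toy sketch of `abundance_matching_f`: match the property in `lum_dict` to
# the property in `mass_dict` through their cumulative number densities. The
# arrays below are illustrative only; the densities of the two dictionaries
# must overlap in range for the interpolation to succeed.
import numpy as np

lum_dict = {'var': np.array([9.0, 9.5, 10.0, 10.5, 11.0]),
            'dens': np.array([1.0e-1, 5.0e-2, 1.0e-2, 5.0e-3, 1.0e-3])}
mass_dict = {'var': np.array([11.0, 11.5, 12.0, 12.5, 13.0]),
             'dens': np.array([1.0e-1, 5.0e-2, 1.0e-2, 5.0e-3, 1.0e-3])}
halo_mass_matched = abundance_matching_f(lum_dict, mass_dict, dens1_opt=True)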
Example #27
def Behroozi_relation(log_mstar, z=0.):
    """
    Returns the halo mass of a central galaxy as a function of its stellar
    mass.

    Parameters
    -----------
    log_mstar : `float` ,`np.ndarray`, or array-like
        Value or array of values of base-10 logarithm of stellar mass
        in h=1 solar mass units.

    z : int, float, `np.ndarray` or array-like
        Redshift of the halo hosting the galaxy. If passing an array,
        it must be of the same length as the input `log_mstar`.

    Returns
    -----------
    log_halo_mass : float or `np.ndarray`
        Array or float containing 10-base logarithm of halo mass in ``h=1``
        solar mass units.

    Notes
    ----------
    The parameter values in Behroozi+10 were fit to data assuming ``h=0.7``.
    Thus, we will transform our input stellar mass to ``h=0.7`` units,
    evaluate using the Behroozi parameters, and then transform back to
    ``h=1`` units before returning the result.
    """
    file_msg = fd.Program_Msg(__file__)
    little_h = 0.7
    ## Checking input parameters
    # `log_mstar`
    mstar_valid_types = (int, float, np.ndarray, list)
    if not (isinstance(log_mstar, mstar_valid_types)):
        msg = '{0} `log_mstar` ({1}) is not a valid type!'.format(
            file_msg, type(log_mstar))
        raise TypeError(msg)
    ##
    ## Behroozi dictionary
    param_dict = _retrieve_Behroozi_default_dict()
    ## Converting stellar mass from ``h=1`` units to ``h=0.7`` units.
    mstar = (10.**log_mstar) / (little_h**2)
    ## Scale factor
    a = 1. / (1. + z)
    ##
    ## Behroozi function
    logm0 = param_dict['smhm_m0_0'] + param_dict['smhm_m0_a'] * (a - 1.)
    m0 = 10.**logm0
    logm1 = param_dict['smhm_m1_0'] + param_dict['smhm_m1_a'] * (a - 1)
    beta = param_dict['smhm_beta_0'] + param_dict['smhm_beta_a'] * (a - 1)
    delta = param_dict['smhm_delta_0'] + param_dict['smhm_delta_a'] * (a - 1)
    gamma = param_dict['smhm_gamma_0'] + param_dict['smhm_gamma_a'] * (a - 1)
    #
    stellar_mass_by_m0 = mstar / m0
    term3_numerator = (stellar_mass_by_m0)**delta
    term3_denominator = 1. + (stellar_mass_by_m0)**(-gamma)
    #
    log_halo_mass = logm1 + beta * np.log10(stellar_mass_by_m0)
    log_halo_mass += (term3_numerator / term3_denominator) - 0.5
    #
    # Convert back from ``h=0.7`` to ``h=1`` units
    return np.log10((10.**log_halo_mass) * (little_h))
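
# Hypothetical usage of `Behroozi_relation`: estimate the host-halo masses of
# central galaxies with log(M*) = 9.5, 10.5, and 11.0 (h=1 units) at z = 0.1,
# assuming `_retrieve_Behroozi_default_dict` is available in the module.
import numpy as np

log_mstar_arr = np.array([9.5, 10.5, 11.0])
log_mhalo_arr = Behroozi_relation(log_mstar_arr, z=0.1)
print(log_mhalo_arr)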
Example #28
def data_preprocessing(feat_arr, pre_opt='min_max', reshape=False):
    """
    Preprocesses the data in order to clean it and make it more suitable
    for machine-learning algorithms.

    Parameters
    -----------
    feat_arr : `numpy.ndarray`, `list`, `pandas.DataFrame`
        Array of feature values. This array is used for training a
        ML algorithm.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses `~sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `~sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    Returns
    -----------
    feat_arr_scaled : `numpy.ndarray`
        Rescaled version of `feat_arr` based on the choice of `pre_opt`.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/preprocessing.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if not (pre_opt in pre_opt_valid):
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    ##
    ## Reshaping `feat_arr`
    if reshape:
        feat_arr = gu.reshape_arr_1d(feat_arr)
    ##
    ## Scaling `feat_arr`
    if (pre_opt == 'min_max'):
        # Scaler
        scaler = skpre.MinMaxScaler(feature_range=(0, 1))
        # Rescaling
        feat_arr_scaled = scaler.fit_transform(feat_arr)
    ## Standardize Data
    if pre_opt == 'standard':
        # Scaler
        scaler = skpre.StandardScaler().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## Normalize Data
    if pre_opt == 'normalize':
        # Scaler
        scaler = skpre.Normalizer().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## No Preprocessing
    if pre_opt == 'no':
        feat_arr_scaled = feat_arr

    return feat_arr_scaled
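
# Hypothetical usage of `data_preprocessing`: rescale a small, made-up feature
# matrix to the (0, 1) range before feeding it to a machine-learning estimator.
import numpy as np

feat_demo = np.array([[10.5, 0.2],
                      [11.0, 0.8],
                      [12.3, 0.5]])
feat_scaled = data_preprocessing(feat_demo, pre_opt='min_max')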
Example #29
def extract_catls(catl_kind='data',
                  catl_type='mr',
                  sample_s='19',
                  datatype='.hdf5',
                  catl_info='members',
                  halotype='fof',
                  clf_method=3,
                  hod_n=0,
                  clf_seed=1235,
                  dv=1.0,
                  sigma_clf_c=0.1417,
                  perf_opt=False,
                  return_len=False,
                  print_filedir=True):
    """
    Extracts a list of catalogues given the input parameters.

    Parameters
    ------------
    catl_kind : {'data', 'mocks'} str, optional
        Type of catalogue to use. This variable is set to `data` by default.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_type : {'mr', 'mstar'} str, optional
        Type of catalogue to use. It shows which abundance matching method
        was used for the CLF when assigning halo masses. This variable is
        set to 'mr' by default.

        Options:
            - `mr` : Uses r-band absolute magnitude
            - `mstar` : Uses stellar masses

    sample_s : {'19', '20', '21'} str, optional
        Volume-limited sample to use. This variable is set to '19' by default.

        Options:
            - '19' : Uses the Mr19 volume-limited sample, i.e. 'Consuelo'
            - '20' : Uses the Mr20 volume-limited sample, i.e. 'Esmeralda'
            - '21' : Uses the Mr21 volume-limited sample, i.e. 'Carmen'

    datatype : {'.hdf5'} str, optional
        Data type of the files to be indexed in the folder. This variable
        is set to '.hdf5' by default.

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    halotype : {'fof', 'so'} str, optional
        Type of the dark matter halo of the simulation used to create the
        synthetic catalogues. This variable is set to `fof` by default.

        Options:
            - 'fof': Friends-of-Friends halos.
            - 'so' : Spherical overdensity halos.

    clf_method : {1, 2, 3} int, optional
        Method for assigning galaxy properties to mock galaxies.
        This variable is set to `3` by default.

        Options:
            - `1` : Independent assignment of (g-r) color, sersic, and log(ssfr)
            - `2` : (g-r) decides active/passive designation and draw values
                    independently.
            - `3` : (g-r) decides active/passive designations, and
                    assigns other galaxy properties for that given galaxy.

    hod_n : int, optional
        HOD model to use (values in `range(0, 20)`). Only relevant when
        `catl_kind == mocks`. This variable is set to `0` by default.

    clf_seed : int, optional
        Seed used for the `CLF` random seed. This variable is set to `1235`
        by default.

    dv : float, optional
        Difference between galaxy and mass velocity profiles
        (v_g-v_c)/(v_m-v_c). This value is set to `1.0` by default.

    sigma_clf_c : `float`, optional
        Value of the scatter in log(L) for central galaxies in the CLF.
        This variable is set to ``0.1417`` by default.

    perf_opt : `bool`, optional
        If True, it chooses to analyze the `perfect` set of synthetic
        catalogues. This variable is set to `False` by default.

    return_len : `bool`, optional
        If True, the function also returns the total number of elements in
        the folder that match the criteria.

    print_filedir : `bool`, optional
        If True, the output directory is printed onto the screen.

    Returns
    ------------
    catl_arr : `numpy.ndarray`
        Array of elements/files matching the `datatype` type in the directory.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception raised if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_type_valid = ['mr', 'mstar']
    sample_s_valid = ['19', '20', '21']
    catl_info_valid = ['members', 'groups']
    halotype_valid = ['fof', 'so']
    clf_method_valid = [1, 2, 3]
    hod_n_valid = np.arange(0, 20)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_type`
    if not (catl_type in catl_type_valid):
        msg = '{0} `catl_type` ({1}) is not a valid input!'.format(
            file_msg, catl_type)
        raise LSSUtils_Error(msg)
    # `sample_s`
    if not (sample_s in sample_s_valid):
        msg = '{0} `sample_s` ({1}) is not a valid input!'.format(
            file_msg, sample_s)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `halotype`
    if not (halotype in halotype_valid):
        msg = '{0} `halotype` ({1}) is not a valid input!'.format(
            file_msg, halotype)
        raise LSSUtils_Error(msg)
    # `clf_method`
    if not (clf_method in clf_method_valid):
        msg = '{0} `clf_method` ({1}) is not a valid input!'.format(
            file_msg, clf_method)
        raise LSSUtils_Error(msg)
    # `hod_n`
    if not (hod_n in hod_n_valid):
        msg = '{0} `hod_n` ({1}) is not a valid input!'.format(file_msg, hod_n)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    # `print_filedir`
    if not (isinstance(print_filedir, bool)):
        msg = '{0} `print_filedir` ({1}) is not a valid type!'.format(
            file_msg, type(print_filedir))
        raise LSSUtils_Error(msg)
    # `dv`
    if not (dv > 0):
        msg = '{0} `dv` ({1}) must be larger than 0!'.format(file_msg, dv)
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Type
    if not (isinstance(sigma_clf_c, float)):
        msg = '{0} `sigma_clf_c` ({1}) is not a valid input type!'
        msg = msg.format(file_msg, type(sigma_clf_c))
        raise LSSUtils_Error(msg)
    # `sigma_clf_c` - Value
    if not (sigma_clf_c >= 0.):
        msg = '{0} `sigma_clf_c` ({1}) must be larger than or equal to 0!'
        msg = msg.format(file_msg, sigma_clf_c)
        raise LSSUtils_Error(msg)
    # `return_len`
    if not (isinstance(return_len, bool)):
        msg = '{0} `return_len` ({1}) is not a valid type!'.format(
            file_msg, type(return_len))
        raise LSSUtils_Error(msg)
    # `datatype`
    if not (isinstance(datatype, str)):
        msg = '{0} `datatype` ({1}) is not a valid type!'.format(
            file_msg, type(datatype))
        raise LSSUtils_Error(msg)
    #
    # Extracting the path of the catalogues
    filedir = catl_sdss_dir(catl_kind=catl_kind,
                            catl_type=catl_type,
                            sample_s=sample_s,
                            catl_info=catl_info,
                            halotype=halotype,
                            clf_method=clf_method,
                            hod_n=hod_n,
                            clf_seed=clf_seed,
                            dv=dv,
                            sigma_clf_c=sigma_clf_c,
                            perf_opt=perf_opt,
                            print_filedir=print_filedir)
    #
    # Converting to array
    catl_arr = np.sort(fd.Index(filedir, datatype))
    # Checking number of elements
    if len(catl_arr) == 0:
        msg = '{0} `catl_arr` contains 0 entries!'.format(file_msg)
        raise LSSUtils_Error(msg)
    #
    # Returning elements
    if return_len:
        return catl_arr, len(catl_arr)
    else:
        return catl_arr
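
# Hypothetical call to `extract_catls`: list the member-galaxy mock catalogues
# for the Mr19 volume-limited sample and also return how many were found.
# Whether any files are found depends on the local catalogue directory.
catl_arr, n_catls = extract_catls(catl_kind='mocks', catl_type='mr',
                                  sample_s='19', catl_info='members',
                                  return_len=True, print_filedir=False)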
Example #30
def sdss_catl_clean_nmin(catl_pd,
                         catl_kind,
                         catl_info='members',
                         nmin=1,
                         perf_opt=False):
    """
    Cleans the catalogue removing `failed` values, and only includes
    galaxies that are in groups/halos above a `nmin` threshold.

    Parameters
    -----------
    catl_pd : `pandas.DataFrame`
        Dataset with the catalogue information.

    catl_kind : {'data', 'mocks'} str
        Type of catalogue to use.

        Options:
            - `data` : catalogues come from SDSS `real` catalogue
            - `mocks` : catalogues come from SDSS `mock` catalogues

    catl_info : {'members', 'groups'} str, optional
        Option for which kind of catalogues to use.

        Options:
            - `members` : Member galaxies of group catalogues
            - `groups` : Catalogues with `group` information.

    nmin : int, optional
        Minimum group richness to have in the (galaxy) group catalogue.
        This variable is set to `1` by default.

    perf_opt : `bool`, optional
        Option for using a `perfect` mock catalogue.

    Returns
    -----------
    catl_pd_clean : `pandas.DataFrame`
        Cleaned version of `catl_pd` after having removed `failed` values,
        and having chosen only galaxies in groups above a group richness
        threshold of `nmin`.

    Raises
    ------------
    LSSUtils_Error : Exception from `LSSUtils_Error`
        Program exception raised if input parameters are not accepted.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking input parameters
    catl_kind_valid = ['data', 'mocks']
    catl_info_valid = ['members', 'groups']
    # `catl_pd`
    if not (isinstance(catl_pd, pd.DataFrame)):
        msg = '{0} `catl_pd` ({1}) is not a valid type!'.format(
            file_msg, type(catl_pd))
        raise LSSUtils_Error(msg)
    # `catl_kind`
    if not (catl_kind in catl_kind_valid):
        msg = '{0} `catl_kind` ({1}) is not a valid input!'.format(
            file_msg, catl_kind)
        raise LSSUtils_Error(msg)
    # `catl_info`
    if not (catl_info in catl_info_valid):
        msg = '{0} `catl_info` ({1}) is not a valid input!'.format(
            file_msg, catl_info)
        raise LSSUtils_Error(msg)
    # `nmin`
    if not ((nmin > 0) and (isinstance(nmin, int))):
        msg = '{0} `nmin` must be an integer and have a value above `0`'
        msg = msg.format(file_msg)
        raise LSSUtils_Error(msg)
    # `perf_opt`
    if not (isinstance(perf_opt, bool)):
        msg = '{0} `perf_opt` ({1}) is not a valid type!'.format(
            file_msg, type(perf_opt))
        raise LSSUtils_Error(msg)
    #
    # Types of galaxies
    cens = int(1)
    nmin = int(nmin)
    #
    # Getting keys for catalogue
    (gm_key, id_key, galtype_key) = catl_keys(catl_kind,
                                              return_type='list',
                                              perf_opt=perf_opt)

    # Cleaning catalogue entries
    catl_pd_clean_all = sdss_catl_clean(catl_pd,
                                        catl_kind=catl_kind,
                                        catl_info=catl_info,
                                        reindex=True)
    # Choosing only galaxies in groups of richness >= `nmin`
    # Member galaxies
    if catl_info == 'members':
        # Centrals
        catl_pd_cens = catl_pd_clean_all.loc[(
            catl_pd_clean_all[galtype_key] == cens), id_key]
        catl_pd_cl = catl_pd_clean_all[(
            catl_pd_clean_all[id_key].isin(catl_pd_cens))]
        # Group counts
        group_counts = Counter(catl_pd_cl[id_key])
        group_ngals = np.array(
            [xx for xx in group_counts.keys() if group_counts[xx] >= nmin])
        # Cleaned version
        catl_pd_clean = catl_pd_cl[catl_pd_cl[id_key].isin(group_ngals)]
        catl_pd_clean.reset_index(inplace=True, drop=True)
    # Group catalogue
    if catl_info == 'groups':
        if ('ngals' in catl_pd_clean_all.columns.tolist()):
            catl_pd_clean = catl_pd_clean_all.loc[
                catl_pd_clean_all['ngals'] >= nmin]
            catl_pd_clean.reset_index(inplace=True, drop=True)
        else:
            msg = '{0} Key `ngals` not found in DataFrame ... Exiting!'
            msg = msg.format(file_msg)
            raise LSSUtils_Error(msg)

    return catl_pd_clean
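
# Minimal sketch of the richness cut applied above, using a toy member
# catalogue. The column names (`groupid`, `galtype`) and the `nmin_demo`
# threshold are illustrative only.
from collections import Counter

import pandas as pd

toy_memb = pd.DataFrame({'groupid': [1, 1, 1, 2, 2, 3],
                         'galtype': [1, 0, 0, 1, 0, 1]})
nmin_demo = 2
counts = Counter(toy_memb['groupid'])
rich_groups = [gid for gid, ngal in counts.items() if ngal >= nmin_demo]
toy_clean = toy_memb[toy_memb['groupid'].isin(rich_groups)].reset_index(drop=True)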