Пример #1
0
def load_dataset(file_name, key=None, **kwargs):
    """
    Load an example dataset (a CSV file shipped under the package's
    'datasets' directory) into a pandas DataFrame.

    Args:
        file_name (string): Name of the dataset file, without the '.csv'
            extension.
        key (string): Key attribute to set for the loaded table in the
            catalog (default is None, i.e. no key is set).
        kwargs (dict): Additional key-value arguments passed through to
            pandas read_csv.

    Returns:
        A pandas DataFrame read from the dataset file. If `key` was given,
        the catalog is updated with that key for the returned table.

    Raises:
        AssertionError: If `file_name` is not of type string.
    """
    # Validate the input: the file name must be a string.
    if not isinstance(file_name, six.string_types):
        logger.error('file name is not a string')
        raise AssertionError('file name is not a string')

    # Build the path to the CSV under <install_path>/datasets.
    # os.path.join is the portable way to assemble paths; joining
    # fragments with os.sep by hand is error-prone across platforms.
    p = get_install_path()
    p = os.path.join(p, 'datasets', file_name + '.csv')

    # Read the CSV, forwarding any extra read_csv options.
    table = pd.read_csv(p, **kwargs)

    # If a key was supplied, register it in the catalog (set_key checks
    # the key's integrity before setting it).
    if key is not None:
        cm.set_key(table, key)
    return table
Пример #2
0
def load_dataset(file_name, key=None, **kwargs):
    """
    Read a bundled example dataset into a pandas DataFrame.

    Args:
        file_name (string): Dataset file name (no '.csv' extension).
        key (string): Optional key attribute to record in the catalog for
            the loaded table (default None).
        kwargs (dict): Extra key-value arguments forwarded to read_csv.

    Returns:
        A pandas DataFrame read from the dataset file.

    Raises:
        AssertionError: If `file_name` is not a string.
    """
    # The dataset name must be a string.
    if not isinstance(file_name, six.string_types):
        logger.error('file name is not a string')
        raise AssertionError('file name is not a string')

    # Locate the CSV inside the package's 'datasets' directory.
    dataset_path = os.sep.join(
        [get_install_path(), 'datasets', file_name + '.csv'])

    # Load the table, passing through any user-supplied read_csv options.
    frame = pd.read_csv(dataset_path, **kwargs)

    # Record the key in the catalog when one was provided.
    if key is not None:
        cm.set_key(frame, key)
    return frame
Пример #3
0
def load_table(file_path, metadata_ext='.pklmetadata'):
    """
    Load a pickled DataFrame from file, together with its metadata.

    Reads a DataFrame stored in pickle format from `file_path`, then looks
    for a companion metadata file that shares the same base name but uses
    the extension given by `metadata_ext`. When that file exists, the
    catalog is updated with the properties it contains.

    Args:
        file_path (string): File path to load the file from
        metadata_ext (string): Metadata file extension (with the default value
            set to '.pklmetadata')

    Returns:
        If the loading is successful, the function returns a pandas DataFrame
        read from the file. The catalog will be updated with the metadata
        read from the metadata file (if the file was present).

    Raises:
        AssertionError: If the file path is not of type string
        AssertionError: If the metadata extension is not of type string

    Notes:
        This function differs from read_csv_metadata in two ways. First,
        it currently does not support reading candidate set tables, where
        there is more metadata (such as ltable and rtable) than just the
        'key', and the user is conceptually expected to provide ltable and
        rtable info while invoking this function (support to be added
        shortly). Second, this function loads a table stored in pickle
        format.

    See Also:
        to_csv_metadata, save_object, read_csv_metadata
    """
    # --- Input validation ---

    # Both the path and the metadata extension must be strings.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    if not isinstance(metadata_ext, six.string_types):
        logger.error('Input metadata ext is not of type string')
        raise AssertionError('Input metadata ext is not of type string')

    # --- Load the DataFrame ---
    # The generic object loader handles DataFrames too.
    data_frame = load_object(file_path)

    # --- Load and apply metadata, when a metadata file exists ---
    if not ps._is_metadata_file_present(file_path, extension=metadata_ext):
        # No metadata file: warn and fall through to return the frame.
        logger.warning('There is no metadata file')
    else:
        # Derive the metadata file name from the data file name by
        # swapping the extension, then read it from disk.
        base_name, _ = os.path.splitext(file_path)
        metadata_path = base_name + metadata_ext
        stored_metadata = load_object(metadata_path)

        # Push each stored property into the catalog. The 'key' property
        # goes through set_key, which verifies key integrity first; all
        # other properties are copied as-is.
        for prop, value in six.iteritems(stored_metadata):
            if prop == 'key':
                cm.set_key(data_frame, value)
            else:
                cm.set_property(data_frame, prop, value)

    return data_frame
Пример #4
0
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (uses 'random'
    function from numpy to sample) and returns the sampled DataFrame.
    Further, also copies the properties from the input DataFrame to the output
    DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked up from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (default value is False).
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows
    # in the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.

    # # First, display what metadata is required for this function.
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata.
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Get the sample set for the output table. Use numpy directly: the
    # 'pd.np' alias was deprecated in pandas 0.25 and removed in pandas
    # 2.0, so going through pandas would crash on modern versions.
    import numpy as np
    sample_indices = np.random.choice(len(table),
                                      sample_size,
                                      replace=replace)
    # Sort the indices, ordered by index value.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties.
    cm.init_properties(sampled_table)

    # # If replace is set to True, the sample can contain duplicate rows,
    # so the validity of the key must be re-checked before setting it.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it.
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as-is.
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
Пример #5
0
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (uses 'random'
    function from numpy to sample) and returns the sampled DataFrame.
    Further, also copies the properties from the input DataFrame to the output
    DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked up from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (default value is False).
        verbose (boolean): Flag to indicate whether more detailed information
            about the execution steps should be printed out (default value is
            False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows
    # in the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.

    # # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata.
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Draw the sample indices with numpy. Note: 'pd.np' (previously used
    # here) was deprecated in pandas 0.25 and removed in pandas 2.0; the
    # direct numpy import keeps this working on current pandas.
    import numpy as np
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the indices, ordered by index value.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties.
    cm.init_properties(sampled_table)

    # # If replace is True the sample may contain duplicate rows, so the
    # key must be re-validated before being set on the output.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it.
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as-is.
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
Пример #6
0
 def test_set_key_with_mvals(self):
     # A table whose candidate key column contains missing values:
     # set_key must refuse the key and report failure (False).
     csv_path = os.sep.join([catalog_datasets_path, 'A_mvals.csv'])
     table = pd.read_csv(csv_path)
     result = cm.set_key(table, 'ID')
     self.assertEqual(result, False)
Пример #7
0
 def test_set_key_notin_df(self):
     # Attempt to set a key attribute ('ID1') that is not a column of
     # the loaded table.
     table = pd.read_csv(path_a)
     cm.set_key(table, 'ID1')
Пример #8
0
 def test_set_key_invalid_df(self):
     # Passing None instead of a DataFrame: set_key must reject it.
     invalid_table = None
     cm.set_key(invalid_table, 'ID')
Пример #9
0
 def test_set_key_valid(self):
     # After setting a valid key, the catalog must report it back.
     table = pd.read_csv(path_a)
     cm.set_key(table, 'ID')
     self.assertEqual(cm.get_key(table), 'ID')
Пример #10
0
def read_csv_metadata(file_path, **kwargs):
    """
    Read CSV (comma-separated) file into DataFrame, and update the
    catalog with the metadata read from the same file name with an extension
    specified by the user (with the default value set to '.metadata') or the
    metadata given as key-value arguments.

    Reads the CSV file from the given file path into a pandas DataFrame.
    This function uses 'read_csv' method from pandas to read the CSV file into a
    pandas DataFrame. Further it looks for a file with the same file name but
    with a specific extension. This extension can be given by the user,
    with the default value being '.metadata'. If the metadata file is
    present, the function will read it and update the catalog. If
    the metadata file is not present, the function will issue a warning
    that the metadata file is not present and will
    read the CSV file into a pandas DataFrame.

    The metadata information can also be given as parameters to the function
    (see description of arguments for more details). If given, the function
    will update the catalog with the given information. Further,
    the metadata given in the function takes precedence over the metadata
    given in the file.


    Args:
        file_path (string): CSV file path.

        kwargs (dict): A python dictionary containing key-value arguments. There
            are a few key-value pairs that are specific to read_csv_metadata and
            all the other key-value pairs are passed to pandas read_csv method.
            The keys that are specific to read_csv_metadata are: (1)
            metadata_extn, (2) key, (3) fk_ltable, (4) fk_rtable, (5) ltable,
            and (6) rtable. Here the metadata_extn is the expected metadata
            extension (with the default value set to '.metadata'), and all
            the others are metadata related to the DataFrame read from the
            CSV file.

    Returns:
        A pandas DataFrame read from the given CSV file.

    Raises:
        AssertionError: If the input file path is not of type string.
        AssertionError: If a file does not exist in the given file path.

    """
    # Validate the input parameters.

    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid. Note: the exception message must
    # be formatted with '%' here — AssertionError does not apply printf-style
    # formatting to its arguments the way logger.error does.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified then set the extension to '.metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)

    # Else issue a warning that the metadata file is not present.
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method; remaining kwargs are left for pandas read_csv.
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog (set_key
    # verifies key integrity before recording it).
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    # Update the catalog with other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)
    # Ensure the DataFrame has a catalog entry even when no metadata at
    # all was set above.
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame.
    return data_frame
Пример #11
0
def read_csv_metadata(file_path, **kwargs):
    """
    Read CSV (comma-separated) file into DataFrame, and update the
    catalog with the metadata read from the same file name with an extension
    specified by the user (with the default value set to '.metadata') or the
    metadata given as key-value arguments.

    Reads the CSV file from the given file path into a pandas DataFrame.
    This function uses 'read_csv' method from pandas to read the CSV file into a
    pandas DataFrame. Further it looks for a file with the same file name but
    with a specific extension. This extension can be given by the user,
    with the default value being '.metadata'. If the metadata file is
    present, the function will read it and update the catalog. If
    the metadata file is not present, the function will issue a warning
    that the metadata file is not present and will
    read the CSV file into a pandas DataFrame.

    The metadata information can also be given as parameters to the function
    (see description of arguments for more details). If given, the function
    will update the catalog with the given information. Further,
    the metadata given in the function takes precedence over the metadata
    given in the file.


    Args:
        file_path (string): CSV file path.

        kwargs (dict): A python dictionary containing key-value arguments. There
            are a few key-value pairs that are specific to read_csv_metadata and
            all the other key-value pairs are passed to pandas read_csv method.
            The keys that are specific to read_csv_metadata are: (1)
            metadata_extn, (2) key, (3) fk_ltable, (4) fk_rtable, (5) ltable,
            and (6) rtable. Here the metadata_extn is the expected metadata
            extension (with the default value set to '.metadata'), and all
            the others are metadata related to the DataFrame read from the
            CSV file.

    Returns:
        A pandas DataFrame read from the given CSV file.

    Raises:
        AssertionError: If the input file path is not of type string.
        AssertionError: If a file does not exist in the given file path.

    """
    # Validate the input parameters.

    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid. The error message is interpolated
    # with '%' explicitly: unlike logger.error, AssertionError would just
    # store ('...%s', path) as a two-element args tuple, never formatting it.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified then set the extension to '.metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)

    # Else issue a warning that the metadata file is not present.
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method; everything else stays in kwargs for pandas read_csv.
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog (set_key
    # checks key integrity before setting it).
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    # Update the catalog with other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)
    # Make sure a catalog entry exists even when no metadata was recorded.
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame.
    return data_frame