def init_properties(data_frame):
    """
    Registers a pandas DataFrame in the catalog with empty properties.

    The input type is validated here; the actual work is delegated to the
    catalog object's own init_properties method.

    Args:
        data_frame (DataFrame): The DataFrame whose catalog entry should be
            initialized.

    Returns:
        The value returned by the catalog object's init_properties method
        (typically True when the initialization was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.

    """
    # Reject anything that is not a pandas DataFrame before the catalog is
    # touched.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    # Delegate to the catalog and relay whatever it reports.
    return Catalog.Instance().init_properties(data_frame)
def show_properties_for_id(object_id):
    """
    Prints the catalog properties stored for a given object id.

    The id is typically obtained by calling id(<object>) on a pandas
    DataFrame. The id itself is printed first, followed by one line per
    property.

    Args:
        object_id (int): The Python identifier of an object (typically a
            pandas DataFrame).

    """
    properties = Catalog.Instance().get_all_properties_for_id(object_id)

    # Header line: the id that was looked up.
    print('id: ' + str(object_id))

    for name, value in six.iteritems(properties):
        if not isinstance(value, six.string_types):
            # Non-string values (e.g. other objects) are shown by their
            # Python id rather than by their contents.
            line = name + "(obj.id): " + str(id(value))
        else:
            # String values are printed verbatim.
            line = name + ": " + value
        print(line)
def is_dfinfo_present(data_frame):
    """
    Tells whether the catalog holds information about a DataFrame.

    Args:
        data_frame (DataFrame): The DataFrame to look up in the catalog.

    Returns:
        The value returned by the catalog object's
        is_df_info_present_in_catalog method: True if the DataFrame is
        present in the catalog, else False.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.

    """
    # Only pandas DataFrames can be looked up.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas data frame'
        logger.error(error_message)
        raise AssertionError(error_message)

    # Delegate the lookup to the catalog and relay its answer.
    return Catalog.Instance().is_df_info_present_in_catalog(data_frame)
def get_property(data_frame, property_name):
    """
    Looks up a single named property of a pandas DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame whose property is requested.
        property_name (string): The name of the property to retrieve.

    Returns:
        The Python object stored under `property_name` for the DataFrame
        (typically a string or a pandas DataFrame, depending on the
        property).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the catalog.
        KeyError: If the requested property is not present for
            `data_frame` in the catalog.
    """
    # Type checks come first, in the same order as the arguments.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    if not isinstance(property_name, six.string_types):
        error_message = 'Property name is not of type string'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # The DataFrame must already be known to the catalog ...
    if not catalog.is_df_info_present_in_catalog(data_frame):
        error_message = 'DataFrame information is not present in the catalog'
        logger.error(error_message)
        raise KeyError(error_message)

    # ... and it must carry the requested property.
    if not catalog.is_property_present_for_df(data_frame, property_name):
        error_message = (
            'Requested metadata ( %s ) for the given DataFrame is not '
            'present in the catalog' % property_name)
        logger.error(error_message)
        raise KeyError(error_message)

    return catalog.get_property(data_frame, property_name)
def del_catalog():
    """
    Removes the catalog of the current session.

    Returns:
        The value returned by the catalog object's del_catalog method
        (True if the deletion was successful).
    """
    # Pure delegation: the catalog object performs the deletion and its
    # result is relayed unchanged.
    return Catalog.Instance().del_catalog()
def del_property(data_frame, property_name):
    """
    Removes one named property of a pandas DataFrame from the catalog.

    Args:
        data_frame (DataFrame): The DataFrame whose property should be
            deleted from the catalog.
        property_name (string): The name of the property to delete.

    Returns:
        The value returned by the catalog object's del_property method
        (typically True if the deletion was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the catalog.
        KeyError: If the requested property is not present for the
            DataFrame in the catalog.
    """
    # Type checks come first, in the same order as the arguments.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    if not isinstance(property_name, six.string_types):
        error_message = 'Property name is not of type string'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # Nothing can be deleted for a DataFrame the catalog does not know.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        error_message = 'DataFrame information is not present in the catalog'
        logger.error(error_message)
        raise KeyError(error_message)

    # The property itself must exist before it can be removed.
    if not catalog.is_property_present_for_df(data_frame, property_name):
        error_message = (
            'Requested metadata ( %s ) for the given DataFrame is '
            'not present in the catalog' % property_name)
        logger.error(error_message)
        raise KeyError(error_message)

    # Let the catalog perform the deletion and relay its result.
    return catalog.del_property(data_frame, property_name)
def get_catalog_len():
    """
    Reports how many entries the catalog currently holds.

    Returns:
        The value returned by the catalog object's get_catalog_len method
        (the number of entries in the catalog as an integer).

    """
    # Pure delegation: the catalog object computes its own length.
    return Catalog.Instance().get_catalog_len()
def set_property(data_frame, property_name, property_value):
    """
    Stores a named property value for a pandas DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property is set.
        property_name (string): The name of the property to set.
        property_value (object): The value to store. This is typically a
            string (such as key) or a pandas DataFrame (such as ltable,
            rtable).

    Returns:
        The value returned by the catalog object's set_property method
        (typically True if the update was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.

    Note:
        A DataFrame that is not yet present in the catalog is not an error:
        an entry is created for it first and the property is then set.

    """
    # Type checks come first, in the same order as the arguments.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas data frame'
        logger.error(error_message)
        raise AssertionError(error_message)

    if not isinstance(property_name, six.string_types):
        error_message = 'Property name is not of type string'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # Create the catalog entry on demand so that the property has somewhere
    # to live.
    already_registered = catalog.is_df_info_present_in_catalog(data_frame)
    if not already_registered:
        catalog.init_properties(data_frame)

    # Store the value and relay the catalog's result.
    return catalog.set_property(data_frame, property_name, property_value)
def is_catalog_empty():
    """
    Tells whether the catalog currently has no entries.

    Returns:
        The value returned by the catalog object's is_catalog_empty method
        (True if the catalog is empty, else False).

    """
    # Pure delegation: the catalog object knows whether it is empty.
    return Catalog.Instance().is_catalog_empty()
def get_catalog():
    """
    Returns the catalog information of the current session.

    Returns:
        The value returned by the catalog object's get_catalog method: a
        Python dictionary containing the catalog information.

        Specifically, the dictionary is keyed by the Python identifier of a
        DataFrame (obtained by id(DataFrame object)) and maps each
        identifier to that DataFrame's properties.
    """
    # Pure delegation: the catalog object hands out its own contents.
    return Catalog.Instance().get_catalog()
def is_property_present_for_df(data_frame, property_name):
    """
    Checks if the given property is present for the given DataFrame in the
    catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property must be
            checked for.
        property_name (string): The name of the property that should be
            checked for its presence for the DataFrame, in the catalog.

    Returns:
        The value returned by the catalog object's
        is_property_present_for_df method: True if the property is present
        for the given DataFrame.

    Raises:
        AssertionError: If `data_frame` is not of type pandas
         DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` is not present in the catalog.
    """
    # Input validations

    # # We expect the input object to be of type pandas DataFrame.
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # # The input property name should be of type string
    if not isinstance(property_name, six.string_types):
        logger.error('The property name is not of type string.')
        raise AssertionError('The property name is not of type string.')

    # Get the catalog instance
    catalog = Catalog.Instance()

    # Check if the given DataFrame information is present in the catalog. If
    # not, raise an error.
    # A truthiness test is used (rather than `is False`) so that any falsy
    # answer from the catalog is treated as "not present", matching the
    # other functions that perform this same check.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        logger.error('DataFrame information is not present in the catalog')
        raise KeyError('DataFrame information is not present in the catalog')

    # Call the underlying catalog object's function to check if the property
    # is present for the given DataFrame. Relay the return value from that
    # function.
    return catalog.is_property_present_for_df(data_frame, property_name)
def del_all_properties(data_frame):
    """
    Removes every property of a DataFrame from the catalog.

    Args:
        data_frame (DataFrame): The DataFrame whose properties should all be
            deleted from the catalog.

    Returns:
        The value returned by the catalog object's del_all_properties
        method (True if the deletion from the catalog was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        KeyError: If the DataFrame information is not present in the catalog.

    Note:
        This is not the same as init_properties. Here the DataFrame's entry
        is removed from the catalog, whereas init_properties adds the
        DataFrame (if it is not present in the catalog) and initializes its
        properties to an empty object (specifically, an empty Python
        dictionary).
    """
    # Only pandas DataFrames have catalog entries.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas data frame'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # There is nothing to delete for a DataFrame the catalog does not know.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        error_message = 'DataFrame information is not present in the catalog'
        logger.error(error_message)
        raise KeyError(error_message)

    # Let the catalog perform the deletion and relay its result.
    return catalog.del_all_properties(data_frame)
def get_all_properties(data_frame):
    """
    Fetches every property stored for a pandas DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame whose properties are
            requested.

    Returns:
        The value returned by the catalog object's get_all_properties
        method: a dictionary containing the properties of the input
        DataFrame.

    Raises:
        AssertionError: If the input object is not of type pandas DataFrame.
        KeyError: If the information about the DataFrame is not present in
            the catalog.

    """
    # Only pandas DataFrames have catalog entries.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # A DataFrame unknown to the catalog has no properties to return.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        error_message = 'DataFrame information is not present in the catalog'
        logger.error(error_message)
        raise KeyError(error_message)

    # Hand the stored properties back to the caller.
    return catalog.get_all_properties(data_frame)
# Example no. 14
# 0
from py_entitymatching.catalog.catalog import Catalog

# Version string of the package.
__version__ = '0.1.0'

# Fetch the Catalog instance once, at import time.
# NOTE(review): presumably Catalog.Instance() hands back one shared,
# session-wide object -- confirm against the Catalog class.
_catalog = Catalog.Instance()

# downsampling related methods

from py_entitymatching.sampler.down_sample import down_sample
# # io related methods
#
from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata
from py_entitymatching.io.pickles import load_object, load_table, save_object, save_table
#
# import catalog related methods
from py_entitymatching.catalog.catalog_manager import get_property, get_all_properties, \
    set_property, del_property, del_all_properties, init_properties, copy_properties
from py_entitymatching.catalog.catalog_manager import get_catalog, del_catalog, \
    get_catalog_len, show_properties, show_properties_for_id
from py_entitymatching.catalog.catalog_manager import is_property_present_for_df, \
    is_dfinfo_present, is_catalog_empty
from py_entitymatching.catalog.catalog_manager import get_key, set_key, set_fk_ltable,\
    set_fk_rtable, get_ltable, get_rtable, validate_and_set_fk_ltable, \
    validate_and_set_fk_rtable, set_ltable, set_rtable, get_fk_rtable,  \
    get_fk_ltable


#
# # blockers
from py_entitymatching.blocker.attr_equiv_blocker import AttrEquivalenceBlocker
def copy_properties(source_data_frame, target_data_frame, replace=True):
    """
    Copies properties from a source DataFrame to target DataFrame in the
    catalog.

    Args:
        source_data_frame (DataFrame): The DataFrame from which the properties
            to be copied from, in the catalog.
        target_data_frame (DataFrame): The DataFrame to which the properties
            to be copied to, in the catalog.
        replace (boolean): A flag to indicate whether the source
            DataFrame's  properties can replace the target
            DataFrame's properties in the catalog. The default value for the
            flag is True.
            Specifically, if the target DataFrame's information is already
            present in the catalog then the function will check if the
            replace flag is True. If the flag is set to True, then the
            function will first delete the existing properties and then set
            it with the source DataFrame properties.
            If the flag is False, the function will just return without
            modifying the existing properties.

    Returns:
        The value returned by set_properties: True if the copying was
        successful, or False if the target DataFrame is already present in
        the catalog and `replace` is False (nothing is copied in that case).

    Raises:
        AssertionError: If `source_data_frame` is not of
            type pandas DataFrame.
        AssertionError: If `target_data_frame` is not of
            type pandas DataFrame.
        KeyError: If source DataFrame is not present in the
            catalog.


    """
    # Validate input parameters

    # # The source_data_frame is expected to be of type pandas DataFrame
    if not isinstance(source_data_frame, pd.DataFrame):
        logger.error('Input object (source_data_frame) is not of type pandas '
                     'DataFrame')
        raise AssertionError(
            'Input object (source_data_frame) is not of type pandas DataFrame')

    # # The target_data_frame is expected to be of type pandas DataFrame
    if not isinstance(target_data_frame, pd.DataFrame):
        logger.error('Input object (target_data_frame) is not of type pandas '
                     'DataFrame')
        raise AssertionError('Input object (target_data_frame) is not  of '
                             'type pandas DataFrame')

    # Get the catalog instance
    catalog = Catalog.Instance()

    # Check if the source DataFrame information is present in the catalog. If
    #  not raise an error.
    # A truthiness test is used (rather than `is False`) so that any falsy
    # answer from the catalog is treated as "not present", matching the
    # other functions that perform this same check.
    if not catalog.is_df_info_present_in_catalog(source_data_frame):
        logger.error(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')
        raise KeyError(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')

    # Get all properties for the source DataFrame
    metadata = catalog.get_all_properties(source_data_frame)

    # Set the properties to the target DataFrame. Specifically, call the set
    # properties function and relay its return value. That call also
    # initializes the target DataFrame's entry in the catalog.

    # Note: There is a redundancy in validating the input parameters. This
    # might have a slight performance impact, but we don't expect that this
    # function gets called so often.
    return set_properties(target_data_frame, metadata, replace)
def set_properties(data_frame, properties, replace=True):
    """
    Stores a whole dictionary of properties for a DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the properties are
            set.
        properties (dict): A Python dictionary mapping property names to
            Python objects (typically strings or DataFrames).
        replace (Optional[bool]): Whether the given properties may replace
            properties already stored for the DataFrame. The default value
            is True.
            When the DataFrame is already present in the catalog and the
            flag is True, the existing properties are reset before the
            given ones are set. When the flag is False, a warning is logged
            and the existing properties are left untouched.

    Returns:
        True if the properties were set for the given DataFrame; False if
        the DataFrame was already present in the catalog and `replace` was
        False.

    Raises:
        AssertionError: If the input data_frame object is not of type pandas
            DataFrame.
        AssertionError: If the input properties object is not of type Python
            dictionary.

    """
    # Type checks come first, in the same order as the arguments.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    if not isinstance(properties, dict):
        error_message = 'The properties should be of type Python dictionary'
        logger.error(error_message)
        raise AssertionError(error_message)

    catalog = Catalog.Instance()

    # An existing entry is only overwritten when the caller allows it;
    # otherwise warn and leave the catalog as it is.
    already_registered = catalog.is_df_info_present_in_catalog(data_frame)
    if already_registered and not replace:
        logger.warning(
            'Properties already exists for df ( %s ). Not replacing it' %
            str(id(data_frame)))
        return False

    # Both remaining cases (new DataFrame, or existing one with replace
    # allowed) start from a freshly initialized entry.
    catalog.init_properties(data_frame)

    # Store the given properties one by one. The contents are not checked
    # for correctness (e.g. whether a property 'key' really is a key).
    for name, value in six.iteritems(properties):
        catalog.set_property(data_frame, name, value)

    return True