def test_check_fk_constraint_valid_1(self):
     """
     Check that both foreign keys of the candidate set pass the FK check.

     The three tables are read from the CSV paths ``path_a``, ``path_b``
     and ``path_c``, which are defined outside this block.
     """
     ltable = pd.read_csv(path_a)
     rtable = pd.read_csv(path_b)
     candset = pd.read_csv(path_c)

     # Left foreign key checked against the 'ID' column of the left table.
     left_status = ch.check_fk_constraint(candset, 'ltable_ID', ltable, 'ID')
     self.assertEqual(left_status, True)

     # Right foreign key checked against the 'ID' column of the right table.
     right_status = ch.check_fk_constraint(candset, 'rtable_ID', rtable, 'ID')
     self.assertEqual(right_status, True)
Пример #2
0
def _add_output_attributes(candset,
                           fk_ltable,
                           fk_rtable,
                           ltable=None,
                           rtable=None,
                           l_key=None,
                           r_key=None,
                           l_output_attrs=None,
                           r_output_attrs=None,
                           l_output_prefix='ltable_',
                           r_output_prefix='rtable_',
                           validate=True):
    """
    Append prefixed ltable/rtable attribute columns to a candidate set.

    Args:
        candset (DataFrame): Candidate set to extend.
        fk_ltable (string): Column of candset passed (as a Series) to
            create_proj_dataframe for the left table.
        fk_rtable (string): Column of candset passed (as a Series) to
            create_proj_dataframe for the right table.
        ltable, rtable: Tables the output attributes are pulled from;
            each is required only when the matching output attrs are given.
        l_key, r_key: Key attributes of ltable / rtable; each is required
            only when the matching output attrs are given.
        l_output_attrs, r_output_attrs: Attribute names to pull; None
            skips that side entirely.
        l_output_prefix, r_output_prefix (string): Prefixes prepended to
            the pulled attribute names to form the new column names.
        validate (boolean): When True, check_fk_constraint is called for
            each side that has output attrs.

    Returns:
        candset, column-wise concatenated with the left and then the right
        projected frames (whichever were requested).

    Raises:
        AssertionError: If candset is not a pandas DataFrame, if either
            foreign key name is not a string, or if output attrs are given
            for a side whose table or key is None.
    """
    if not isinstance(candset, pd.DataFrame):
        message = 'Input object is not of type pandas data frame'
        logger.error(message)
        raise AssertionError(message)

    if not isinstance(fk_ltable, six.string_types):
        message = 'fk_ltable is not of type string'
        logger.error(message)
        raise AssertionError(message)

    if not isinstance(fk_rtable, six.string_types):
        message = 'fk_rtable is not of type string'
        logger.error(message)
        raise AssertionError(message)

    # Projected frames are gathered first and attached only after both sides
    # have been processed, so the foreign key columns are always read from
    # the candset exactly as it was passed in.
    extra_frames = []

    if l_output_attrs is not None:
        if ltable is None:
            message = 'ltable is not given to pull l_output_attrs'
            logger.error(message)
            raise AssertionError(message)
        if l_key is None:
            message = 'ltable key cannot be None'
            logger.error(message)
            raise AssertionError(message)

        if validate:
            # NOTE(review): the value returned by check_fk_constraint is
            # discarded here -- confirm that is intentional.
            check_fk_constraint(candset, fk_ltable, ltable, l_key)
        left_names = [l_output_prefix + attr for attr in l_output_attrs]
        extra_frames.append(
            create_proj_dataframe(ltable, l_key, candset[fk_ltable],
                                  l_output_attrs, left_names))

    if r_output_attrs is not None:
        if rtable is None:
            message = 'rtable is not given to pull r_output_attrs'
            logger.error(message)
            raise AssertionError(message)
        if r_key is None:
            message = 'rtable key cannot be None'
            logger.error(message)
            raise AssertionError(message)

        if validate:
            # NOTE(review): return value discarded, as on the left side.
            check_fk_constraint(candset, fk_rtable, rtable, r_key)
        right_names = [r_output_prefix + attr for attr in r_output_attrs]
        extra_frames.append(
            create_proj_dataframe(rtable, r_key, candset[fk_rtable],
                                  r_output_attrs, right_names))

    # Attach one frame at a time (left first, then right).
    for frame in extra_frames:
        candset = pd.concat([candset, frame], axis=1)
    return candset
def _add_output_attributes(candset, fk_ltable, fk_rtable, ltable=None, rtable=None,
                           l_key=None, r_key=None,
                           l_output_attrs=None, r_output_attrs=None,
                           l_output_prefix='ltable_', r_output_prefix='rtable_',
                           validate=True):
    """
    Extend a candidate set with prefixed attributes from ltable and rtable.

    For each side whose output attrs are not None, the side's table and key
    must be given; a frame is built with create_proj_dataframe from the
    candset's foreign key column and is concatenated column-wise onto the
    candset (left first, then right).

    Returns:
        The candset with the requested columns appended.

    Raises:
        AssertionError: If candset is not a pandas DataFrame, if fk_ltable
            or fk_rtable is not a string, or if a table/key needed for the
            requested output attrs is None.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # Both foreign key names must be strings; the left one is checked first.
    fk_checks = ((fk_ltable, 'fk_ltable is not of type string'),
                 (fk_rtable, 'fk_rtable is not of type string'))
    for fk_name, complaint in fk_checks:
        if not isinstance(fk_name, six.string_types):
            logger.error(complaint)
            raise AssertionError(complaint)

    want_left = l_output_attrs is not None
    want_right = r_output_attrs is not None

    if want_left:
        if ltable is None:
            logger.error('ltable is not given to pull l_output_attrs')
            raise AssertionError('ltable is not given to pull l_output_attrs')
        if l_key is None:
            logger.error('ltable key cannot be None')
            raise AssertionError('ltable key cannot be None')
        if validate:
            # NOTE(review): the result of check_fk_constraint is not
            # inspected -- confirm whether a failed check should abort.
            check_fk_constraint(candset, fk_ltable, ltable, l_key)
        left_columns = [l_output_prefix + name for name in l_output_attrs]
        left_frame = create_proj_dataframe(
            ltable, l_key, candset[fk_ltable], l_output_attrs, left_columns)

    if want_right:
        if rtable is None:
            logger.error('rtable is not given to pull r_output_attrs')
            raise AssertionError('rtable is not given to pull r_output_attrs')
        if r_key is None:
            logger.error('rtable key cannot be None')
            raise AssertionError('rtable key cannot be None')
        if validate:
            # NOTE(review): result not inspected, same as the left side.
            check_fk_constraint(candset, fk_rtable, rtable, r_key)
        right_columns = [r_output_prefix + name for name in r_output_attrs]
        right_frame = create_proj_dataframe(
            rtable, r_key, candset[fk_rtable], r_output_attrs, right_columns)

    # Concatenation is deferred until both projections exist so that the
    # foreign key columns above are read from the unmodified candset.
    if want_left:
        candset = pd.concat([candset, left_frame], axis=1)
    if want_right:
        candset = pd.concat([candset, right_frame], axis=1)
    return candset
 def test_check_fk_constraint_invalid_attr_mval(self):
     """
     A missing value in the foreign key column must fail the FK check.

     The first row's 'ltable_ID' is overwritten with NaN before the check,
     and the check is expected to return False.
     """
     A = pd.read_csv(path_a)
     # B is loaded like in the sibling tests but is not used by this check.
     B = pd.read_csv(path_b)
     C = pd.read_csv(path_c)
     # DataFrame.ix and the pd.np alias have been removed from pandas;
     # label-based .loc on the default integer index addresses the same
     # cell, and float('nan') is the same missing-value marker.
     C.loc[0, 'ltable_ID'] = float('nan')
     status = ch.check_fk_constraint(C, 'ltable_ID', A, 'ID')
     self.assertEqual(status, False)
def validate_and_set_fk_rtable(foreign_data_frame, foreign_key_rtable, rtable,
                               rtable_key):
    """
    Validates and sets the foreign key rtable for a DataFrame in the catalog.

    The foreign key constraint between the foreign DataFrame and the base
    (right) table is checked with ch.check_fk_constraint. Only if that check
    succeeds is the 'fk_rtable' property set for the foreign DataFrame;
    otherwise a warning is logged and nothing is set.

    Args:
        foreign_data_frame (DataFrame): DataFrame containing the foreign key
            (typically a candidate set, for example output from blocking two
            tables).
        foreign_key_rtable (string): An attribute in the foreign DataFrame.
        rtable (DataFrame): Base DataFrame, in which the foreign key
            attribute would form the primary key.
        rtable_key (string): An attribute in the base table
            (typically a primary key attribute).

    Returns:
        The value returned by set_property if the foreign key constraint
        check succeeded; False if it did not.

    Raises:
        Nothing is raised directly here. Input validation is delegated to
        ch.check_fk_constraint (presumably an AssertionError on inputs of
        the wrong type -- confirm against that function).
    """
    # Input validation is left to check_fk_constraint.
    constraint_holds = ch.check_fk_constraint(foreign_data_frame,
                                              foreign_key_rtable,
                                              rtable, rtable_key)

    # Constraint violated: warn and leave the catalog untouched.
    if not constraint_holds:
        logger.warning('FK constraint for fk_rtable is not satisfied; Not '
                       'setting the fk_rtable and rtable')
        return False

    # Constraint satisfied: record the foreign key in the catalog.
    return set_property(foreign_data_frame, 'fk_rtable',
                        foreign_key_rtable)
def _validate_metadata_for_candset(candset, key, foreign_key_ltable,
                                   foreign_key_rtable, ltable, rtable,
                                   ltable_key, rtable_key, lgr, verbose):
    """
    Validates metadata for a candidate set.

    The inputs are checked first (types and presence of the named columns),
    then the candset is validated as a table via
    _validate_metadata_for_table, and finally the foreign key constraints
    against the left and the right table are checked, in that order.

    Args:
        candset (DataFrame): Candidate set whose metadata is validated.
        key (string): Key attribute expected in the candset.
        foreign_key_ltable (string): Candset attribute referring to ltable.
        foreign_key_rtable (string): Candset attribute referring to rtable.
        ltable (DataFrame): Left base table.
        rtable (DataFrame): Right base table.
        ltable_key (string): Key attribute expected in ltable.
        rtable_key (string): Key attribute expected in rtable.
        lgr: Logger object handed to ch.log_info and
            _validate_metadata_for_table.
        verbose: Flag passed through to the same helpers.

    Returns:
        True if every check passed.

    Raises:
        AssertionError: If candset, ltable or rtable is not a pandas
            DataFrame, or if a foreign key constraint check fails.
        KeyError: If key, foreign_key_ltable or foreign_key_rtable is not
            present in candset, or ltable_key / rtable_key is not present
            in its table.
    """
    # Validate input parameters
    # # We expect candset to be of type pandas DataFrame
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas DataFrame')
        raise AssertionError('Input candset is not of type pandas DataFrame')

    # Check if the key column is present in the candset
    if not ch.check_attrs_present(candset, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Check if the foreign key ltable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_ltable):
        logger.error('Input foreign_key_ltable ( %s ) not in the DataFrame' %
                     foreign_key_ltable)
        raise KeyError('Input foreign_key_ltable ( %s ) not in the DataFrame' %
                       foreign_key_ltable)

    # Check if the foreign key rtable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_rtable):
        logger.error('Input fk_rtable ( %s ) not in the DataFrame' %
                     foreign_key_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the DataFrame' %
                       foreign_key_rtable)

    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    # We expect the ltable key to be present in the ltable
    if not ch.check_attrs_present(ltable, ltable_key):
        logger.error('ltable key ( %s ) not in ltable' % ltable_key)
        raise KeyError('ltable key ( %s ) not in ltable' % ltable_key)

    # We expect the rtable key to be present in the rtable
    if not ch.check_attrs_present(rtable, rtable_key):
        logger.error('rtable key ( %s ) not in rtable' % rtable_key)
        raise KeyError('rtable key ( %s ) not in rtable' % rtable_key)

    # First validate metadata for the candidate set (as a table)
    _validate_metadata_for_table(candset, key, 'candset', lgr, verbose)

    # Second check foreign key constraints. Each progress message brackets
    # the check it describes, so a failure in the right-table check is
    # reported only after the left-table check has been logged as done.
    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_ltable, ltable,
                                  ltable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the left table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the left table')
    ch.log_info(lgr, '..... Done', verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_rtable, rtable,
                                  rtable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the right table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the right table')
    ch.log_info(lgr, '..... Done', verbose)

    return True
 def test_check_fk_constraint_invalid_attr_notin(self):
     """
     The FK check must return False when the base attribute name ('ID1')
     is not the one used by the other tests ('ID').
     """
     ltable = pd.read_csv(path_a)
     # The right table is read as in the sibling tests but is not used here.
     rtable = pd.read_csv(path_b)
     candset = pd.read_csv(path_c)

     outcome = ch.check_fk_constraint(candset, 'ltable_ID', ltable, 'ID1')
     self.assertEqual(outcome, False)
 def test_check_fk_constraint_invalid_foreign_attr(self):
     """
     Call the FK check with None as the foreign key attribute.

     NOTE(review): nothing is asserted in this body; presumably an
     expected-exception decorator sits above the def, outside this view --
     confirm.
     """
     foreign_df = pd.DataFrame()
     base_df = pd.DataFrame()
     ch.check_fk_constraint(foreign_df, None, base_df, 'ID')
 def test_check_fk_constraint_invalid_base_attr(self):
     """
     Call the FK check with None as the base table attribute.

     NOTE(review): nothing is asserted in this body; presumably an
     expected-exception decorator sits above the def, outside this view --
     confirm.
     """
     foreign_df = pd.DataFrame()
     base_df = pd.DataFrame()
     ch.check_fk_constraint(foreign_df, 'rtable_ID', base_df, None)
Пример #10
0
 def test_check_fk_constraint_invalid_foreign_df(self):
     """
     Call the FK check with None in place of the foreign DataFrame.

     NOTE(review): nothing is asserted in this body; presumably an
     expected-exception decorator sits above the def, outside this view --
     confirm.
     """
     base_df = pd.DataFrame()
     ch.check_fk_constraint(None, 'rtable_ID', base_df, 'ID')