def test_check_table_order_invalid_df2(self):
     """Call ``afg._check_table_order`` with ``None`` as the second table.

     No assertion is made in this body; the expected outcome is presumably
     declared outside it (e.g. an expected-exception decorator) -- TODO
     confirm against the full test module.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     left_types = au.get_attr_types(left_table)
     right_types = au.get_attr_types(right_table)
     corres = au.get_attr_corres(left_table, right_table)
     # The second table argument is deliberately None.
     status = afg._check_table_order(left_table, None, left_types,
                                     right_types, corres)
 def test_validate_attr_types_invalid_corres(self):
     """Call ``afg.validate_attr_types`` with ``None`` as the correspondence.

     No assertion is made in this body; the expected outcome is presumably
     declared outside it (e.g. an expected-exception decorator) -- TODO
     confirm against the full test module.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     left_types = au.get_attr_types(left_table)
     right_types = au.get_attr_types(right_table)
     # No attribute correspondence is computed; None is passed in its place.
     response = afg.validate_attr_types(left_types, right_types, None)
Пример #3
0
 def test_validate_attr_types_invalid_corres(self):
     """Exercise ``afg.validate_attr_types`` without a correspondence.

     The third argument is ``None``. This body asserts nothing, so the
     expected result is presumably stated elsewhere (e.g. by a decorator)
     -- TODO confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = au.get_attr_types(table_a)
     types_b = au.get_attr_types(table_b)
     # The correspondence is intentionally left out and replaced by None.
     response = afg.validate_attr_types(types_a, types_b, None)
Пример #4
0
 def test_check_table_order_invalid_df2(self):
     """Exercise ``afg._check_table_order`` with ``None`` as the second table.

     This body asserts nothing, so the expected result is presumably stated
     elsewhere (e.g. by a decorator) -- TODO confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = au.get_attr_types(table_a)
     types_b = au.get_attr_types(table_b)
     corres = au.get_attr_corres(table_a, table_b)
     # None stands in for the second table on purpose.
     status = afg._check_table_order(table_a, None, types_a, types_b, corres)
 def test_check_table_order_valid(self):
     """Check ``afg._check_table_order`` on consistently ordered inputs.

     The tables, their attribute types and the correspondence are all
     derived in (A, B) order; the returned status must equal True.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     left_types = au.get_attr_types(left_table)
     right_types = au.get_attr_types(right_table)
     corres = au.get_attr_corres(left_table, right_table)
     status = afg._check_table_order(left_table, right_table, left_types,
                                     right_types, corres)
     self.assertEqual(status, True)
Пример #6
0
 def test_check_table_order_valid(self):
     """Verify the table-order check passes for inputs built in (A, B) order.

     ``afg._check_table_order`` receives both tables, their attribute types
     and the correspondence computed from the same (A, B) pair; the result
     must equal True.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = au.get_attr_types(table_a)
     types_b = au.get_attr_types(table_b)
     corres = au.get_attr_corres(table_a, table_b)
     status = afg._check_table_order(table_a, table_b, types_a, types_b, corres)
     self.assertEqual(status, True)
 def test_get_features_invalid_ltable_rtable_switch(self):
     """Call ``afg.get_features`` with a correspondence built in swapped order.

     The correspondence is computed from (B, A) while the tables and their
     attribute types are passed as (A, B). No assertion is made in this
     body; the expected outcome is presumably declared outside it (e.g. an
     expected-exception decorator) -- TODO confirm.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     left_types = au.get_attr_types(left_table)
     right_types = au.get_attr_types(right_table)
     # Deliberately swapped: right table first, left table second.
     swapped_corres = au.get_attr_corres(right_table, left_table)
     tokenizers = get_tokenizers_for_matching()
     sim_funs = get_sim_funs_for_matching()
     feat_table = afg.get_features(left_table, right_table, left_types,
                                   right_types, swapped_corres, tokenizers,
                                   sim_funs)
 def test_check_table_order_invalid_attrcorres_ltable(self):
     """Check ``afg._check_table_order`` when the correspondence's 'ltable' is replaced.

     The 'ltable' entry of the correspondence is overwritten with an empty
     DataFrame; the returned status must equal False.
     """
     left_table = read_csv_metadata(path_a)
     right_table = read_csv_metadata(path_b, key='ID')
     left_types = au.get_attr_types(left_table)
     right_types = au.get_attr_types(right_table)
     corres = au.get_attr_corres(left_table, right_table)
     # Swap the recorded left table for an unrelated, empty one.
     corres['ltable'] = pd.DataFrame()
     status = afg._check_table_order(left_table, right_table, left_types,
                                     right_types, corres)
     self.assertEqual(status, False)
Пример #9
0
 def test_check_table_order_invalid_attrcorres_ltable(self):
     """Verify the table-order check fails when 'ltable' in the correspondence is replaced.

     After the correspondence's 'ltable' entry is set to an empty DataFrame,
     ``afg._check_table_order`` must return a value equal to False.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = au.get_attr_types(table_a)
     types_b = au.get_attr_types(table_b)
     corres = au.get_attr_corres(table_a, table_b)
     # Overwrite the recorded left table with an empty DataFrame.
     corres['ltable'] = pd.DataFrame()
     status = afg._check_table_order(table_a, table_b, types_a, types_b, corres)
     self.assertEqual(status, False)
Пример #10
0
 def test_get_features_invalid_ltable_rtable_switch(self):
     """Exercise ``afg.get_features`` with a (B, A) correspondence and (A, B) tables.

     This body asserts nothing, so the expected result is presumably stated
     elsewhere (e.g. by a decorator) -- TODO confirm.
     """
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = au.get_attr_types(table_a)
     types_b = au.get_attr_types(table_b)
     # The correspondence is built with the tables in the opposite order.
     reversed_corres = au.get_attr_corres(table_b, table_a)
     tokenizers = get_tokenizers_for_matching()
     sim_funs = get_sim_funs_for_matching()
     feat_table = afg.get_features(table_a, table_b, types_a, types_b,
                                   reversed_corres, tokenizers, sim_funs)
Пример #11
0
    def test_validate_attr_types_proceed_yes(self):
        """Check that ``afg.validate_attr_types`` does not return None under ``mockInput('y')``.

        ``mockInput('y')`` presumably answers the confirmation prompt with
        'y' -- verify against its definition.
        """
        left_table = read_csv_metadata(path_a)
        right_table = read_csv_metadata(path_b, key='ID')
        left_types = au.get_attr_types(left_table)
        right_types = au.get_attr_types(right_table)
        corres = au.get_attr_corres(left_table, right_table)

        with mockInput('y'):
            status = afg.validate_attr_types(left_types, right_types, corres)
            # The assertion stays inside the ``with`` block, as in the
            # original ordering.
            self.assertEqual(status is None, False)
 def test_get_features_valid(self):
     """Check that ``afg.get_features`` yields a DataFrame of callable features.

     Every entry of the 'function' column is applied to one row from each
     table and must return a value greater than or equal to zero.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = au.get_attr_types(A)
     r_attr_types = au.get_attr_types(B)
     attr_corres = au.get_attr_corres(A, B)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                   attr_corres, tok, sim)
     self.assertIsInstance(feat_table, pd.DataFrame)
     for feature_fn in feat_table['function']:
         # ``DataFrame.ix`` was removed in pandas 1.0. ``.loc`` is its
         # label-based equivalent; this assumes both tables carry an
         # integer index -- TODO confirm against read_csv_metadata.
         score = feature_fn(A.loc[1], B.loc[2])
         self.assertGreaterEqual(score, 0)
Пример #13
0
 def test_get_features_valid(self):
     """Verify ``afg.get_features`` returns a DataFrame whose features are usable.

     Each callable in the 'function' column is invoked on one row from each
     table; the result must be greater than or equal to zero.
     """
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = au.get_attr_types(A)
     r_attr_types = au.get_attr_types(B)
     attr_corres = au.get_attr_corres(A, B)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                   attr_corres, tok, sim)
     self.assertIsInstance(feat_table, pd.DataFrame)
     for feature_fn in feat_table['function']:
         # ``.ix`` no longer exists in pandas >= 1.0; ``.loc`` performs the
         # same label lookup, assuming an integer index on both tables --
         # TODO confirm against read_csv_metadata.
         score = feature_fn(A.loc[1], B.loc[2])
         self.assertGreaterEqual(score, 0)
Пример #14
0
    def test_validate_attr_types_valid(self):
        """Check the table returned by ``afg.validate_attr_types``.

        The call is made under ``mockInput('y')`` (presumably answering the
        confirmation prompt with 'y' -- verify against its definition). The
        result must be a DataFrame whose 'Left Attribute',
        'Example Features', 'Left Attribute Type' and 'Right Attribute Type'
        columns equal the expected Series below.
        """
        left_table = read_csv_metadata(path_a)
        right_table = read_csv_metadata(path_b, key='ID')
        left_types = au.get_attr_types(left_table)
        right_types = au.get_attr_types(right_table)
        corres = au.get_attr_corres(left_table, right_table)

        with mockInput('y'):
            report = afg.validate_attr_types(left_types, right_types, corres)

        self.assertEqual(isinstance(report, pd.DataFrame), True)

        # (column name, expected contents), listed in the order compared.
        expected_columns = [
            ('Left Attribute', pd.Series([
                'ID', 'name', 'birth_year', 'hourly_wage', 'address',
                'zipcode',
            ])),
            ('Example Features', pd.Series([
                'Levenshtein Distance; Levenshtein Similarity',
                'Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]',
                'Exact Match; Absolute Norm',
                'Exact Match; Absolute Norm',
                'Not Applicable: Types do not match',
                'Exact Match; Absolute Norm',
            ])),
            ('Left Attribute Type', pd.Series([
                'short string (1 word)',
                'short string (1 word to 5 words)',
                'numeric',
                'numeric',
                'short string (1 word to 5 words)',
                'numeric',
            ])),
            ('Right Attribute Type', pd.Series([
                'short string (1 word)',
                'short string (1 word to 5 words)',
                'numeric',
                'numeric',
                'medium string (5 words to 10 words)',
                'numeric',
            ])),
        ]

        # Fetch every column before comparing, so a missing column surfaces
        # ahead of any equality check.
        observed_columns = [report[column] for column, _ in expected_columns]

        for (_, expected), observed in zip(expected_columns, observed_columns):
            self.assertEqual(expected.equals(observed), True)
def get_features_for_matching(ltable, rtable, validate_inferred_attr_types=True):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types (defaults to True).

    Returns:
        A pandas DataFrame containing automatically generated features, or
        None if `validate_inferred_attr_types` is True and
        `validate_attr_types` returns None (i.e. the user chose not to
        proceed). In that case the global variables listed below are not
        set.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and  _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and  _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _match_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)

    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1) function, and (2)
        is_auto_generated. The function, points to the actual Python function
        that implements the feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`
     :meth:`py_entitymatching.get_tokenizers_for_matching`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input table A')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input table B')

    # # We expect the validate_inferred_attr_types to be of type boolean
    validate_object_type(validate_inferred_attr_types, bool,
                         'Validate inferred attribute type')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Show the user inferred attribute types and features and request
    # user permission to proceed. If the user does not want to proceed
    # (validate_attr_types returns None), exit without generating features
    # and without touching the exported globals.
    if (validate_inferred_attr_types and
            validate_attr_types(attr_types_ltable, attr_types_rtable,
                                attr_corres) is None):
        return None

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
 def test_get_attr_types_invalid_df(self):
     """Call ``get_attr_types`` with ``None`` instead of a table.

     No assertion is made in this body; the expected outcome is presumably
     declared outside it (e.g. an expected-exception decorator) -- TODO
     confirm.
     """
     attr_types = get_attr_types(None)
def get_features_for_matching(ltable, rtable):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and  _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and  _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _match_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.

    Note:
        In the output DataFrame, two
        attributes demand some explanation: (1) function, and (2)
        is_auto_generated. The function, points to the actual Python function
        that implements the feature. Specifically, the function takes in two
        tuples (one from each input table) and returns a numeric value. The
        attribute is_auto_generated contains either True or False. The flag
        is True only if the feature is automatically generated by py_entitymatching.
        This is important because this flag is used to make some assumptions
        about the semantics of the similarity function used and use that
        information for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`
     :meth:`py_entitymatching.get_tokenizers_for_matching`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError('Input table A is not of type pandas DataFrame')

    # # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError('Input table B is not of type pandas DataFrame')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres, tok_funcs,
                                 sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    # _atypes2 must hold the right table's attribute types; it was
    # previously assigned the left table's types by mistake.
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
 def test_get_attr_types_valid(self):
     """Call ``get_attr_types`` on the table loaded from ``path_a``.

     Nothing is asserted about the result; the test only requires the call
     to complete.
     """
     table = read_csv_metadata(path_a)
     attr_types = get_attr_types(table)
 def test_get_attr_types_valid(self):
     """Run ``get_attr_types`` on the table read from ``path_a``.

     The result is not inspected; the test passes as long as the call
     completes.
     """
     loaded = read_csv_metadata(path_a)
     inferred = get_attr_types(loaded)
 def test_get_attr_types_invalid_df(self):
     """Run ``get_attr_types`` with ``None`` in place of a table.

     This body asserts nothing, so the expected result is presumably stated
     elsewhere (e.g. by a decorator) -- TODO confirm.
     """
     inferred = get_attr_types(None)