def test_check_table_order_invalid_df2(self):
    # _check_table_order must reject a None second table.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    correspondences = au.get_attr_corres(table_a, table_b)
    status = afg._check_table_order(table_a, None, left_types,
                                    right_types, correspondences)
def test_validate_attr_types_invalid_corres(self):
    # validate_attr_types is handed None in place of the attribute
    # correspondences to exercise its invalid-input path.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    response = afg.validate_attr_types(left_types, right_types, None)
def test_validate_attr_types_invalid_corres(self):
    # Pass None instead of the attr_corres mapping; validate_attr_types
    # should take its invalid-correspondence branch.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    types_left = au.get_attr_types(A)
    types_right = au.get_attr_types(B)
    response = afg.validate_attr_types(types_left, types_right, None)
def test_check_table_order_invalid_df2(self):
    # Second DataFrame argument is None; _check_table_order should
    # flag the invalid table.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    atypes_l = au.get_attr_types(A)
    atypes_r = au.get_attr_types(B)
    corres = au.get_attr_corres(A, B)
    status = afg._check_table_order(A, None, atypes_l, atypes_r, corres)
def test_check_table_order_valid(self):
    # With matching tables, types, and correspondences, the order
    # check must succeed.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    correspondences = au.get_attr_corres(table_a, table_b)
    status = afg._check_table_order(table_a, table_b, left_types,
                                    right_types, correspondences)
    self.assertEqual(status, True)
def test_check_table_order_valid(self):
    # Happy path: consistent inputs should make the order check pass.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    atypes_a = au.get_attr_types(A)
    atypes_b = au.get_attr_types(B)
    corres = au.get_attr_corres(A, B)
    result = afg._check_table_order(A, B, atypes_a, atypes_b, corres)
    self.assertEqual(result, True)
def test_get_features_invalid_ltable_rtable_switch(self):
    # Correspondences are deliberately built with (B, A) — the reverse
    # of the tables passed to get_features — to exercise the
    # table-order validation failure.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    swapped_corres = au.get_attr_corres(table_b, table_a)
    tokenizers = get_tokenizers_for_matching()
    sim_funcs = get_sim_funs_for_matching()
    feat_table = afg.get_features(table_a, table_b, left_types,
                                  right_types, swapped_corres,
                                  tokenizers, sim_funcs)
def test_check_table_order_invalid_attrcorres_ltable(self):
    # Corrupt the 'ltable' entry of the correspondences with a fresh
    # (different) DataFrame; the order check must then fail.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    correspondences = au.get_attr_corres(table_a, table_b)
    correspondences['ltable'] = pd.DataFrame()
    status = afg._check_table_order(table_a, table_b, left_types,
                                    right_types, correspondences)
    self.assertEqual(status, False)
def test_check_table_order_invalid_attrcorres_ltable(self):
    # Replacing attr_corres['ltable'] with an empty DataFrame breaks
    # the identity match against the real ltable.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    atypes_a = au.get_attr_types(A)
    atypes_b = au.get_attr_types(B)
    corres = au.get_attr_corres(A, B)
    corres['ltable'] = pd.DataFrame()
    result = afg._check_table_order(A, B, atypes_a, atypes_b, corres)
    self.assertEqual(result, False)
def test_get_features_invalid_ltable_rtable_switch(self):
    # attr_corres built from (B, A) while get_features receives (A, B):
    # the mismatch should be detected.
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    atypes_a = au.get_attr_types(A)
    atypes_b = au.get_attr_types(B)
    reversed_corres = au.get_attr_corres(B, A)
    toks = get_tokenizers_for_matching()
    sims = get_sim_funs_for_matching()
    features = afg.get_features(A, B, atypes_a, atypes_b,
                                reversed_corres, toks, sims)
def test_validate_attr_types_proceed_yes(self):
    # When the (mocked) user answers 'y', validate_attr_types should
    # return a non-None value.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    correspondences = au.get_attr_corres(table_a, table_b)
    with mockInput('y'):
        status = afg.validate_attr_types(left_types, right_types,
                                         correspondences)
    self.assertIsNotNone(status)
def test_get_features_valid(self):
    """get_features returns a DataFrame whose generated feature
    functions yield non-negative values for a sample tuple pair."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    l_attr_types = au.get_attr_types(A)
    r_attr_types = au.get_attr_types(B)
    attr_corres = au.get_attr_corres(A, B)
    tok = get_tokenizers_for_matching()
    sim = get_sim_funs_for_matching()
    feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                  attr_corres, tok, sim)
    self.assertEqual(isinstance(feat_table, pd.DataFrame), True)
    functions = feat_table['function']
    for f in functions:
        # DataFrame.ix was removed in pandas 1.0; with the default
        # RangeIndex, .iloc is the exact positional equivalent.
        x = f(A.iloc[1], B.iloc[2])
        self.assertEqual(x >= 0, True)
def test_get_features_valid(self):
    """Each auto-generated feature function should evaluate to a
    non-negative number on a pair of sample tuples."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    l_attr_types = au.get_attr_types(A)
    r_attr_types = au.get_attr_types(B)
    attr_corres = au.get_attr_corres(A, B)
    tok = get_tokenizers_for_matching()
    sim = get_sim_funs_for_matching()
    feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                  attr_corres, tok, sim)
    self.assertEqual(isinstance(feat_table, pd.DataFrame), True)
    functions = feat_table['function']
    for f in functions:
        # .ix no longer exists in modern pandas; .iloc gives the same
        # positional row lookup here (default RangeIndex).
        x = f(A.iloc[1], B.iloc[2])
        self.assertEqual(x >= 0, True)
def test_validate_attr_types_valid(self):
    # The validation table returned after a 'y' answer should list the
    # expected attribute names, inferred types, and example features.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    left_types = au.get_attr_types(table_a)
    right_types = au.get_attr_types(table_b)
    correspondences = au.get_attr_corres(table_a, table_b)
    with mockInput('y'):
        validate_table = afg.validate_attr_types(left_types, right_types,
                                                 correspondences)
    self.assertEqual(isinstance(validate_table, pd.DataFrame), True)
    expected_names = pd.Series(
        ['ID', 'name', 'birth_year', 'hourly_wage', 'address', 'zipcode'])
    expected_l_types = pd.Series([
        'short string (1 word)',
        'short string (1 word to 5 words)',
        'numeric',
        'numeric',
        'short string (1 word to 5 words)',
        'numeric'
    ])
    expected_r_types = pd.Series([
        'short string (1 word)',
        'short string (1 word to 5 words)',
        'numeric',
        'numeric',
        'medium string (5 words to 10 words)',
        'numeric'
    ])
    expected_features = pd.Series([
        'Levenshtein Distance; Levenshtein Similarity',
        'Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]',
        'Exact Match; Absolute Norm',
        'Exact Match; Absolute Norm',
        'Not Applicable: Types do not match',
        'Exact Match; Absolute Norm'
    ])
    self.assertEqual(
        expected_names.equals(validate_table['Left Attribute']), True)
    self.assertEqual(
        expected_features.equals(validate_table['Example Features']), True)
    self.assertEqual(
        expected_l_types.equals(validate_table['Left Attribute Type']), True)
    self.assertEqual(
        expected_r_types.equals(validate_table['Right Attribute Type']), True)
def get_features_for_matching(ltable, rtable,
                              validate_inferred_attr_types=True):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether
            to show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types
        for ltable and rtable respectively. The variable _match_c contains
        the attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function, points to
        the actual Python function that implements the feature.
        Specifically, the function takes in two tuples (one from each
        input table) and returns a numeric value. The attribute
        is_auto_generated contains either True or False. The flag is True
        only if the feature is automatically generated by
        py_entitymatching. This is important because this flag is used to
        make some assumptions about the semantics of the similarity
        function used and use that information for scaling purposes.

    See Also:
        :meth:`py_entitymatching.get_attr_corres`,
        :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input table A')
    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input table B')
    # # We expect the validate_inferred_attr_types to be of type boolean
    validate_object_type(validate_inferred_attr_types, bool,
                         'Validate inferred attribute type')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Show the user inferred attribute types and features and request
    # user permission to proceed
    if validate_inferred_attr_types:
        # if the user does not want to proceed, then exit the function
        if validate_attr_types(attr_types_ltable, attr_types_rtable,
                               attr_corres) is None:
            return

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
def test_get_attr_types_invalid_df(self):
    # Passing None instead of a DataFrame should trigger the
    # input-validation failure inside get_attr_types.
    result = get_attr_types(None)
def get_features_for_matching(ltable, rtable):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types
        for ltable and rtable respectively. The variable _match_c contains
        the attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function, points to
        the actual Python function that implements the feature.
        Specifically, the function takes in two tuples (one from each
        input table) and returns a numeric value. The attribute
        is_auto_generated contains either True or False. The flag is True
        only if the feature is automatically generated by
        py_entitymatching. This is important because this flag is used to
        make some assumptions about the semantics of the similarity
        function used and use that information for scaling purposes.

    See Also:
        :meth:`py_entitymatching.get_attr_corres`,
        :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError('Input table A is not of type pandas DataFrame')
    # # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError('Input table B is not of type pandas DataFrame')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    # BUG FIX: this previously stored attr_types_ltable, so the global
    # for the right table's attribute types held the LEFT table's types.
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
def test_get_attr_types_valid(self):
    # Smoke test: get_attr_types should run on a valid DataFrame.
    table = read_csv_metadata(path_a)
    attr_types = get_attr_types(table)
def test_get_attr_types_valid(self):
    # get_attr_types accepts a well-formed table without raising.
    frame = read_csv_metadata(path_a)
    inferred = get_attr_types(frame)
def test_get_attr_types_invalid_df(self):
    # None is not a DataFrame; get_attr_types should reject it.
    outcome = get_attr_types(None)