def test_rulebased_matcher_rule_wi_no_auto_gen_feature(self):
    """Run the rule-based matcher with a feature built from a feature string.

    A jaccard feature over the qgm_3 tokens of 'name' is created with
    ``get_feature_fn``, registered in ``self.feature_table`` under the name
    'test', and used in a single rule added to ``self.brm``. The predictions
    for ``self.C`` must equal ``expected_labels_1``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    add_feature(self.feature_table, 'test', feature)

    # same as rule_1
    rule = ['test(ltuple, rtuple) > 0.4']
    self.brm.add_rule(rule, self.feature_table)

    predicted_labels = self.brm.predict(table=self.C)
    assert_equal(expected_labels_1, predicted_labels)
def test_rb_block_tables_rule_wi_no_auto_gen_feature(self):
    """Block two tables with a rule over a feature built from a feature string.

    A jaccard feature over the qgm_3 tokens of 'name' is created with
    ``get_feature_fn``, registered in ``self.feature_table`` as 'test', and
    used in a single rule added to ``self.rb``. The candidate set produced by
    ``block_tables`` is checked with ``validate_metadata`` and compared
    against ``expected_ids_1`` through ``validate_data``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    add_feature(self.feature_table, 'test', feature)

    # same as rule_1
    rule = ['test(ltuple, rtuple) < 0.3']
    self.rb.add_rule(rule, self.feature_table)

    candset = self.rb.block_tables(self.A, self.B, show_progress=False)
    validate_metadata(candset)
    validate_data(candset, expected_ids_1)
def test_rulebased_matcher_rule_wi_overlap_coeff_sim_fn(self):
    """Run the rule-based matcher with an overlap_coeff feature.

    The feature compares the dlm_dc0 tokens of 'name' with overlap_coeff,
    is explicitly flagged as auto-generated, and is registered in
    ``self.feature_table`` as 'test'. The predictions for ``self.C`` must
    equal ``expected_labels_all_zeroes``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    # Mark the hand-built feature as if the library had generated it.
    feature['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature)

    # The rule is expected to hold for no pair (threshold above 1).
    rule = ['test(ltuple, rtuple) > 1']
    self.brm.add_rule(rule, self.feature_table)

    predicted_labels = self.brm.predict(table=self.C)
    assert_equal(expected_labels_all_zeroes, predicted_labels)
def test_rb_block_tables_rule_wi_overlap_coeff_sim_fn(self):
    """Block two tables with a rule over an overlap_coeff feature.

    The feature compares the dlm_dc0 tokens of 'name' with overlap_coeff,
    is explicitly flagged as auto-generated, and is registered in
    ``self.feature_table`` as 'test'. The candidate set from ``block_tables``
    is checked with ``validate_metadata`` and with ``validate_data`` called
    without an explicit list of expected ids.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    # Mark the hand-built feature as if the library had generated it.
    feature['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature)

    # should return an empty set
    rule = ['test(ltuple, rtuple) <= 1']
    self.rb.add_rule(rule, self.feature_table)

    candset = self.rb.block_tables(self.A, self.B, show_progress=False)
    validate_metadata(candset)
    validate_data(candset)
def test_trigger_rule_wi_no_auto_gen_feature(self):
    """Execute a match trigger whose condition uses a feature-string feature.

    A jaccard feature over the qgm_3 tokens of 'name' is registered in
    ``self.feature_table`` as 'test' and used as the condition rule of
    ``self.mt``. The trigger is configured with condition status ``False``
    and action ``0``, then executed on ``self.C`` without modifying it in
    place. The resulting 'neg_trig_labels' column must equal
    ``expected_labels_1``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    add_feature(self.feature_table, 'test', feature)

    # same as rule_1
    cond_rule = ['test(ltuple, rtuple) > 0.4']
    self.mt.add_cond_rule(cond_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)

    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    assert_equal(expected_labels_1, triggered[label_column].tolist())
def test_trigger_rule_wi_no_auto_gen_feature(self):
    """Execute a match trigger whose condition uses a feature-string feature.

    A jaccard feature over the qgm_3 tokens of 'name' is registered in
    ``self.feature_table`` as 'test' and used as the condition rule of
    ``self.mt``. The trigger is configured with condition status ``False``
    and action ``0``, then executed on ``self.C`` without modifying it in
    place. The resulting 'neg_trig_labels' column must equal
    ``expected_labels_1``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    add_feature(self.feature_table, 'test', feature)

    # same as rule_1
    cond_rule = ['test(ltuple, rtuple) > 0.4']
    self.mt.add_cond_rule(cond_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)

    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    assert_equal(expected_labels_1, triggered[label_column].tolist())
def test_trigger_rule_wi_diff_tokenizers(self):
    """Execute a match trigger whose feature tokenizes each side differently.

    The feature applies jaccard to the qgm_3 tokens of the left 'address'
    and the dlm_dc0 tokens of the right 'address'. It is flagged as
    auto-generated and registered in ``self.feature_table`` as 'test'. The
    trigger ``self.mt`` is configured with condition status ``False`` and
    action ``0``, then executed on ``self.C`` without modifying it in place.
    The resulting 'neg_trig_labels' column must equal
    ``expected_labels_zeros``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    # Mark the hand-built feature as if the library had generated it.
    feature['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature)

    # The rule is expected to hold for no pair (threshold above 1).
    cond_rule = ['test(ltuple, rtuple) > 1']
    self.mt.add_cond_rule(cond_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)

    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    assert_equal(expected_labels_zeros, triggered[label_column].tolist())
def test_trigger_rule_wi_diff_tokenizers(self):
    """Execute a match trigger whose feature tokenizes each side differently.

    The feature applies jaccard to the qgm_3 tokens of the left 'address'
    and the dlm_dc0 tokens of the right 'address'. It is flagged as
    auto-generated and registered in ``self.feature_table`` as 'test'. The
    trigger ``self.mt`` is configured with condition status ``False`` and
    action ``0``, then executed on ``self.C`` without modifying it in place.
    The resulting 'neg_trig_labels' column must equal
    ``expected_labels_zeros``.
    """
    blocking_tokenizers = get_tokenizers_for_blocking()
    blocking_sim_funs = get_sim_funs_for_blocking()
    feature = get_feature_fn(
        "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))",
        blocking_tokenizers,
        blocking_sim_funs,
    )
    # Mark the hand-built feature as if the library had generated it.
    feature['is_auto_generated'] = True
    add_feature(self.feature_table, 'test', feature)

    # The rule is expected to hold for no pair (threshold above 1).
    cond_rule = ['test(ltuple, rtuple) > 1']
    self.mt.add_cond_rule(cond_rule, self.feature_table)
    self.mt.add_cond_status(False)
    self.mt.add_action(0)

    label_column = 'neg_trig_labels'
    triggered = self.mt.execute(self.C, label_column, inplace=False)
    assert_equal(expected_labels_zeros, triggered[label_column].tolist())
def test_valid_tok_sim_valid(self):
    """Check ``afg.check_valid_tok_sim`` for the triple ('lev1', 'tok', 'tok').

    The blocking similarity functions and tokenizers are fetched and passed
    to ``check_valid_tok_sim`` together with the triple; the result is
    required to be ``None``.
    """
    sim = simfuncs.get_sim_funs_for_blocking()
    tok = toks.get_tokenizers_for_blocking()
    status = afg.check_valid_tok_sim(('lev1', 'tok', 'tok'), sim, tok)
    # Identity check: only the None singleton passes, and a failure
    # reports the unexpected value.
    self.assertIsNone(status)
def test_valid_tok_sim_valid(self):
    """Check ``afg.check_valid_tok_sim`` for the triple ('lev1', 'tok', 'tok').

    The blocking similarity functions and tokenizers are fetched and passed
    to ``check_valid_tok_sim`` together with the triple; the result is
    required to be ``None``.
    """
    sim = simfuncs.get_sim_funs_for_blocking()
    tok = toks.get_tokenizers_for_blocking()
    status = afg.check_valid_tok_sim(('lev1', 'tok', 'tok'), sim, tok)
    # Identity check: only the None singleton passes, and a failure
    # reports the unexpected value.
    self.assertIsNone(status)
def test_get_tokenizers_for_blocking_invalid(self):
    """Call ``tok.get_tokenizers_for_blocking`` with ``None`` for both arguments.

    The body makes no assertion of its own.
    NOTE(review): presumably an expected-exception decorator is applied
    outside the visible span -- confirm.
    """
    invalid_args = (None, None)
    tok.get_tokenizers_for_blocking(*invalid_args)
def test_get_tokenizers_for_blocking(self):
    """Check the default result of ``tok.get_tokenizers_for_blocking``.

    The result must be a dict, and every value in it must be callable on a
    sample string and return a list.
    """
    tokenizers = tok.get_tokenizers_for_blocking()
    self.assertIsInstance(tokenizers, dict)

    # Named so that the builtin ``input`` is not shadowed.
    sample_text = 'data science'
    # Only the values are exercised, so iterate them directly.
    for tokenizer in six.itervalues(tokenizers):
        self.assertIsInstance(tokenizer(sample_text), list)
def get_features_for_blocking(ltable, rtable):
    """
    This function automatically generates features that can be used for
    blocking purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        Further, this function also sets the following global variables:
        _block_t, _block_s, _atypes1, _atypes2, and _block_c.

        The variable _block_t contains the tokenizers used and _block_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _block_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function, points to the
        actual Python function that implements the feature. Specifically,
        the function takes in two tuples (one from each input table) and
        returns a numeric value. The attribute is_auto_generated contains
        either True or False. The flag is True only if the feature is
        automatically generated by py_entitymatching. This is important
        because this flag is used to make some assumptions about the
        semantics of the similarity function used and use that information
        for scaling purposes.

    See Also:
        :meth:`py_entitymatching.get_attr_corres`,
        :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_blocking`
        :meth:`py_entitymatching.get_tokenizers_for_blocking`
    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError('Input table A is not of type pandas DataFrame')

    # # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B is not of type pandas dataframe')
        raise AssertionError('Input table B is not of type pandas dataframe')

    # Get the similarity functions to be used for blocking
    sim_funcs = sim.get_sim_funs_for_blocking()
    # Get the tokenizers to be used for blocking
    tok_funcs = tok.get_tokenizers_for_blocking()

    # Get the attr. types for ltable and rtable
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attr. correspondences between ltable and rtable
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Get features based on attr types, attr correspondences, sim functions
    # and tok. functions
    feature_table = get_features(ltable, rtable,
                                 attr_types_ltable, attr_types_rtable,
                                 attr_corres, tok_funcs, sim_funcs)

    # Export important variables to global name space.
    # The tokenizers go to _block_t (not _match_t): these are the blocking
    # tokenizers, and the matching global must not be overwritten here.
    em._block_t = tok_funcs
    em._block_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._block_c = attr_corres

    # Return the feature table
    return feature_table
def get_features_for_blocking(ltable, rtable, validate_inferred_attr_types=True):
    """
    Automatically generate features that can be used for blocking.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether
            to show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features, or
        None when `validate_inferred_attr_types` is True and the attribute
        type validation step returns None (the user chose not to proceed).

        The DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        When a feature table is returned, the following global variables
        are also set: _block_t, _block_s, _atypes1, _atypes2, and _block_c.

        The variable _block_t contains the tokenizers used and _block_s
        contains the similarity functions used for creating features.

        The variables _atypes1, and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _block_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function points to the
        actual Python function that implements the feature. Specifically,
        the function takes in two tuples (one from each input table) and
        returns a numeric value. The attribute is_auto_generated contains
        either True or False. The flag is True only if the feature is
        automatically generated by py_entitymatching. This is important
        because this flag is used to make some assumptions about the
        semantics of the similarity function used and use that information
        for scaling purposes.

    See Also:
        :meth:`py_entitymatching.get_attr_corres`,
        :meth:`py_entitymatching.get_attr_types`,
        :meth:`py_entitymatching.get_sim_funs_for_blocking`
        :meth:`py_entitymatching.get_tokenizers_for_blocking`
    """
    # Argument checks: both tables must be DataFrames, the flag a boolean.
    validate_object_type(ltable, pd.DataFrame, 'Input table A')
    validate_object_type(rtable, pd.DataFrame, 'Input table B')
    validate_object_type(validate_inferred_attr_types, bool,
                         'Validate inferred attribute type')

    # Similarity functions and tokenizers used for blocking.
    blocking_sim_funcs = sim.get_sim_funs_for_blocking()
    blocking_tokenizers = tok.get_tokenizers_for_blocking()

    # Attribute types of each table and the correspondences between them.
    ltable_attr_types = au.get_attr_types(ltable)
    rtable_attr_types = au.get_attr_types(rtable)
    correspondences = au.get_attr_corres(ltable, rtable)

    # Show the inferred types only when requested; a None result from the
    # validation step means the user does not want to proceed.
    if validate_inferred_attr_types and validate_attr_types(
            ltable_attr_types, rtable_attr_types, correspondences) is None:
        return None

    # Build the features from the types, correspondences, tokenizers and
    # similarity functions.
    features = get_features(ltable, rtable,
                            ltable_attr_types, rtable_attr_types,
                            correspondences,
                            blocking_tokenizers, blocking_sim_funcs)

    # Publish the intermediate results in the package's global name space.
    em._block_t = blocking_tokenizers
    em._block_s = blocking_sim_funcs
    em._atypes1 = ltable_attr_types
    em._atypes2 = rtable_attr_types
    em._block_c = correspondences

    return features