예제 #1
0
 def test_rulebased_matcher_rule_wi_no_auto_gen_feature(self):
     """Rule-based matcher driven by a feature that is not flagged auto-generated.

     Builds a jaccard / 3-gram feature over ``name``, registers it in
     ``self.feature_table`` as 'test' (without setting ``is_auto_generated``
     on the feature dict), adds a single rule on it to ``self.brm`` and
     compares the predictions for ``self.C`` with ``expected_labels_1``.
     """
     name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_functions = get_sim_funs_for_blocking()
     feature = get_feature_fn(name_jaccard, tokenizers, sim_functions)
     add_feature(self.feature_table, 'test', feature)

     # same as rule_1
     rule = ['test(ltuple, rtuple) > 0.4']
     self.brm.add_rule(rule, self.feature_table)

     predicted = self.brm.predict(table=self.C)
     assert_equal(expected_labels_1, predicted)
 def test_rb_block_tables_rule_wi_no_auto_gen_feature(self):
     """Rule-based blocking driven by a feature that is not flagged auto-generated.

     Builds a jaccard / 3-gram feature over ``name``, registers it in
     ``self.feature_table`` as 'test' (without setting ``is_auto_generated``
     on the feature dict), adds a single rule on it to ``self.rb``, blocks
     ``self.A`` against ``self.B`` and validates the candidate set against
     ``expected_ids_1``.
     """
     name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_functions = get_sim_funs_for_blocking()
     feature = get_feature_fn(name_jaccard, tokenizers, sim_functions)
     add_feature(self.feature_table, 'test', feature)

     # same as rule_1
     rule = ['test(ltuple, rtuple) < 0.3']
     self.rb.add_rule(rule, self.feature_table)

     candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
     validate_metadata(candidates)
     validate_data(candidates, expected_ids_1)
예제 #3
0
 def test_rulebased_matcher_rule_wi_overlap_coeff_sim_fn(self):
     """Rule-based matcher with an overlap-coefficient feature and a '> 1' rule.

     Builds an ``overlap_coeff`` / ``dlm_dc0`` feature over ``name``, marks it
     as auto-generated, registers it as 'test' and adds the rule
     ``test(ltuple, rtuple) > 1`` to ``self.brm``. The predictions for
     ``self.C`` are compared with ``expected_labels_all_zeroes``.
     """
     name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_functions = get_sim_funs_for_blocking()
     feature = get_feature_fn(name_overlap, tokenizers, sim_functions)
     feature['is_auto_generated'] = True
     add_feature(self.feature_table, 'test', feature)

     # should return an empty set
     rule = ['test(ltuple, rtuple) > 1']
     self.brm.add_rule(rule, self.feature_table)

     predicted = self.brm.predict(table=self.C)
     assert_equal(expected_labels_all_zeroes, predicted)
 def test_rb_block_tables_rule_wi_overlap_coeff_sim_fn(self):
     """Rule-based blocking with an overlap-coefficient feature and a '<= 1' rule.

     Builds an ``overlap_coeff`` / ``dlm_dc0`` feature over ``name``, marks it
     as auto-generated, registers it as 'test' and adds the rule
     ``test(ltuple, rtuple) <= 1`` to ``self.rb``. The candidate set obtained
     from blocking ``self.A`` against ``self.B`` is validated with the
     default arguments of ``validate_data``.
     """
     name_overlap = "overlap_coeff(dlm_dc0(ltuple['name']), dlm_dc0(rtuple['name']))"
     tokenizers = get_tokenizers_for_blocking()
     sim_functions = get_sim_funs_for_blocking()
     feature = get_feature_fn(name_overlap, tokenizers, sim_functions)
     feature['is_auto_generated'] = True
     add_feature(self.feature_table, 'test', feature)

     # should return an empty set
     rule = ['test(ltuple, rtuple) <= 1']
     self.rb.add_rule(rule, self.feature_table)

     candidates = self.rb.block_tables(self.A, self.B, show_progress=False)
     validate_metadata(candidates)
     validate_data(candidates)
    def test_trigger_rule_wi_no_auto_gen_feature(self):
        """Match trigger driven by a feature that is not flagged auto-generated.

        Builds a jaccard / 3-gram feature over ``name`` and registers it as
        'test' (without setting ``is_auto_generated`` on the feature dict).
        The trigger ``self.mt`` is configured with the rule
        ``test(ltuple, rtuple) > 0.4``, condition status ``False`` and
        action ``0``; it is then executed on ``self.C`` (not in place) and
        the 'neg_trig_labels' column is compared with ``expected_labels_1``.
        """
        name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_functions = get_sim_funs_for_blocking()
        feature = get_feature_fn(name_jaccard, tokenizers, sim_functions)
        add_feature(self.feature_table, 'test', feature)

        # same as rule_1
        rule = ['test(ltuple, rtuple) > 0.4']

        # Configure the trigger: condition rule, condition status, action.
        trigger = self.mt
        trigger.add_cond_rule(rule, self.feature_table)
        trigger.add_cond_status(False)
        trigger.add_action(0)

        triggered = trigger.execute(self.C, 'neg_trig_labels', inplace=False)
        labels = triggered['neg_trig_labels'].tolist()
        assert_equal(expected_labels_1, labels)
    def test_trigger_rule_wi_no_auto_gen_feature(self):
        """Match trigger driven by a feature that is not flagged auto-generated.

        Builds a jaccard / 3-gram feature over ``name`` and registers it as
        'test' (without setting ``is_auto_generated`` on the feature dict).
        The trigger ``self.mt`` is configured with the rule
        ``test(ltuple, rtuple) > 0.4``, condition status ``False`` and
        action ``0``; it is then executed on ``self.C`` (not in place) and
        the 'neg_trig_labels' column is compared with ``expected_labels_1``.
        """
        name_jaccard = "jaccard(qgm_3(ltuple['name']), qgm_3(rtuple['name']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_functions = get_sim_funs_for_blocking()
        feature = get_feature_fn(name_jaccard, tokenizers, sim_functions)
        add_feature(self.feature_table, 'test', feature)

        # same as rule_1
        rule = ['test(ltuple, rtuple) > 0.4']

        # Configure the trigger: condition rule, condition status, action.
        trigger = self.mt
        trigger.add_cond_rule(rule, self.feature_table)
        trigger.add_cond_status(False)
        trigger.add_action(0)

        triggered = trigger.execute(self.C, 'neg_trig_labels', inplace=False)
        labels = triggered['neg_trig_labels'].tolist()
        assert_equal(expected_labels_1, labels)
    def test_trigger_rule_wi_diff_tokenizers(self):
        """Match trigger on a feature whose two sides use different tokenizers.

        The feature applies jaccard to ``qgm_3`` tokens of the left
        ``address`` and ``dlm_dc0`` tokens of the right ``address``; it is
        marked auto-generated and registered as 'test'. The trigger
        ``self.mt`` is configured with the rule ``test(ltuple, rtuple) > 1``,
        condition status ``False`` and action ``0``; it is executed on
        ``self.C`` (not in place) and the 'neg_trig_labels' column is
        compared with ``expected_labels_zeros``.
        """
        address_jaccard = "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_functions = get_sim_funs_for_blocking()
        feature = get_feature_fn(address_jaccard, tokenizers, sim_functions)
        feature['is_auto_generated'] = True
        add_feature(self.feature_table, 'test', feature)

        # should return an empty set
        rule = ['test(ltuple, rtuple) > 1']

        # Configure the trigger: condition rule, condition status, action.
        trigger = self.mt
        trigger.add_cond_rule(rule, self.feature_table)
        trigger.add_cond_status(False)
        trigger.add_action(0)

        triggered = trigger.execute(self.C, 'neg_trig_labels', inplace=False)
        labels = triggered['neg_trig_labels'].tolist()
        assert_equal(expected_labels_zeros, labels)
    def test_trigger_rule_wi_diff_tokenizers(self):
        """Match trigger on a feature whose two sides use different tokenizers.

        The feature applies jaccard to ``qgm_3`` tokens of the left
        ``address`` and ``dlm_dc0`` tokens of the right ``address``; it is
        marked auto-generated and registered as 'test'. The trigger
        ``self.mt`` is configured with the rule ``test(ltuple, rtuple) > 1``,
        condition status ``False`` and action ``0``; it is executed on
        ``self.C`` (not in place) and the 'neg_trig_labels' column is
        compared with ``expected_labels_zeros``.
        """
        address_jaccard = "jaccard(qgm_3(ltuple['address']), dlm_dc0(rtuple['address']))"
        tokenizers = get_tokenizers_for_blocking()
        sim_functions = get_sim_funs_for_blocking()
        feature = get_feature_fn(address_jaccard, tokenizers, sim_functions)
        feature['is_auto_generated'] = True
        add_feature(self.feature_table, 'test', feature)

        # should return an empty set
        rule = ['test(ltuple, rtuple) > 1']

        # Configure the trigger: condition rule, condition status, action.
        trigger = self.mt
        trigger.add_cond_rule(rule, self.feature_table)
        trigger.add_cond_status(False)
        trigger.add_action(0)

        triggered = trigger.execute(self.C, 'neg_trig_labels', inplace=False)
        labels = triggered['neg_trig_labels'].tolist()
        assert_equal(expected_labels_zeros, labels)
 def test_valid_tok_sim_valid(self):
     """Expect ``check_valid_tok_sim`` to return None for ('lev1', 'tok', 'tok').

     NOTE(review): the method name says "valid" while the test expects a
     ``None`` status for the 'lev1' entry -- confirm the intended naming.
     """
     sim_functions = simfuncs.get_sim_funs_for_blocking()
     tokenizers = toks.get_tokenizers_for_blocking()
     candidate = ('lev1', 'tok', 'tok')
     status = afg.check_valid_tok_sim(candidate, sim_functions, tokenizers)
     self.assertEqual(status, None)
예제 #10
0
 def test_valid_tok_sim_valid(self):
     """Expect ``check_valid_tok_sim`` to return None for ('lev1', 'tok', 'tok').

     NOTE(review): the method name says "valid" while the test expects a
     ``None`` status for the 'lev1' entry -- confirm the intended naming.
     """
     sim_functions = simfuncs.get_sim_funs_for_blocking()
     tokenizers = toks.get_tokenizers_for_blocking()
     candidate = ('lev1', 'tok', 'tok')
     status = afg.check_valid_tok_sim(candidate, sim_functions, tokenizers)
     self.assertEqual(status, None)
 def test_get_tokenizers_for_blocking_invalid(self):
     """Call ``get_tokenizers_for_blocking`` with two ``None`` arguments.

     NOTE(review): no assertion or expected-exception marker is visible in
     this snippet -- verify how the expected failure is checked.
     """
     invalid_args = (None, None)
     tok.get_tokenizers_for_blocking(*invalid_args)
 def test_get_tokenizers_for_blocking(self):
     """Check the shape of the blocking tokenizer lookup.

     ``get_tokenizers_for_blocking`` must return a dict, and every value in
     it must be a callable that turns a sample string into a list.
     """
     tokenizers = tok.get_tokenizers_for_blocking()
     # assertIsInstance reports the offending object and type on failure,
     # unlike comparing an isinstance() result with True.
     self.assertIsInstance(tokenizers, dict)

     # Named sample_text rather than `input` so the builtin is not shadowed.
     sample_text = 'data science'
     # Only the tokenizer callables are needed; their names are not used.
     for tokenize in six.itervalues(tokenizers):
         self.assertIsInstance(tokenize(sample_text), list)
def get_features_for_blocking(ltable, rtable):
    """
    Automatically generate features that can be used for blocking.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        Further, this function also sets the following global variables:
        _block_t, _block_s, _atypes1, _atypes2, and _block_c.

        The variable _block_t contains the tokenizers used and _block_s
        contains the similarity functions used for creating features.

        The variables _atypes1 and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _block_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function attribute
        points to the actual Python function that implements the feature.
        Specifically, the function takes in two tuples (one from each input
        table) and returns a numeric value. The attribute is_auto_generated
        contains either True or False. The flag is True only if the feature
        is automatically generated by py_entitymatching. This is important
        because this flag is used to make some assumptions about the
        semantics of the similarity function used and use that information
        for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_blocking`
     :meth:`py_entitymatching.get_tokenizers_for_blocking`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError('Input table A is not of type pandas DataFrame')

    # # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B is not of type pandas dataframe')
        raise AssertionError('Input table B is not of type pandas dataframe')

    # Get the similarity functions to be used for blocking
    sim_funcs = sim.get_sim_funs_for_blocking()
    # Get the tokenizers to be used for blocking
    tok_funcs = tok.get_tokenizers_for_blocking()

    # Get the attr. types for ltable and rtable
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)
    # Get the attr. correspondences between ltable and rtable
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Get features based on attr types, attr correspondences, sim functions
    # and tok. functions
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres, tok_funcs,
                                 sim_funcs)

    # Export important variables to global name space.
    # The blocking tokenizers belong in _block_t (as documented above);
    # writing them to _match_t would leave _block_t unset and overwrite the
    # tokenizers global used for matching.
    em._block_t = tok_funcs
    em._block_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._block_c = attr_corres

    # Return the feature table
    return feature_table
def get_features_for_blocking(ltable, rtable, validate_inferred_attr_types=True):
    """
    Automatically generate features that can be used for blocking.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to
            show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features, or
        None when `validate_inferred_attr_types` is True and the attribute
        type validation step returns None (the user chose not to proceed).

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.

        Further, when a feature table is returned, this function also sets
        the following global variables:
        _block_t, _block_s, _atypes1, _atypes2, and _block_c.

        The variable _block_t contains the tokenizers used and _block_s
        contains the similarity functions used for creating features.

        The variables _atypes1 and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _block_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function, and (2) is_auto_generated. The function attribute
        points to the actual Python function that implements the feature.
        Specifically, the function takes in two tuples (one from each input
        table) and returns a numeric value. The attribute is_auto_generated
        contains either True or False. The flag is True only if the feature
        is automatically generated by py_entitymatching. This is important
        because this flag is used to make some assumptions about the
        semantics of the similarity function used and use that information
        for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_blocking`
     :meth:`py_entitymatching.get_tokenizers_for_blocking`

    """
    # Type-check the arguments: two DataFrames and a boolean flag.
    validate_object_type(ltable, pd.DataFrame, 'Input table A')
    validate_object_type(rtable, pd.DataFrame, 'Input table B')
    validate_object_type(validate_inferred_attr_types, bool, 'Validate inferred attribute type')

    # Similarity functions and tokenizers used for blocking.
    similarity_functions = sim.get_sim_funs_for_blocking()
    tokenizers = tok.get_tokenizers_for_blocking()

    # Attribute types of each table and the correspondences between them.
    ltable_attr_types = au.get_attr_types(ltable)
    rtable_attr_types = au.get_attr_types(rtable)
    correspondences = au.get_attr_corres(ltable, rtable)

    # When requested, show the inferred types to the user; a None result
    # means the user does not want to proceed, so stop without generating
    # features or touching the globals.
    if validate_inferred_attr_types and validate_attr_types(
            ltable_attr_types, rtable_attr_types, correspondences) is None:
        return None

    # Build the features from the attribute types, the correspondences,
    # the tokenizers and the similarity functions.
    features = get_features(ltable, rtable, ltable_attr_types,
                            rtable_attr_types, correspondences,
                            tokenizers, similarity_functions)

    # Publish the inputs used for feature generation in the package
    # namespace.
    em._block_t = tokenizers
    em._block_s = similarity_functions
    em._atypes1 = ltable_attr_types
    em._atypes2 = rtable_attr_types
    em._block_c = correspondences

    return features