def is_conjunct_filterable(self, conjunct, rule_name):
    # a conjunct is filterable only if its feature was auto-generated, it
    # uses the same tokenizer on both attributes, and it pairs a filterable
    # sim function (jaccard, cosine, dice, ...) with an allowed operator
    # (<, <=); lev_dist is a distance, so for it only > and >= qualify
    is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = parse_conjunct(
        conjunct, self.rule_ft[rule_name])
    if not is_auto_gen:
        # conjunct not filterable as the feature is not auto-generated
        return False
    if sim_fn == 'lev_dist':
        # conjunct filterable only with the distance operators > and >=
        return op in ('>', '>=')
    if l_tok != r_tok:
        # conjunct not filterable because left and right tokenizers mismatch
        return False
    if sim_fn not in self.filterable_sim_fns:
        # conjunct not filterable due to unsupported sim function
        return False
    if op not in self.allowed_ops:
        # conjunct not filterable due to unsupported operator
        return False
    # conjunct is filterable
    return True
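
# A minimal standalone sketch (not library code) of the same filterability
# decision, assuming parse_conjunct has already yielded sim_fn and op, and
# that filterable_sim_fns / allowed_ops hold the values named above.
def sketch_is_filterable(sim_fn, op, same_tokenizer=True):
    filterable_sim_fns = {'jaccard', 'cosine', 'dice', 'overlap_coeff'}
    allowed_ops = {'<', '<='}
    if sim_fn == 'lev_dist':
        # lev_dist is a distance, so blocking rules on it use > / >=
        return op in ('>', '>=')
    return same_tokenizer and sim_fn in filterable_sim_fns and op in allowed_ops

assert sketch_is_filterable('jaccard', '<')      # block pairs with jaccard < t
assert sketch_is_filterable('lev_dist', '>')     # block pairs with distance > t
assert not sketch_is_filterable('jaccard', '>')  # not filterable
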
    def apply_filterable_rule(self, rule_name, l_df, r_df, l_key, r_key,
                              l_output_attrs, r_output_attrs, l_output_prefix,
                              r_output_prefix, verbose, show_progress,
                              n_chunks):
        candset = None
        conjunct_list = self.rule_str[rule_name]
        for conjunct in conjunct_list:
            is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = parse_conjunct(
                conjunct, self.rule_ft[rule_name])

            # map the tokenizer code in the feature name to a tokenizer object
            if l_tok == 'dlm_dc0':
                tokenizer = WhitespaceTokenizer(return_set=True)
            elif l_tok == 'qgm_3':
                tokenizer = QgramTokenizer(qval=3, return_set=True)

            # map the sim function to the corresponding py_stringsimjoin join
            if sim_fn == 'jaccard':
                join_fn = ssj.jaccard_join
            elif sim_fn == 'cosine':
                join_fn = ssj.cosine_join
            elif sim_fn == 'dice':
                join_fn = ssj.dice_join
            elif sim_fn == 'overlap_coeff':
                join_fn = ssj.overlap_coefficient_join
            elif sim_fn == 'lev_dist':
                join_fn = ssj.edit_distance_join

            # a blocking rule drops pairs that satisfy it, while a join keeps
            # pairs that satisfy it, so pass the join the complemented operator
            if join_fn == ssj.edit_distance_join:
                # distance: drop if dist > t (or >= t), keep if dist <= t (or < t)
                comp_op = '<='
                if op == '>=':
                    comp_op = '<'
            else:
                # similarity: drop if sim < t (or <= t), keep if sim >= t (or > t)
                comp_op = '>='
                if op == '<=':
                    comp_op = '>'

            # the join attributes must be of string type
            ssj.dataframe_column_to_str(l_df, l_attr, inplace=True)
            ssj.dataframe_column_to_str(r_df, r_attr, inplace=True)

            if join_fn == ssj.edit_distance_join:
                c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr,
                               float(th), comp_op, True, l_output_attrs,
                               r_output_attrs, l_output_prefix,
                               r_output_prefix, False, n_chunks, show_progress)
            else:
                c_df = join_fn(l_df, r_df,
                               l_key, r_key, l_attr, r_attr, tokenizer,
                               float(th), comp_op, True, True, l_output_attrs,
                               r_output_attrs, l_output_prefix,
                               r_output_prefix, False, n_chunks, show_progress)
            if candset is not None:
                # union the candset of this conjunct with the existing candset
                candset = pd.concat([candset, c_df]).drop_duplicates(
                    [l_output_prefix + l_key,
                     r_output_prefix + r_key]).reset_index(drop=True)
            else:
                # candset from the first conjunct of the rule
                candset = c_df
        return candset
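
# Sketch of the operator complementing above as a hypothetical standalone
# helper: the blocking rule says which pairs to DROP, while the
# string-similarity join says which pairs to KEEP.
def negate_blocking_op(sim_fn, op):
    if sim_fn == 'lev_dist':
        # drop if distance > t (or >= t)  ->  keep if distance <= t (or < t)
        return '<' if op == '>=' else '<='
    # drop if similarity < t (or <= t)  ->  keep if similarity >= t (or > t)
    return '>' if op == '<=' else '>='

assert negate_blocking_op('jaccard', '<') == '>='
assert negate_blocking_op('jaccard', '<=') == '>'
assert negate_blocking_op('lev_dist', '>') == '<='
assert negate_blocking_op('lev_dist', '>=') == '<'
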
def get_attrs_to_project(self, l_key, r_key, l_output_attrs,
                         r_output_attrs):
    # project each table down to its key, the requested output attributes,
    # and every attribute referenced by a rule conjunct, without duplicates
    l_proj_attrs = [l_key]
    r_proj_attrs = [r_key]
    if l_output_attrs:
        l_proj_attrs.extend(
            [c for c in l_output_attrs if c not in l_proj_attrs])
    if r_output_attrs:
        r_proj_attrs.extend(
            [c for c in r_output_attrs if c not in r_proj_attrs])
    for rule_name, conjunct_list in six.iteritems(self.rule_str):
        for conjunct in conjunct_list:
            is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = parse_conjunct(
                conjunct, self.rule_ft[rule_name])
            if l_attr not in l_proj_attrs:
                l_proj_attrs.append(l_attr)
            if r_attr not in r_proj_attrs:
                r_proj_attrs.append(r_attr)
    return l_proj_attrs, r_proj_attrs
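
# Toy walk-through (hypothetical attribute names) of the projection order
# above: key first, then output attributes, then rule attributes, each kept
# once in first-seen order.
l_proj_attrs = ['id']                                   # l_key
for attr in ['name', 'address'] + ['name', 'zipcode']:  # output attrs + rule attrs
    if attr not in l_proj_attrs:
        l_proj_attrs.append(attr)
assert l_proj_attrs == ['id', 'name', 'address', 'zipcode']
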
    def execute(self, input_table, label_column, inplace=True, verbose=False):
        """ Executes the rules of the match trigger for a table of matcher
            results.

            Args:
                input_table (DataFrame): The input table of type pandas DataFrame
                    containing tuple pairs and labels from matching.
                label_column (string): The attribute name where the predictions
                    are stored in the input table.
                inplace (boolean): A flag to indicate whether the updates should be
                    done inplace, i.e. in the input table itself (defaults to True).
                verbose (boolean): A flag to indicate whether the debug information
                    should be logged (defaults to False).

            Returns:
                A DataFrame with predictions updated.

            Examples:
                >>> import py_entitymatching as em
                >>> mt = em.MatchTrigger()
                >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
                >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
                >>> match_f = em.get_features_for_matching(A, B)
                >>> rule = ['title_title_lev_sim(ltuple, rtuple) > 0.7']
                >>> mt.add_cond_rule(rule, match_f)
                >>> mt.add_cond_status(True)
                >>> mt.add_action(1)
                >>> # The table H is a table with prediction labels generated from matching
                >>> mt.execute(input_table=H, label_column='predicted_labels', inplace=False)

        """

        # Validate input parameters
        # # We expect the table to be of type pandas DataFrame
        validate_object_type(input_table, pd.DataFrame, 'Input table')

        # # We expect label_column to be of type string
        if label_column is not None and not isinstance(label_column, str):
            logger.error('Input label_column must be a string.')
            raise AssertionError('Input label_column must be a string.')

        # # We expect the inplace to be of type boolean
        validate_object_type(inplace, bool, 'Input inplace')

        # # We expect the verbose to be of type boolean
        validate_object_type(verbose, bool, 'Input verbose')

        # Validate that there are some rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            input_table, logger, verbose)
        # # validate metadata
        cm._validate_metadata_for_candset(input_table, key, fk_ltable,
                                          fk_rtable, ltable, rtable, l_key,
                                          r_key, logger, verbose)

        assert ltable is not None, 'Left table is not set'
        assert rtable is not None, 'Right table is not set'
        assert label_column in input_table.columns, 'Label column not in the input table'

        # Parse conjuncts to validate that the features are in the feature table
        for rule in self.rule_conjunct_list:
            for conjunct in self.rule_conjunct_list[rule]:
                parse_conjunct(conjunct, self.rule_ft[rule])

        if not inplace:
            table = input_table.copy()
        else:
            table = input_table

        # set the index and store it in l_tbl/r_tbl
        l_tbl = ltable.set_index(l_key, drop=False)
        r_tbl = rtable.set_index(r_key, drop=False)

        column_names = list(input_table.columns)
        lid_idx = column_names.index(fk_ltable)
        rid_idx = column_names.index(fk_rtable)

        label_idx = column_names.index(label_column)
        for idx, row in enumerate(input_table.itertuples(index=False)):
            if row[label_idx] != self.value_to_set:
                l_row = l_tbl.loc[row[lid_idx]]
                r_row = r_tbl.loc[row[rid_idx]]
                res = self.apply_rules(l_row, r_row)
                if res == self.cond_status:
                    table.iat[idx, label_idx] = self.value_to_set
        return table
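
# Self-contained sketch of the update loop in execute(); the table H, its
# columns, and the fired-rule test are hypothetical stand-ins. iat is
# positional, so the row order of the candidate set is what matters.
import pandas as pd

H = pd.DataFrame({'ltable_id': [1, 2], 'rtable_id': [7, 9],
                  'predicted_labels': [0, 0]})
label_idx = list(H.columns).index('predicted_labels')
for idx, row in enumerate(H.itertuples(index=False)):
    rules_fired = (row.ltable_id == 2)   # stand-in for self.apply_rules(l_row, r_row)
    if rules_fired:                      # and matches self.cond_status
        H.iat[idx, label_idx] = 1        # self.value_to_set
assert list(H['predicted_labels']) == [0, 1]
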
    def predict(self, table=None, target_attr=None, append=False, inplace=True):
        """Predict interface for the matcher.

            Note that table and target_attr default to None, while append
            defaults to False and inplace defaults to True.

            Args:
                table (DataFrame): The input candidate set of type pandas DataFrame
                    containing tuple pairs (defaults to None).
                target_attr (string): The attribute name where the predictions
                    need to be stored in the input table (defaults to None).
                append (boolean): A flag to indicate whether the predictions need
                    to be appended in the input DataFrame (defaults to False).
                inplace (boolean): A flag to indicate whether the append needs to be
                    done inplace (defaults to True).

            Returns:
                An array of predictions or a DataFrame with predictions updated.

            Examples:
                >>> import py_entitymatching as em
                >>> brm = em.BooleanRuleMatcher()
                >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
                >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
                >>> match_f = em.get_features_for_matching(A, B)
                >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
                >>> brm.add_rule(rule, match_f)
                >>> # The table S is a cand set generated by the blocking and then labeling phases
                >>> brm.predict(S, target_attr='pred_label', append=True)

        """

        # Validate input parameters
        # # We expect the table to be of type pandas DataFrame
        validate_object_type(table, pd.DataFrame, 'Input table')

        # # We expect the target_attr to be of type string if not None
        if target_attr is not None and not isinstance(target_attr, str):
            logger.error('Input target_attr must be a string.')
            raise AssertionError('Input target_attr must be a string.')

        # # We expect the append to be of type boolean
        validate_object_type(append, bool, 'Input append')

        # # We expect the inplace to be of type boolean
        validate_object_type(inplace, bool, 'Input inplace')

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            table, logger, False)

        # # validate metadata
        cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, False)

        # Validate that there are some rules
        assert len(self.rules.keys()) > 0, 'There are no rules to apply'

        # Parse conjuncts to validate that the features are in the feature table
        for rule in self.rule_conjunct_list:
            for conjunct in self.rule_conjunct_list[rule]:
                parse_conjunct(conjunct, self.rule_ft[rule])

        if table is not None:
            y = self._predict_candset(table)
            if target_attr is not None and append is True:
                if inplace:
                    table[target_attr] = y
                    return table
                else:
                    tbl = table.copy()
                    tbl[target_attr] = y
                    return tbl
            else:
                return y
        else:
            raise SyntaxError(
                'The arguments supplied do not match the signatures supported!')
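
# Usage sketch of the append/inplace combinations handled above; brm, S and
# 'pred' are the hypothetical matcher, candidate set and column from the
# docstring example.
# y = brm.predict(S)                                   # predictions only
# S = brm.predict(S, target_attr='pred', append=True)  # written into S itself
# S2 = brm.predict(S, target_attr='pred', append=True,
#                  inplace=False)                      # S untouched; copy returned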