示例#1
0
class PrefixFilterTestCase(unittest.TestCase):
    """Exercise PrefixFilter.apply_filter and PrefixFilter.find_candidates.

    The fixtures A, A_tokenized, tok and token_ordering are module-level
    names defined outside this class.
    """

    def setUp(self):
        # Each test gets a freshly constructed filter with its index built.
        self.prefix_filter = PrefixFilter(
            A, A_tokenized, 'str', tok, 0.8, token_ordering)
        self.prefix_filter.build_index()

    def _passes(self, left, right):
        """Run apply_filter on two already-ordered token lists."""
        return self.prefix_filter.apply_filter(
            left, right, len(left), len(right), 0.8)

    def _candidates_for(self, raw_tokens):
        """Order raw_tokens and return find_candidates' result for them."""
        probe = order_using_token_ordering(raw_tokens, token_ordering)
        return self.prefix_filter.find_candidates(probe, len(probe), 0.8)

    def test_apply_filter(self):
        # Cases where the pair is expected to pass the prefix filter.
        left = order_using_token_ordering(
            ['aa', 'bb', 'cd', 'ef', 'fg'], token_ordering)
        right = order_using_token_ordering(
            ['fg', 'cd', 'aa'], token_ordering)
        self.assertTrue(self._passes(left, right))

        left = order_using_token_ordering(
            ['aa', 'bb', 'cd', 'ef', 'fg'], token_ordering)
        right = order_using_token_ordering(['aa'], token_ordering)
        self.assertTrue(self._passes(left, right))

        # Case where the pair is expected to be rejected.
        left = order_using_token_ordering(
            ['aa', 'bb', 'cd', 'ef', 'fg'], token_ordering)
        right = order_using_token_ordering(['fg'], token_ordering)
        self.assertFalse(self._passes(left, right))

        # An empty token list on either side is expected to be rejected.
        left = order_using_token_ordering(
            ['aa', 'bb', 'cd', 'ef', 'fg'], token_ordering)
        right = order_using_token_ordering([], token_ordering)
        self.assertFalse(self._passes(left, right))
        self.assertFalse(self._passes(right, left))

    def test_find_candidates(self):
        # Probe tokens for which candidates are expected.
        self.assertSetEqual(
            self._candidates_for(['aa', 'ef', 'lp']), {0, 3})

        # Probe tokens for which no candidate is expected.
        self.assertSetEqual(
            self._candidates_for(['op', 'lp', 'mp']), set())

        # Empty probe: no candidate is expected.
        self.assertSetEqual(self._candidates_for([]), set())
def _edit_distance_join_split(ltable_list, rtable_list,
                              l_columns, r_columns,
                              l_key_attr, r_key_attr,
                              l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op,
                              l_out_attrs, r_out_attrs,
                              l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    Args:
        ltable_list, rtable_list: sequences of rows; each row is indexed
            by column position.
        l_columns, r_columns: column-name lists used to look up positions
            with ``.index``.
        l_key_attr, r_key_attr: key column names.
        l_join_attr, r_join_attr: join column names; ``len()`` is taken
            of the join values.
        tokenizer: object providing ``tokenize``; also handed to the
            prefix index and prefix filter.
        threshold: bound used by the length filter and passed to the
            comparison function together with the computed distance.
        comp_op: key into ``COMP_OP_MAP`` selecting the comparison.
        l_out_attrs, r_out_attrs: extra output columns, or None.
        l_out_prefix, r_out_prefix: forwarded to
            ``get_output_header_from_tables``.
        out_sim_score: when truthy, the computed distance is appended to
            every output row and "_sim_score" to the header.
        show_progress: when truthy, a pyprind progress bar is advanced
            once per rtable row.

    Returns:
        A pandas DataFrame built from the matching rows.
    """
    # Column positions in ltable.
    l_key_pos = l_columns.index(l_key_attr)
    l_join_pos = l_columns.index(l_join_attr)
    l_out_pos = find_output_attribute_indices(l_columns, l_out_attrs)

    # Column positions in rtable.
    r_key_pos = r_columns.index(r_key_attr)
    r_join_pos = r_columns.index(r_join_attr)
    r_out_pos = find_output_attribute_indices(r_columns, r_out_attrs)

    measure = 'EDIT_DISTANCE'

    # Token ordering derived from the join columns of both tables.
    ordering = gen_token_ordering_for_tables(
        [ltable_list, rtable_list],
        [l_join_pos, r_join_pos],
        tokenizer, measure)

    # Lengths of the left join values, computed once for the length filter.
    l_lengths = [len(l_row[l_join_pos]) for l_row in ltable_list]

    # Prefix index over the left join column.
    index = PrefixIndex(ltable_list, l_join_pos,
                        tokenizer, measure, threshold,
                        ordering)
    index.build(False)

    candidate_filter = PrefixFilter(tokenizer, measure, threshold)

    compare = COMP_OP_MAP[comp_op]
    distance = get_sim_function(measure)

    matches = []
    wants_extra_attrs = (l_out_attrs is not None or
                         r_out_attrs is not None)

    progress = None
    if show_progress:
        progress = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_value = r_row[r_join_pos]
        r_len = len(r_value)

        probe_tokens = order_using_token_ordering(
            tokenizer.tokenize(r_value), ordering)

        # Candidates are used as row positions into ltable_list.
        for l_pos in candidate_filter.find_candidates(probe_tokens, index):
            # Length filter: skip when the lengths differ by more than
            # threshold.
            if not (r_len - threshold <= l_lengths[l_pos]
                    <= r_len + threshold):
                continue

            l_row = ltable_list[l_pos]

            # Actual distance between the two join values.
            dist = distance(l_row[l_join_pos], r_value)
            if not compare(dist, threshold):
                continue

            if wants_extra_attrs:
                record = get_output_row_from_tables(
                    l_row, r_row,
                    l_key_pos, r_key_pos,
                    l_out_pos, r_out_pos)
            else:
                record = [l_row[l_key_pos], r_row[r_key_pos]]

            # Optionally carry the distance along in the output record.
            if out_sim_score:
                record.append(dist)

            matches.append(record)

        if progress is not None:
            progress.update()

    header = get_output_header_from_tables(
        l_key_attr, r_key_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        header.append("_sim_score")

    # Materialize the collected rows as a dataframe.
    return pd.DataFrame(matches, columns=header)
示例#3
0
def _edit_distance_join_split(ltable_list, rtable_list, l_columns, r_columns,
                              l_key_attr, r_key_attr, l_join_attr, r_join_attr,
                              tokenizer, threshold, comp_op, l_out_attrs,
                              r_out_attrs, l_out_prefix, r_out_prefix,
                              out_sim_score, show_progress):
    """Perform edit distance join for a split of ltable and rtable.

    Every row of ``rtable_list`` is probed against a prefix index built on
    the join column of ``ltable_list``. Candidates that survive a length
    check have their distance computed with the 'EDIT_DISTANCE' function
    from ``get_sim_function`` and are kept when
    ``COMP_OP_MAP[comp_op](distance, threshold)`` is truthy.

    Returns a pandas DataFrame whose columns come from
    ``get_output_header_from_tables``, followed by "_sim_score" when
    ``out_sim_score`` is truthy. When both ``l_out_attrs`` and
    ``r_out_attrs`` are None, each row holds only the two key values
    (plus the distance if requested).
    """
    # Positions of the key, join and output columns on the left side ...
    left_key = l_columns.index(l_key_attr)
    left_join = l_columns.index(l_join_attr)
    left_out = find_output_attribute_indices(l_columns, l_out_attrs)

    # ... and on the right side.
    right_key = r_columns.index(r_key_attr)
    right_join = r_columns.index(r_join_attr)
    right_out = find_output_attribute_indices(r_columns, r_out_attrs)

    sim_type = 'EDIT_DISTANCE'

    # Token ordering is generated from the join columns of both tables.
    tok_order = gen_token_ordering_for_tables(
        [ltable_list, rtable_list], [left_join, right_join],
        tokenizer, sim_type)

    # Cache the length of every left join value.
    left_lengths = [len(entry[left_join]) for entry in ltable_list]

    # Build the prefix index on the left join column.
    left_index = PrefixIndex(ltable_list, left_join, tokenizer,
                             sim_type, threshold, tok_order)
    left_index.build(False)

    prober = PrefixFilter(tokenizer, sim_type, threshold)

    accept = COMP_OP_MAP[comp_op]
    edit_distance = get_sim_function(sim_type)

    rows_out = []
    keys_only = l_out_attrs is None and r_out_attrs is None

    bar = pyprind.ProgBar(len(rtable_list)) if show_progress else None

    for right_row in rtable_list:
        right_text = right_row[right_join]
        right_len = len(right_text)

        ordered = order_using_token_ordering(
            tokenizer.tokenize(right_text), tok_order)

        # Candidate positions in ltable_list from the prefix filter.
        hits = prober.find_candidates(ordered, left_index)

        for pos in hits:
            within_length = (right_len - threshold <= left_lengths[pos]
                             <= right_len + threshold)
            if not within_length:
                continue

            left_row = ltable_list[pos]

            # Compute the actual edit distance for this pair.
            score = edit_distance(left_row[left_join], right_text)
            if not accept(score, threshold):
                continue

            if keys_only:
                out = [left_row[left_key], right_row[right_key]]
            else:
                out = get_output_row_from_tables(
                    left_row, right_row, left_key, right_key,
                    left_out, right_out)

            # Append the distance when the caller asked for it.
            if out_sim_score:
                out.append(score)

            rows_out.append(out)

        if bar is not None:
            bar.update()

    columns = get_output_header_from_tables(l_key_attr, r_key_attr,
                                            l_out_attrs, r_out_attrs,
                                            l_out_prefix, r_out_prefix)
    if out_sim_score:
        columns.append("_sim_score")

    # Turn the collected rows into a dataframe.
    return pd.DataFrame(rows_out, columns=columns)