예제 #1
0
def overlap_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer,
                 threshold,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1):
    """Join two tables using overlap similarity measure.

    Finds tuple pairs from ltable and rtable such that
    Overlap(ltable.l_join_attr, rtable.r_join_attr) >= threshold

    Args:
    ltable, rtable : Pandas data frame
    l_key_attr, r_key_attr : String, key attribute from ltable and rtable
    l_join_attr, r_join_attr : String, join attribute from ltable and rtable
    tokenizer : Tokenizer object, tokenizer to be used to tokenize join attributes
    threshold : float, overlap threshold to be satisfied
    l_out_attrs, r_out_attrs : list of attributes to be included in the output table from ltable and rtable
    l_out_prefix, r_out_prefix : String, prefix to be used in the attribute names of the output table
    out_sim_score : boolean, indicates if similarity score needs to be included in the output table

    Returns:
    result : Pandas data frame
    """
    overlap_filter = OverlapFilter(tokenizer, threshold)
    return overlap_filter.filter_tables(ltable, rtable,
                                        l_key_attr, r_key_attr,
                                        l_join_attr, r_join_attr,
                                        l_out_attrs, r_out_attrs,
                                        l_out_prefix, r_out_prefix,
                                        out_sim_score, n_jobs)
    def test_apply_matcher_with_join_attr_of_type_int(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='
        l_join_attr = 'A.zipcode'
        r_join_attr = 'B.zipcode'

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[l_join_attr])),
                                 tok.tokenize(str(row[r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                                               self.l_key_attr,
                                               self.r_key_attr, l_join_attr,
                                               r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(
            candset, DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, self.ltable, self.rtable,
            self.l_key_attr, self.r_key_attr, l_join_attr, r_join_attr, tok,
            sim_func, threshold)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_apply_matcher(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                tok.tokenize(str(row[self.l_join_attr])),
                tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        comp_fn = COMP_OP_MAP[comp_op]
        # compute expected output pairs
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # use overlap filter to obtain a candset.
        overlap_filter = OverlapFilter(tok, 1, comp_op)
        candset = overlap_filter.filter_tables(self.ltable, self.rtable,
                              self.l_key_attr, self.r_key_attr,
                              self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset
        output_candset = apply_matcher(candset,
            DEFAULT_L_OUT_PREFIX+self.l_key_attr, DEFAULT_R_OUT_PREFIX+self.r_key_attr,
            self.ltable, self.rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, False,
            [self.l_join_attr], [self.r_join_attr], out_sim_score=True)

        expected_output_attrs=['_id',
                               DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                               DEFAULT_L_OUT_PREFIX + self.l_join_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_join_attr,
                               '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
예제 #4
0
    def test_candset_with_numeric_r_filter_attr(self):
        A = pd.DataFrame([{'l_id': 1, 'l_attr': '1990'}])
        B = pd.DataFrame([{'r_id': 1, 'r_attr': 2001}])

        A['tmp_join_key'] = 1
        B['tmp_join_key'] = 1
        C = pd.merge(A[['l_id', 'tmp_join_key']],
                     B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', 1)

        qg2_tok = QgramTokenizer(2, return_set=True)
        overlap_filter = OverlapFilter(qg2_tok)
        overlap_filter.filter_candset(C, 'l_id', 'r_id', A, B, 'l_id', 'r_id',
                                      'l_attr', 'r_attr')
 def test_candset_with_numeric_r_filter_attr(self):                          
     A = pd.DataFrame([{'l_id': 1, 'l_attr':'1990'}])                          
     B = pd.DataFrame([{'r_id': 1, 'r_attr':2001}])                          
                                                                             
     A['tmp_join_key'] = 1                                                   
     B['tmp_join_key'] = 1                                                   
     C = pd.merge(A[['l_id', 'tmp_join_key']],                               
                  B[['r_id', 'tmp_join_key']],                               
              on='tmp_join_key').drop('tmp_join_key', 1)                     
                                                                             
     qg2_tok = QgramTokenizer(2, return_set=True)                            
     overlap_filter = OverlapFilter(qg2_tok)                                 
     overlap_filter.filter_candset(C, 'l_id', 'r_id',                                        
                                   A, B, 'l_id', 'r_id',                     
                                   'l_attr', 'r_attr')  
예제 #6
0
    def test_filter_tables(self, tokenizer, overlap_size, comp_op,
                           allow_missing, args, expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                       allow_missing)
        actual_candset = overlap_filter.filter_tables(*args)

        expected_output_attrs = ['_id']
        l_out_prefix = self.default_l_out_prefix
        r_out_prefix = self.default_r_out_prefix

        # Check for l_out_prefix in args.
        if len(args) > 8:
            l_out_prefix = args[8]
        expected_output_attrs.append(l_out_prefix + args[2])

        # Check for r_out_prefix in args.
        if len(args) > 9:
            r_out_prefix = args[9]
        expected_output_attrs.append(r_out_prefix + args[3])

        # Check for l_out_attrs in args.
        if len(args) > 6:
            if args[6]:
                l_out_attrs = remove_redundant_attrs(args[6], args[2])
                for attr in l_out_attrs:
                    expected_output_attrs.append(l_out_prefix + attr)

        # Check for r_out_attrs in args.
        if len(args) > 7:
            if args[7]:
                r_out_attrs = remove_redundant_attrs(args[7], args[3])
                for attr in r_out_attrs:
                    expected_output_attrs.append(r_out_prefix + attr)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = set()
        for idx, row in actual_candset.iterrows():
            actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                       str(row[r_out_prefix + args[3]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_filter_tables(self, tokenizer, overlap_size, comp_op,
                           allow_missing, args, expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       comp_op, allow_missing)
        actual_candset = overlap_filter.filter_tables(*args)

        expected_output_attrs = ['_id']
        l_out_prefix = self.default_l_out_prefix
        r_out_prefix = self.default_r_out_prefix

        # Check for l_out_prefix in args.
        if len(args) > 8:
            l_out_prefix = args[8]
        expected_output_attrs.append(l_out_prefix + args[2])

        # Check for r_out_prefix in args.
        if len(args) > 9:
            r_out_prefix = args[9]
        expected_output_attrs.append(r_out_prefix + args[3])

        # Check for l_out_attrs in args.
        if len(args) > 6:
            if args[6]:
                l_out_attrs = remove_redundant_attrs(args[6], args[2])
                for attr in l_out_attrs:
                    expected_output_attrs.append(l_out_prefix + attr)

        # Check for r_out_attrs in args.
        if len(args) > 7:
            if args[7]:
                r_out_attrs = remove_redundant_attrs(args[7], args[3])
                for attr in r_out_attrs:
                    expected_output_attrs.append(r_out_prefix + attr)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_candset.columns.values),
                          expected_output_attrs)

        actual_pairs = set()
        for idx, row in actual_candset.iterrows():
            actual_pairs.add(','.join((str(row[l_out_prefix + args[2]]),
                                       str(row[r_out_prefix + args[3]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
예제 #8
0
    def test_filter_candset(self, tokenizer, overlap_size, args,
                            expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size)
        actual_output_candset = overlap_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
    def test_filter_candset(self, tokenizer, overlap_size, comp_op,
                            allow_missing, args, expected_pairs):
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       comp_op, allow_missing)
        actual_output_candset = overlap_filter.filter_candset(*args)

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(actual_output_candset.columns.values),
                          list(args[0].columns.values))

        actual_pairs = set()
        for idx, row in actual_output_candset.iterrows():
            actual_pairs.add(','.join((str(row[args[1]]), str(row[args[2]]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
예제 #10
0
def _overlap_coefficient_join_split(
        ltable_list, rtable_list, l_columns, r_columns, l_key_attr, r_key_attr,
        l_join_attr, r_join_attr, tokenizer, threshold, comp_op, allow_empty,
        l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score,
        show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list,
                                   l_join_attr_index,
                                   tokenizer,
                                   cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]

        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the inverted
        # index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[l_id], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable_list[l_id][l_key_attr_index],
                        r_row[r_key_attr_index]
                    ]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
            r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score
            sim_score = (
                float(overlap) /
                float(min(r_num_tokens, inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[cand], r_row, l_key_attr_index,
                        r_key_attr_index, l_out_attrs_indices,
                        r_out_attrs_indices)
                else:
                    output_row = [
                        ltable_list[cand][l_key_attr_index],
                        r_row[r_key_attr_index]
                    ]

                # if out_sim_score flag is set, append the overlap coefficient
                # score to the output record.
                if out_sim_score:
                    output_row.append(sim_score)

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable"""
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of 
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]

        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining   
        # the current rtable record with those records in ltable with empty set 
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the inverted 
        # index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_list[l_id], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                else:
                    output_row = [ltable_list[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
            continue

        # probe inverted index and find overlap of candidates 
        candidate_overlap = overlap_filter.find_candidates(
                                r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score                           
            sim_score = (float(overlap) /
                         float(min(r_num_tokens,
                                   inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_list[cand], r_row,
                                     l_key_attr_index, r_key_attr_index,
                                     l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[cand][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                # if out_sim_score flag is set, append the overlap coefficient 
                # score to the output record.  
                if out_sim_score:
                    output_row.append(sim_score)

                output_rows.append(output_row)

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
 def test_rtable_r_key_attr_with_missing_value(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   self.A, self.B,
                                  'l_id', 'r_attr',
                                  'l_attr', 'r_attr')
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(
            lambda row: sim_func(tok.tokenize(str(row[self.l_join_attr])),
                                 tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join(
                    (str(row[self.l_key_attr]), str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr])
                        or pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True.
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(
            self.orig_ltable, self.orig_rtable, self.l_key_attr,
            self.r_key_attr, self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
                                       DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                                       DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                                       self.orig_ltable,
                                       self.orig_rtable,
                                       self.l_key_attr,
                                       self.r_key_attr,
                                       self.l_join_attr,
                                       self.r_join_attr,
                                       tok,
                                       sim_func,
                                       threshold,
                                       comp_op,
                                       True,
                                       out_sim_score=True)

        expected_output_attrs = [
            '_id', DEFAULT_L_OUT_PREFIX + self.l_key_attr,
            DEFAULT_R_OUT_PREFIX + self.r_key_attr, '_sim_score'
        ]

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join(
                (str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                 str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))
 def test_numeric_r_filter_attr(self):                                       
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)          
     overlap_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',            
                                  'A.attr', 'B.int_attr')
 def test_invalid_ltable(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   [], self.B,
                                  'l_id', 'r_id',
                                  'l_attr', 'r_attr')
예제 #16
0
 def test_invalid_rtable_r_filter_attr(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', self.A, self.B,
                                   'l_id', 'r_id', 'l_attr', 'invalid_attr')
예제 #17
0
 def test_invalid_r_out_attr(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)
     overlap_filter.filter_tables(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                                  'B.attr', ['A.attr'], ['B.invalid_attr'])
예제 #18
0
 def test_filter_pair(self, lstring, rstring, tokenizer, overlap_size,
                      comp_op, allow_missing, expected_output):
     overlap_filter = OverlapFilter(tokenizer, overlap_size, comp_op,
                                    allow_missing)
     actual_output = overlap_filter.filter_pair(lstring, rstring)
     assert_equal(actual_output, expected_output)
예제 #19
0
def overlap_join(ltable, rtable,
                 l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr,
                 tokenizer, threshold, comp_op='>=',
                 allow_missing=False,
                 l_out_attrs=None, r_out_attrs=None,
                 l_out_prefix='l_', r_out_prefix='r_',
                 out_sim_score=True, n_jobs=1, show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:                       
                                                                                
        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from left table and right table such that the overlap 
    between the join attributes satisfies the condition on input threshold. For 
    example, if the comparison operator is '>=', finds tuple pairs whose 
    overlap between the strings that are the values of the join attributes is 
    greater than or equal to the input threshold, as specified in "threshold".

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join     
            attributes.                                                         
                                                                                
        threshold (float): overlap threshold to be satisfied.        
                                                                                
        comp_op (string): comparison operator. Supported values are '>=', '>'   
            and '=' (defaults to '>=').                                         
                                                                                
        allow_missing (boolean): flag to indicate whether tuple pairs with      
            missing value in at least one of the join attributes should be      
            included in the output (defaults to False). If this flag is set to  
            True, a tuple in ltable with missing value in the join attribute    
            will be matched with every tuple in rtable and vice versa.          
                                                                                
        l_out_attrs (list): list of attribute names from the left table to be   
            included in the output table (defaults to None).                    
                                                                                
        r_out_attrs (list): list of attribute names from the right table to be  
            included in the output table (defaults to None).                    
                                                                                
        l_out_prefix (string): prefix to be used for the attribute names coming 
            from the left table, in the output table (defaults to 'l\_').       
                                                                                
        r_out_prefix (string): prefix to be used for the attribute names coming 
            from the right table, in the output table (defaults to 'r\_').      
                                                                                
        out_sim_score (boolean): flag to indicate whether similarity score      
            should be included in the output table (defaults to True). Setting  
            this flag to True will add a column named '_sim_score' in the       
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.                                          

        n_jobs (int): number of parallel jobs to use for the computation        
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,  
            no parallel computing code is used at all, which is useful for      
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used      
            (where n_cpus is the total number of CPUs in the machine). Thus for 
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)    
            becomes less than 1, then no parallel computing code will be used   
            (i.e., equivalent to the default).                                                                                 
                                                                                
        show_progress (boolean): flag to indicate whether task progress should  
            be displayed to the user (defaults to True).                        
                                                                                
    Returns:                                                                    
        An output table containing tuple pairs that satisfy the join            
        condition (DataFrame).  
    """

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # use overlap filter to perform the join.
    overlap_filter = OverlapFilter(tokenizer, threshold, comp_op, allow_missing)
    output_table =  overlap_filter.filter_tables(ltable, rtable,
                                                 l_key_attr, r_key_attr,
                                                 l_join_attr, r_join_attr,
                                                 l_out_attrs, r_out_attrs,
                                                 l_out_prefix, r_out_prefix,
                                                 out_sim_score, n_jobs,
                                                 show_progress)

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(False)

    return output_table
예제 #20
0
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr, 
                 l_join_attr, r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    # get attributes to project.                                                
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)   
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)  

    # convert dataframe to array for faster access       
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs, l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs, r_join_attr)

    # find column indices of key attr and join attr in ltable array                  
    l_key_attr_index = l_proj_attrs.index(l_key_attr)                              
    l_join_attr_index = l_proj_attrs.index(l_join_attr)                            
                                                                                
    # find column indices of key attr and join attr in rtable array                   
    r_key_attr_index = r_proj_attrs.index(r_key_attr)                              
    r_join_attr_index = r_proj_attrs.index(r_join_attr)  

    # create a whitespace tokenizer to tokenize join attributes                 
    ws_tok = WhitespaceTokenizer(return_set=True)     

    # build inverted index on join attriubute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(ceil(float(sample_size) / float(y_param)))   
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))                    

    overlap_filter = OverlapFilter(ws_tok, 1)                                

    output_rows = [] 

    if show_progress:                                                           
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)    

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates                   
        cand_overlap = overlap_filter.find_candidates(                     
                           r_join_attr_tokens, inverted_index)          

        sampled_ltuples = set() 
        for cand in sorted(cand_overlap.items(), key=operator.itemgetter(1), 
                           reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break 
            sampled_ltuples.add(cand[0])

        ltable_size = len(ltable_array)
        while len(sampled_ltuples) < y_param:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:                                                       
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):                          
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])
   
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,       
                                                  None, None,     
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)
             
    # add an id column named '_id' to the output table.                         
    output_table.insert(0, '_id', range(0, len(output_table)))    

    return output_table           
def overlap_join_py(ltable,
                    rtable,
                    l_key_attr,
                    r_key_attr,
                    l_join_attr,
                    r_join_attr,
                    tokenizer,
                    threshold,
                    comp_op='>=',
                    allow_missing=False,
                    l_out_attrs=None,
                    r_out_attrs=None,
                    l_out_prefix='l_',
                    r_out_prefix='r_',
                    out_sim_score=True,
                    n_jobs=1,
                    show_progress=True):
    """Join two tables using overlap measure.

    For two sets X and Y, the overlap between them is given by:                       
                                                                                
        :math:`overlap(X, Y) = |X \\cap Y|`

    Finds tuple pairs from left table and right table such that the overlap 
    between the join attributes satisfies the condition on input threshold. For 
    example, if the comparison operator is '>=', finds tuple pairs whose 
    overlap between the strings that are the values of the join attributes is 
    greater than or equal to the input threshold, as specified in "threshold".

    Args:
        ltable (DataFrame): left input table.

        rtable (DataFrame): right input table.

        l_key_attr (string): key attribute in left table.

        r_key_attr (string): key attribute in right table.

        l_join_attr (string): join attribute in left table.

        r_join_attr (string): join attribute in right table.

        tokenizer (Tokenizer): tokenizer to be used to tokenize join     
            attributes.                                                         
                                                                                
        threshold (float): overlap threshold to be satisfied.        
                                                                                
        comp_op (string): comparison operator. Supported values are '>=', '>'   
            and '=' (defaults to '>=').                                         
                                                                                
        allow_missing (boolean): flag to indicate whether tuple pairs with      
            missing value in at least one of the join attributes should be      
            included in the output (defaults to False). If this flag is set to  
            True, a tuple in ltable with missing value in the join attribute    
            will be matched with every tuple in rtable and vice versa.          
                                                                                
        l_out_attrs (list): list of attribute names from the left table to be   
            included in the output table (defaults to None).                    
                                                                                
        r_out_attrs (list): list of attribute names from the right table to be  
            included in the output table (defaults to None).                    
                                                                                
        l_out_prefix (string): prefix to be used for the attribute names coming 
            from the left table, in the output table (defaults to 'l\_').       
                                                                                
        r_out_prefix (string): prefix to be used for the attribute names coming 
            from the right table, in the output table (defaults to 'r\_').      
                                                                                
        out_sim_score (boolean): flag to indicate whether similarity score      
            should be included in the output table (defaults to True). Setting  
            this flag to True will add a column named '_sim_score' in the       
            output table. This column will contain the similarity scores for the
            tuple pairs in the output.                                          

        n_jobs (int): number of parallel jobs to use for the computation        
            (defaults to 1). If -1 is given, all CPUs are used. If 1 is given,  
            no parallel computing code is used at all, which is useful for      
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used      
            (where n_cpus is the total number of CPUs in the machine). Thus for 
            n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs)    
            becomes less than 1, then no parallel computing code will be used   
            (i.e., equivalent to the default).                                                                                 
                                                                                
        show_progress (boolean): flag to indicate whether task progress should  
            be displayed to the user (defaults to True).                        
                                                                                
    Returns:                                                                    
        An output table containing tuple pairs that satisfy the join            
        condition (DataFrame).  
    """

    # check if the input tokenizer is valid
    validate_tokenizer(tokenizer)

    # set return_set flag of tokenizer to be True, in case it is set to False
    revert_tokenizer_return_set_flag = False
    if not tokenizer.get_return_set():
        tokenizer.set_return_set(True)
        revert_tokenizer_return_set_flag = True

    # use overlap filter to perform the join.
    overlap_filter = OverlapFilter(tokenizer, threshold, comp_op,
                                   allow_missing)
    output_table = overlap_filter.filter_tables(ltable, rtable, l_key_attr,
                                                r_key_attr, l_join_attr,
                                                r_join_attr, l_out_attrs,
                                                r_out_attrs, l_out_prefix,
                                                r_out_prefix, out_sim_score,
                                                n_jobs, show_progress)

    # revert the return_set flag of tokenizer, in case it was modified.
    if revert_tokenizer_return_set_flag:
        tokenizer.set_return_set(False)

    return output_table
 def test_filter_pair(self, lstring, rstring, tokenizer,
                      overlap_size, comp_op, allow_missing, expected_output):
     overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                    comp_op, allow_missing)
     actual_output = overlap_filter.filter_pair(lstring, rstring)
     assert_equal(actual_output, expected_output)
 def test_invalid_r_out_attr(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)
     overlap_filter.filter_tables(self.A, self.B, 'A.id', 'B.id',
                                  'A.attr', 'B.attr',
                                  ['A.attr'], ['B.invalid_attr'])
 def test_invalid_rtable_r_filter_attr(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id',
                                   self.A, self.B,
                                  'l_id', 'r_id',
                                  'l_attr', 'invalid_attr')
예제 #25
0
 def test_invalid_ltable(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', [], self.B,
                                   'l_id', 'r_id', 'l_attr', 'r_attr')
예제 #26
0
 def test_invalid_threshold(self):
     overlap_filter = OverlapFilter(self.tokenizer, -1)
예제 #27
0
 def test_rtable_r_key_attr_with_missing_value(self):
     overlap_filter = OverlapFilter(self.dlm)
     overlap_filter.filter_candset(self.C, 'l_id', 'r_id', self.A, self.B,
                                   'l_id', 'r_attr', 'l_attr', 'r_attr')
예제 #28
0
    def block_candset(self,
                      candset,
                      l_overlap_attr,
                      r_overlap_attr,
                      rem_stop_words=False,
                      q_val=None,
                      word_level=True,
                      overlap_size=1,
                      allow_missing=False,
                      verbose=False,
                      show_progress=True,
                      n_jobs=1):
        """Blocks an input candidate set of tuple pairs based on the overlap
           of token sets of attribute values.

        Finds tuple pairs from an input candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information


                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus are the total number of CPUs in the
                machine).Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # get and validate metadata
        log_info(
            logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
            'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key, logger,
                                          verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
        self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # create a filter for overlap similarity join
        overlap_filter = OverlapFilter(tokenizer,
                                       overlap_size,
                                       allow_missing=allow_missing)

        # # perform overlap similarity filtering of the candset
        out_table = overlap_filter.filter_candset(candset,
                                                  fk_ltable,
                                                  fk_rtable,
                                                  l_df,
                                                  r_df,
                                                  l_key,
                                                  r_key,
                                                  l_overlap_attr,
                                                  r_overlap_attr,
                                                  n_jobs,
                                                  show_progress=show_progress)
        # update catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return out_table
예제 #29
0
 def test_invalid_rtable(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)
     overlap_filter.filter_tables(self.A, [], 'A.id', 'B.id', 'A.attr',
                                  'B.attr')
 def test_invalid_rtable(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)
     overlap_filter.filter_tables(self.A, [], 'A.id', 'B.id',
                                  'A.attr', 'B.attr')
예제 #31
0
 def test_numeric_r_filter_attr(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold)
     overlap_filter.filter_tables(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                                  'B.int_attr')
예제 #32
0
def sample_pairs(ltable,
                 rtable,
                 l_key_attr,
                 r_key_attr,
                 l_join_attr,
                 r_join_attr,
                 sample_size,
                 y_param,
                 seed,
                 l_out_prefix='l_',
                 r_out_prefix='r_',
                 show_progress=True):
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attriubute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(
        ceil(float(sample_size) / float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(),
                           key=operator.itemgetter(1),
                           reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        ltable_size = len(ltable_array)
        while len(sampled_ltuples) < y_param:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr, None,
                                                  None, l_out_prefix,
                                                  r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
예제 #33
0
 def test_invalid_tokenizer(self):
     overlap_filter = OverlapFilter([], self.threshold)
    def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                      rem_stop_words=False, q_val=None, word_level=True,
                      overlap_size=1, allow_missing=False,
                      verbose=False, show_progress=True, n_jobs=1):
        """Blocks an input candidate set of tuple pairs based on the overlap
           of token sets of attribute values.

        Finds tuple pairs from an input candidate set of tuple pairs such that
        the overlap between (a) the set of tokens obtained by tokenizing the
        value of attribute l_overlap_attr of the left tuple in a tuple pair,
        and (b) the set of tokens obtained by tokenizing the value of
        attribute r_overlap_attr of the right tuple in the tuple pair,
        is above a certain threshold.

        Args:
            candset (DataFrame): The input candidate set of tuple pairs.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple pair with missing value in either
                                     blocking attribute will be retained in the
                                     output candidate set.

            verbose (boolean): A flag to indicate whether the debug information

                should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                                     be displayed to the user (defaults to True).

            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus are the total number of CPUs in the
                machine).Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `candset` is not of type pandas
                DataFrame.
            AssertionError: If `l_overlap_attr` is not of type string.
            AssertionError: If `r_overlap_attr` is not of type string.
            AssertionError: If `q_val` is not of type int.
            AssertionError: If `word_level` is not of type boolean.
            AssertionError: If `overlap_size` is not of type int.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `show_progress` is not of type
                boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_overlap_attr` is not in the ltable
                columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.
            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.
        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

            >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Include all possible tuple pairs with missing values
            >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
            # Execute blocking using multiple cores
            >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
            # Use q-gram tokenizer
            >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)


        """

        # validate data types of standard input parameters
        self.validate_types_params_candset(candset, verbose, show_progress,
                                           n_jobs)

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val,
                                         word_level, overlap_size)

        # get and validate metadata
        log_info(logger,
                 'Required metadata: cand.set key, fk ltable, fk rtable, '
                 'ltable, rtable, ltable key, rtable key', verbose)

        # # get metadata
        key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
            candset, logger, verbose)

        # # validate metadata
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

        # validate overlap attrs
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                    r_overlap_attr)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # do blocking

        # # do projection before merge
        l_df = ltable[[l_key, l_overlap_attr]]
        r_df = rtable[[r_key, r_overlap_attr]]

        # # case the overlap attribute to string if required.
        l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
        ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

        # # cleanup the tables from non-ascii characters, punctuations, and stop words
        self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
        self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

        # # determine which tokenizer to use
        if word_level == True:
            # # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # create a filter for overlap similarity join
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       allow_missing=allow_missing)

        # # perform overlap similarity filtering of the candset
        out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable,
                                                  l_df, r_df, l_key, r_key,
                                                  l_overlap_attr,
                                                  r_overlap_attr,
                                                  n_jobs,
                                                  show_progress=show_progress)
        # update catalog
        cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable, ltable,
                                  rtable)

        # return candidate set
        return out_table
예제 #35
0
 def test_invalid_comp_op_le(self):
     overlap_filter = OverlapFilter(self.tokenizer, self.threshold, '<=')
    def block_tuples(self, ltuple, rtuple, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True,
                     overlap_size=1, allow_missing=False):
        """Blocks a tuple pair based on the overlap of token sets of attribute
           values.
        
        Args:
            ltuple (Series): The input left tuple.

            rtuple (Series): The input right tuple.
            
            l_overlap_attr (string): The overlap attribute in left tuple.

            r_overlap_attr (string): The overlap attribute in right tuple.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): A value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether a tuple pair
                                     with missing value in at least one of the
                                     blocking attributes should be blocked
                                     (defaults to False). If this flag is set
                                     to True, the pair will be kept if either
                                     ltuple has missing value in l_block_attr
                                     or rtuple has missing value in r_block_attr
                                     or both.

        Returns:
            A status indicating if the tuple pair is blocked (boolean).

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> status = ob.block_tuples(A.ix[0], B.ix[0], 'address', 'address')

        """

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val,
                                         word_level, overlap_size)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # determine which tokenizer to use
        if word_level == True:
            # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # create a qgram tokenizer 
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # cleanup the tuples from non-ascii characters, punctuations, and stop words
        l_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
        r_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

        # create a filter for overlap similarity 
        overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                       allow_missing=allow_missing)

        return overlap_filter.filter_pair(l_val, r_val)
예제 #37
0
    def block_tuples(self,
                     ltuple,
                     rtuple,
                     l_overlap_attr,
                     r_overlap_attr,
                     rem_stop_words=False,
                     q_val=None,
                     word_level=True,
                     overlap_size=1,
                     allow_missing=False):
        """Blocks a tuple pair based on the overlap of token sets of attribute
           values.
        
        Args:
            ltuple (Series): The input left tuple.

            rtuple (Series): The input right tuple.
            
            l_overlap_attr (string): The overlap attribute in left tuple.

            r_overlap_attr (string): The overlap attribute in right tuple.

            rem_stop_words (boolean): A flag to indicate whether stop words
                                      (e.g., a, an, the) should be removed
                                      from the token sets of the overlap
                                      attribute values (defaults to False).

            q_val (int): A value of q to use if the overlap attributes values
                         are to be tokenized as qgrams (defaults to None).
 
            word_level (boolean): A flag to indicate whether the overlap
                                  attributes should be tokenized as words
                                  (i.e, using whitespace as delimiter)
                                  (defaults to True).

            overlap_size (int): The minimum number of tokens that must overlap
                                (defaults to 1).

            allow_missing (boolean): A flag to indicate whether a tuple pair
                                     with missing value in at least one of the
                                     blocking attributes should be blocked
                                     (defaults to False). If this flag is set
                                     to True, the pair will be kept if either
                                     ltuple has missing value in l_block_attr
                                     or rtuple has missing value in r_block_attr
                                     or both.

        Returns:
            A status indicating if the tuple pair is blocked (boolean).

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = em.OverlapBlocker()
            >>> status = ob.block_tuples(A.ix[0], B.ix[0], 'address', 'address')

        """

        # validate data types of input parameters specific to overlap blocker
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

        # validate word_level and q_val
        self.validate_word_level_qval(word_level, q_val)

        # determine which tokenizer to use
        if word_level == True:
            # # create a whitespace tokenizer
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            # # create a qgram tokenizer
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # # cleanup the tuples from non-ascii characters, punctuations, and stop words
        l_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
        r_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

        # create a filter for overlap similarity
        overlap_filter = OverlapFilter(tokenizer,
                                       overlap_size,
                                       allow_missing=allow_missing)

        return overlap_filter.filter_pair(l_val, r_val)
    def test_apply_matcher_with_allow_missing(self):
        tok = QgramTokenizer(qval=2, return_set=True)
        sim_func = get_sim_function('JACCARD')
        threshold = 0.3
        comp_op = '>='

        # apply sim function to the entire cartesian product to obtain
        # the expected set of pairs satisfying the threshold.
        cartprod = self.cartprod
        cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
                tok.tokenize(str(row[self.l_join_attr])),
                tok.tokenize(str(row[self.r_join_attr]))),
            axis=1)

        # compute expected output pairs
        comp_fn = COMP_OP_MAP[comp_op]
        expected_pairs = set()
        for idx, row in cartprod.iterrows():
            if comp_fn(float(row['sim_score']), threshold):
                expected_pairs.add(','.join((str(row[self.l_key_attr]),
                                             str(row[self.r_key_attr]))))

        # find pairs that need to be included in output due to
        # the presence of missing value in one of the join attributes.
        missing_pairs = set()
        for l_idx, l_row in self.orig_ltable.iterrows():
            for r_idx, r_row in self.orig_rtable.iterrows():
                if (pd.isnull(l_row[self.l_join_attr]) or
                    pd.isnull(r_row[self.r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[self.l_key_attr]),
                                                str(r_row[self.r_key_attr]))))

        # add the pairs containing missing value to the set of expected pairs.
        expected_pairs = expected_pairs.union(missing_pairs)

        # use overlap filter to obtain a candset with allow_missing set to True. 
        overlap_filter = OverlapFilter(tok, 1, comp_op, allow_missing=True)
        candset = overlap_filter.filter_tables(self.orig_ltable, self.orig_rtable,
                              self.l_key_attr, self.r_key_attr,
                              self.l_join_attr, self.r_join_attr)

        # apply a jaccard matcher to the candset with allow_missing set to True.
        output_candset = apply_matcher(candset,
            DEFAULT_L_OUT_PREFIX+self.l_key_attr, DEFAULT_R_OUT_PREFIX+self.r_key_attr,
            self.orig_ltable, self.orig_rtable, self.l_key_attr, self.r_key_attr,
            self.l_join_attr, self.r_join_attr, tok, sim_func, threshold,
            comp_op, True, out_sim_score=True)

        expected_output_attrs=['_id',
                               DEFAULT_L_OUT_PREFIX + self.l_key_attr,
                               DEFAULT_R_OUT_PREFIX + self.r_key_attr,
                               '_sim_score']

        # verify whether the output table has the necessary attributes.
        assert_list_equal(list(output_candset.columns.values),
                          expected_output_attrs)
        actual_pairs = set()
        for idx, row in output_candset.iterrows():
            actual_pairs.add(','.join((str(row[DEFAULT_L_OUT_PREFIX + self.l_key_attr]),
                                       str(row[DEFAULT_R_OUT_PREFIX + self.r_key_attr]))))

        # verify whether the actual pairs and the expected pairs match.
        assert_equal(len(expected_pairs), len(actual_pairs))
        common_pairs = actual_pairs.intersection(expected_pairs)
        assert_equal(len(common_pairs), len(expected_pairs))