def test_rb_block_tables():
    """Check ``RuleBasedBlocker.block_tables`` against a recomputed result.

    Two rules are registered on the blocker. The candidate set ``C`` it
    returns is checked for its column set, its key and its two foreign-key
    properties. The expected ID pairs are then recomputed from feature
    vectors extracted over ``D`` and compared, in order, with those in ``C``.
    """
    # NOTE(review): this name is defined again further down in this file;
    # the later definition rebinds it, so this copy is presumably never
    # collected by the test runner.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table = mg.get_features_for_blocking(A, B)

    rb = mg.RuleBasedBlocker()
    rb.add_rule([
        'name_name_mel(ltuple, rtuple) < 0.4',
        'birth_year_birth_year_lev(ltuple, rtuple) < 0.5',
    ], feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'], feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')

    expected_columns = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(expected_columns, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Block on a constant column added to both tables; presumably this
    # pairs every row of A with every row of B -- TODO confirm.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # The mask mirrors the two rules above with every comparison inverted.
    # Each comparison is parenthesised on purpose: `&` binds tighter than
    # `==`, so an unparenthesised `x & y == 1` means `(x & y) == 1`.
    passes_first_rule = ((fv.name_name_mel >= 0.4) |
                         (fv.birth_year_birth_year_lev >= 0.5))
    passes_second_rule = (fv.zipcode_zipcode_exm == 1)
    id_columns = ['ltable.ID', 'rtable.ID']

    # `.loc` does the label/boolean selection that `.ix` used to do here.
    expected_ids = fv.loc[passes_first_rule & passes_second_rule, id_columns]
    actual_ids = C[id_columns]
    ids_exp = list(expected_ids.set_index(id_columns).index.values)
    ids_act = list(actual_ids.set_index(id_columns).index.values)
    # Direct comparison instead of `cmp(...) == 0`: the `cmp` builtin does
    # not exist in Python 3, and a failure now reports the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tables():
    """Check ``BlackBoxBlocker.block_tables`` against a recomputed result.

    The blocker is configured with ``block_fn`` (defined outside this
    function). The candidate set ``C`` is checked for its column set, its
    key and its two foreign-key properties. The expected ID pairs are then
    recomputed from feature vectors over ``D`` and compared, in order, with
    those in ``C``.
    """
    # NOTE(review): this name is defined again further down in this file;
    # the later definition rebinds it, so this copy is presumably never
    # collected by the test runner.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')

    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')

    expected_columns = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(expected_columns, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # Block on a constant column added to both tables; presumably this
    # pairs every row of A with every row of B -- TODO confirm.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    id_columns = ['ltable.ID', 'rtable.ID']
    # Presumably block_fn drops pairs whose name_name_mel is below 0.4;
    # verify against its definition.
    # `.loc` does the label/boolean selection that `.ix` used to do here.
    expected_ids = fv.loc[fv.name_name_mel >= 0.4, id_columns]
    actual_ids = C[id_columns]
    ids_exp = list(expected_ids.set_index(id_columns).index.values)
    ids_act = list(actual_ids.set_index(id_columns).index.values)
    # Direct comparison instead of `cmp(...) == 0`: the `cmp` builtin does
    # not exist in Python 3, and a failure now reports the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_rb_block_tables():
    """Verify the candidate set produced by ``RuleBasedBlocker.block_tables``.

    Registers two rules, blocks tables A and B, and asserts the resulting
    columns, key and foreign-key properties. It then rebuilds the expected
    ID pairs from feature vectors and asserts they equal, in order, the ID
    pairs found in the blocker's output.
    """
    # NOTE(review): an earlier function in this file has the same name;
    # this definition rebinds it.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    rb = mg.RuleBasedBlocker()
    feature_table = mg.get_features_for_blocking(A, B)
    rb.add_rule(['name_name_mel(ltuple, rtuple) < 0.4',
                 'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'],
                feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'],
                feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')

    wanted_columns = sorted(['_id', 'ltable.ID', 'rtable.ID',
                             'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(wanted_columns, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Blocking on a constant column; presumably yields all (A, B) pairs
    # to serve as the baseline -- TODO confirm.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    pair_columns = ['ltable.ID', 'rtable.ID']
    # Inverse of the rules registered above. The equality test needs its
    # own parentheses because `&` has higher precedence than `==`.
    keep_mask = (
        ((fv.name_name_mel >= 0.4) | (fv.birth_year_birth_year_lev >= 0.5))
        & (fv.zipcode_zipcode_exm == 1)
    )
    # `.loc` replaces the `.ix` indexer, which newer pandas no longer has.
    expected_ids = fv.loc[keep_mask, pair_columns]
    actual_ids = C[pair_columns]
    ids_exp = list(expected_ids.set_index(pair_columns).index.values)
    ids_act = list(actual_ids.set_index(pair_columns).index.values)
    # `cmp` is a Python 2-only builtin; comparing the lists directly is
    # equivalent and works on both major versions.
    assert_equal(ids_exp, ids_act)
def test_bb_block_candset():
    """Check ``BlackBoxBlocker.block_candset`` against a recomputed result.

    A candidate set ``E`` is built by attribute-equivalence blocking on
    'zipcode' and then passed to a black-box blocker configured with
    ``block_fn`` (defined outside this function). Feature vectors are
    extracted from the output ``C``, and the ID pairs that satisfy the
    threshold are compared, in order, with all ID pairs in ``C``.
    """
    # NOTE(review): this name is defined again further down in this file;
    # the later definition rebinds it, so this copy is presumably never
    # collected by the test runner.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')

    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)

    feature_table = mg.get_features_for_blocking(A, B)
    fv = mg.extract_feature_vecs(C, feature_table=feature_table)

    id_columns = ['ltable.ID', 'rtable.ID']
    # Presumably block_fn drops pairs whose name_name_mel is below 0.4, so
    # every pair left in C should pass this mask; verify against block_fn.
    # `.loc` does the label/boolean selection that `.ix` used to do here.
    expected_ids = fv.loc[fv.name_name_mel >= 0.4, id_columns]
    actual_ids = C[id_columns]
    ids_exp = list(expected_ids.set_index(id_columns).index.values)
    ids_act = list(actual_ids.set_index(id_columns).index.values)
    # Direct comparison instead of `cmp(...) == 0`: the `cmp` builtin does
    # not exist in Python 3, and a failure now reports the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_bb_block_candset():
    """Verify the output of ``BlackBoxBlocker.block_candset``.

    Builds a candidate set by attribute-equivalence blocking on 'zipcode',
    runs it through a black-box blocker that uses ``block_fn`` (defined
    outside this function), and asserts that the ID pairs passing the
    feature threshold equal, in order, the ID pairs in the blocker output.
    """
    # NOTE(review): an earlier function in this file has the same name;
    # this definition rebinds it.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    E = mg.AttrEquivalenceBlocker().block_tables(A, B, 'zipcode', 'zipcode')

    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)

    feature_table = mg.get_features_for_blocking(A, B)
    fv = mg.extract_feature_vecs(C, feature_table=feature_table)

    pair_columns = ['ltable.ID', 'rtable.ID']
    # Threshold presumably matches what block_fn enforces; verify against
    # its definition.
    survives = fv.name_name_mel >= 0.4
    # `.loc` replaces the `.ix` indexer, which newer pandas no longer has.
    expected_ids = fv.loc[survives, pair_columns]
    actual_ids = C[pair_columns]
    ids_exp = list(expected_ids.set_index(pair_columns).index.values)
    ids_act = list(actual_ids.set_index(pair_columns).index.values)
    # `cmp` is a Python 2-only builtin; comparing the lists directly is
    # equivalent and works on both major versions.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tables():
    """Verify the candidate set produced by ``BlackBoxBlocker.block_tables``.

    Blocks tables A and B with a black-box blocker that uses ``block_fn``
    (defined outside this function) and asserts the resulting columns, key
    and foreign-key properties. It then rebuilds the expected ID pairs from
    feature vectors and asserts they equal, in order, the ID pairs in the
    blocker output.
    """
    # NOTE(review): an earlier function in this file has the same name;
    # this definition rebinds it.
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')

    wanted_columns = sorted(['_id', 'ltable.ID', 'rtable.ID',
                             'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(wanted_columns, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # Blocking on a constant column; presumably yields all (A, B) pairs
    # to serve as the baseline -- TODO confirm.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    pair_columns = ['ltable.ID', 'rtable.ID']
    # Threshold presumably matches what block_fn enforces; verify against
    # its definition.
    survives = fv.name_name_mel >= 0.4
    # `.loc` replaces the `.ix` indexer, which newer pandas no longer has.
    expected_ids = fv.loc[survives, pair_columns]
    actual_ids = C[pair_columns]
    ids_exp = list(expected_ids.set_index(pair_columns).index.values)
    ids_act = list(actual_ids.set_index(pair_columns).index.values)
    # `cmp` is a Python 2-only builtin; comparing the lists directly is
    # equivalent and works on both major versions.
    assert_equal(ids_exp, ids_act)