def test_save_load_bb_blocker():
    """Round-trip a BlackBoxBlocker through save_object / load_object.

    A blocker configured with a locally defined black-box function is
    saved to disk and loaded back. The reloaded blocker must have the same
    type as the original and, when run on the same tables, produce a
    candidate set with the same number of rows and the same columns.
    """
    # Imported locally, as in the original test, so the names are captured
    # in the closure of the black-box function below.
    from magellan.feature.simfunctions import jaccard
    from magellan.feature.tokenizers import tok_qgram

    def block_fn_1(ltuple, rtuple):
        # Returns True when the Jaccard score of the tok_qgram(..., 3)
        # tokenizations of the two 'address' values is below 0.4.
        val = jaccard(tok_qgram(ltuple['address'], 3),
                      tok_qgram(rtuple['address'], 3))
        return bool(val < 0.4)

    bb0 = mg.BlackBoxBlocker()
    bb0.set_black_box_function(block_fn_1)
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    C0 = bb0.block_tables(A, B)

    try:
        mg.save_object(bb0, filename)
        bb1 = mg.load_object(filename)
    finally:
        # Best-effort cleanup: remove the scratch file even if saving or
        # loading failed; a missing file is not an error for this test.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(bb0), type(bb1))
    C1 = bb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # Compare the original output against the reloaded blocker's output
    # (the previous version compared C0 with itself).
    assert_equal(sorted(C0.columns), sorted(C1.columns))
def test_bb_block_tuples():
    """Check BlackBoxBlocker.block_tuples on two fixed row pairs.

    With ``block_fn`` installed as the black-box function, the pair
    (A row 0, B row 0) is expected to yield True and the pair
    (A row 2, B row 1) is expected to yield False.
    """
    ltable = mg.load_dataset('table_A')
    rtable = mg.load_dataset('table_B')

    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(block_fn)

    # NOTE(review): `.ix` is deprecated/removed in newer pandas; whether
    # `.loc` or `.iloc` is the right replacement depends on the index of
    # the loaded tables -- confirm before migrating.
    first_result = blocker.block_tuples(ltable.ix[0], rtable.ix[0])
    assert_equal(first_result, True)

    second_result = blocker.block_tuples(ltable.ix[2], rtable.ix[1])
    assert_equal(second_result, False)
def test_bb_block_tables():
    """Check BlackBoxBlocker.block_tables output schema and surviving pairs.

    The blocker runs on tables A and B with 'zipcode' requested as an
    output attribute from each side. The test verifies the candidate set's
    columns, key and foreign-key properties, and then compares the
    surviving (ltable.ID, rtable.ID) pairs with the pairs whose
    ``name_name_mel`` feature value is at least 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')

    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID',
                 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # The feature table is built before the helper column is added below.
    feature_table = mg.get_features_for_blocking(A, B)

    # Give both tables an identical constant column and block on it;
    # presumably this makes the equivalence blocker keep every (A, B)
    # pair so features can be computed for all of them -- TODO confirm.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # Boolean mask + column labels: `.loc` is the direct replacement for
    # the removed `.ix` indexer here.
    expected_ids = fv.loc[(fv.name_name_mel >= 0.4),
                          ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)

    # Direct equality instead of `cmp(...) == 0`: works on Python 2 and 3
    # and shows the differing pairs on failure.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tables_wi_no_tuples():
    """Check block_tables output when no pair survives blocking.

    With ``evil_block_fn`` installed, the candidate set is expected to be
    empty while still carrying the id columns, the '_id' key and the
    foreign-key properties.
    """
    ltable = mg.load_dataset('table_A')
    rtable = mg.load_dataset('table_B')

    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(evil_block_fn)
    candset = blocker.block_tables(ltable, rtable)

    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(len(candset), 0)
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')

    # Foreign-key metadata must be present even on an empty result.
    for prop_name, expected_value in (
            ('foreign_key_ltable', 'ltable.ID'),
            ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop_name), expected_value)
def test_bb_block_candset_wi_no_tuples():
    """Check block_candset output when no pair survives blocking.

    A candidate set is first produced by attribute-equivalence blocking on
    'birth_year'; applying a BlackBoxBlocker with ``evil_block_fn`` to it
    is expected to leave zero rows while keeping the id columns, the '_id'
    key and the foreign-key properties.
    """
    ltable = mg.load_dataset('table_A')
    rtable = mg.load_dataset('table_B')

    # Input candidate set for the black-box blocker.
    attr_blocker = mg.AttrEquivalenceBlocker()
    input_candset = attr_blocker.block_tables(
        ltable, rtable, 'birth_year', 'birth_year')

    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(evil_block_fn)
    result = blocker.block_candset(input_candset)

    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns), expected_columns)
    assert_equal(result.get_key(), '_id')

    # Foreign-key metadata must be present even on an empty result.
    for prop_name, expected_value in (
            ('foreign_key_ltable', 'ltable.ID'),
            ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(result.get_property(prop_name), expected_value)
def test_bb_block_candset():
    """Check the pairs kept by BlackBoxBlocker.block_candset.

    A candidate set built by attribute-equivalence blocking on 'zipcode'
    is filtered with a BlackBoxBlocker using ``block_fn``. Feature vectors
    are extracted for the filtered set, and the pairs whose
    ``name_name_mel`` value is at least 0.4 must equal the filtered set's
    (ltable.ID, rtable.ID) pairs.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)

    feature_table = mg.get_features_for_blocking(A, B)
    # NOTE(review): features are extracted from C (the blocker's output),
    # so this only checks that every surviving pair scores >= 0.4; it
    # cannot detect pairs wrongly dropped from E. Consider extracting
    # from E instead -- confirm against block_fn's definition.
    fv = mg.extract_feature_vecs(C, feature_table=feature_table)

    # Boolean mask + column labels: `.loc` is the direct replacement for
    # the removed `.ix` indexer here.
    expected_ids = fv.loc[(fv.name_name_mel >= 0.4),
                          ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)

    # Direct equality instead of `cmp(...) == 0`: works on Python 2 and 3
    # and shows the differing pairs on failure.
    assert_equal(ids_exp, ids_act)