import time

import pandas as pd
import psutil
from distributed import Client
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle

# RuleBasedBlocker and get_features_for_blocking are assumed to be imported
# from the dmagellan package used throughout this repo.

pbar = ProgressBar()
pbar.register()

print("Mem. usage before reading: {0}".format(psutil.virtual_memory().used / 1e9))
# A = pd.read_csv('../datasets/sample_citeseer_200k.csv')
# B = pd.read_csv('../datasets/sample_dblp_200k.csv')
A = pd.read_csv('../datasets/sample_citeseer_100k.csv')
B = pd.read_csv('../datasets/sample_dblp_100k.csv')
print(len(A), len(B))

# Build the blocking feature table and a rule-based blocker that drops pairs
# whose 3-gram Jaccard similarity on the title is below 0.8.
block_f = get_features_for_blocking(A, B)
rb = RuleBasedBlocker()
# _ = rb.add_rule(['title_title_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.8'], block_f)
_ = rb.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.8'], block_f)
rb.set_table_attrs(['title'], ['title'])

memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
print("Mem. usage before blocking: {0}".format(memUsageBefore))
C = rb.block_tables(A, B, 'id', 'id', nltable_chunks=2, nrtable_chunks=2,
                    l_output_attrs=['title'], r_output_attrs=['title'])
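
# A minimal follow-up sketch, not part of the original script: it assumes the
# block_tables call above runs eagerly and returns the candidate set C as a
# DataFrame, and it reuses timeBefore / memUsageBefore to report the cost of
# blocking. timeAfter and memUsageAfter are hypothetical names introduced here.
timeAfter = time.time()
memUsageAfter = psutil.virtual_memory().used / 1e9
print('Candidate set size: {0}'.format(len(C)))
print('Blocking time (s): {0}'.format(timeAfter - timeBefore))
print('Mem. usage after blocking: {0}'.format(memUsageAfter))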
import os

import dask
import pandas as pd

# AttrEquivalenceBlocker, BlackBoxBlocker, OverlapBlocker, RuleBasedBlocker and
# get_features_for_blocking are assumed to be imported from the dmagellan package.


def test_create_dag():
    datapath = "/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan/datasets"
    A = pd.read_csv(os.path.join(datapath, 'person_table_A.csv'), low_memory=False)
    B = pd.read_csv(os.path.join(datapath, 'person_table_B.csv'), low_memory=False)
    # A = pd.read_csv(os.path.join(datapath, 'tracks.csv'), low_memory=False)
    # B = pd.read_csv(os.path.join(datapath, 'songs.csv'), low_memory=False)
    print('Reading the files done')

    # Stage 1: attribute-equivalence blocking on birth_year (lazy, compute=False).
    ab = AttrEquivalenceBlocker()
    C = ab.block_tables(A, B, 'ID', 'ID', 'birth_year', 'birth_year',
                        ['name', 'address', 'zipcode'], ['name', 'address', 'zipcode'],
                        nltable_chunks=2, nrtable_chunks=2, compute=False,
                        scheduler=dask.get)

    # Stage 2: black-box blocking; drop a pair when the last names differ.
    def last_name_match(ltuple, rtuple):
        l_first_name, l_last_name = ltuple['name'].split()
        r_first_name, r_last_name = rtuple['name'].split()
        return l_last_name != r_last_name

    bb = BlackBoxBlocker()
    bb.set_black_box_function(last_name_match)
    bb.set_ltable_attrs(['name'])
    bb.set_rtable_attrs(['name'])
    D = bb.block_candset(C, A, B, 'l_ID', 'r_ID', "ID", "ID", nchunks=4,
                         compute=False, scheduler=dask.get)

    # Stage 3: overlap blocking; require at least one shared name token.
    ob = OverlapBlocker()
    E = ob.block_candset(D, A, B, "l_ID", "r_ID", "ID", "ID", 'name', 'name',
                         nchunks=4, overlap_size=1, compute=False)

    # Stage 4: rule-based blocking on name similarity.
    block_f = get_features_for_blocking(A, B)
    rb = RuleBasedBlocker()
    # Add rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
    _ = rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
    rb.set_table_attrs(['name'], ['name'])
    F = rb.block_candset(E, A, B, 'l_ID', 'r_ID', "ID", "ID", nchunks=4,
                         compute=False, scheduler=dask.get)
    return F
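
# A minimal driver sketch, not in the original file: it assumes that
# block_candset(..., compute=False) returns a lazy dask object, so the whole
# four-stage pipeline (attribute-equivalence -> black-box -> overlap -> rule-based)
# only runs when compute() is called on the final result. scheduler=dask.get
# mirrors the scheduler argument used inside test_create_dag above.
if __name__ == '__main__':
    F = test_create_dag()
    # F.visualize(filename='blocking_dag.png')  # optional: render the task graph (needs graphviz)
    result = F.compute(scheduler=dask.get)      # execute the DAG with dask's synchronous scheduler
    print(len(result))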
from collections import OrderedDict

import pandas as pd

# RuleBasedBlocker, get_features_for_blocking, get_sim_funs_for_blocking and
# get_tokenizers_for_blocking are assumed to be imported from the dmagellan package.
# A is assumed to have been read in earlier (e.g. the citeseer sample used in the
# other scripts in this repo).
B = pd.read_csv('../datasets/sample_dblp_100k.csv')
A.reset_index(inplace=True, drop=True)
B.reset_index(inplace=True, drop=True)

# Reorder both tables by title length so that chunks contain similarly sized strings.
s = A.title.str.len().sort_values().index
A1 = A.reindex(s)
A1 = A1.reset_index(drop=True)
s = B.title.str.len().sort_values().index
B1 = B.reindex(s)
B1 = B1.reset_index(drop=True)

rb = RuleBasedBlocker()
feature_table = get_features_for_blocking(A, B)
sim = get_sim_funs_for_blocking()
tok = get_tokenizers_for_blocking()
block_f = get_features_for_blocking(A1, B1)
# Block a pair when the Levenshtein distance between the titles exceeds 6.
_ = rb.add_rule(['title_title_lev_dist(ltuple, rtuple) > 6'], block_f)
rb.set_table_attrs(['title'], ['title'])

# Input tables and keyword arguments collected for the blocking call.
input_tables = OrderedDict()
input_tables['ltable'] = A1
input_tables['rtable'] = B1
input_args = OrderedDict()
input_args['l_key'] = 'id'
input_args['r_key'] = 'id'
input_args['compute'] = True
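
# A minimal sketch of how the prepared OrderedDicts might be consumed; this is
# not from the original file. It assumes rb.block_tables accepts l_key / r_key /
# compute as keyword arguments (as the keys in input_args suggest), and the
# chunk counts and output attributes here are illustrative values borrowed from
# the other scripts in this repo.
C = rb.block_tables(input_tables['ltable'], input_tables['rtable'],
                    nltable_chunks=4, nrtable_chunks=4,
                    l_output_attrs=['title'], r_output_attrs=['title'],
                    **input_args)
print(len(C))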