示例#1
0
from distributed import Client
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle

# Register a dask ProgressBar so subsequent dask computations report progress.
pbar = ProgressBar()
pbar.register()

# Baseline memory reading taken before either CSV is loaded
# (psutil's `used` value divided by 1e9).
print("Mem. usage before reading:{0}".format(psutil.virtual_memory().used /
                                             1e9))
# Alternative 200k-row samples, kept for switching the benchmark size:
#A = pd.read_csv('../datasets/sample_citeseer_200k.csv')
#B = pd.read_csv('../datasets/sample_dblp_200k.csv')
A = pd.read_csv('../datasets/sample_citeseer_100k.csv')
B = pd.read_csv('../datasets/sample_dblp_100k.csv')
print(len(A), len(B))

# Build the blocking feature table for A/B and attach a single rule on the
# 'title' attribute of both tables.
block_f = get_features_for_blocking(A, B)
rb = RuleBasedBlocker()
# Alternative rule using the `dlm_dc0` feature variant instead of `qgm_3`
# (presumably delimiter vs. 3-gram tokenization -- confirm against the
# generated feature names):
#_ = rb.add_rule(['title_title_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.8'], block_f)
_ = rb.add_rule(['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.8'], block_f)
rb.set_table_attrs(['title'], ['title'])

# Snapshot memory and wall-clock time right before the blocking call so the
# cost of blocking can be measured against them afterwards.
memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
# The tables are already loaded at this point, so this reading is the
# pre-blocking one; the label used to repeat "before reading", which made the
# two memory lines in the output indistinguishable.
print("Mem. usage before blocking:{0}".format(memUsageBefore))
C = rb.block_tables(A,
                    B,
                    'id',
                    'id',
                    nltable_chunks=2,
                    nrtable_chunks=2,
                    l_output_attrs=['title'],
                    r_output_attrs=['title'],
示例#2
0
def test_create_dag(datapath=None):
    """Chain four blockers over two person tables without computing them.

    The stages are, in order: an attribute-equivalence blocker on
    ``birth_year``, a black-box blocker comparing last names, an overlap
    blocker on ``name`` and a rule-based blocker on ``name``. Every stage is
    invoked with ``compute=False`` and the output of one stage is fed to the
    next as its candidate set.

    Args:
        datapath: Directory containing ``person_table_A.csv`` and
            ``person_table_B.csv``. Defaults to the original developer
            checkout path when omitted, so existing zero-argument callers
            behave as before.

    Returns:
        Whatever ``RuleBasedBlocker.block_candset`` returns for
        ``compute=False`` (presumably a lazy, not-yet-executed result --
        confirm against the blocker implementation).
    """
    if datapath is None:
        datapath = "/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan/datasets"
    A = pd.read_csv(os.path.join(datapath, 'person_table_A.csv'),
                    low_memory=False)
    B = pd.read_csv(os.path.join(datapath, 'person_table_B.csv'),
                    low_memory=False)

    # Alternative input tables:
    # A = pd.read_csv(os.path.join(datapath, 'tracks.csv'), low_memory=False)
    # B = pd.read_csv(os.path.join(datapath, 'songs.csv'), low_memory=False)

    print('Reading the files done')

    # Stage 1: pair up tuples on 'birth_year', carrying name/address/zipcode
    # from both sides into the output.
    ab = AttrEquivalenceBlocker()
    C = ab.block_tables(A,
                        B,
                        'ID',
                        'ID',
                        'birth_year',
                        'birth_year', ['name', 'address', 'zipcode'],
                        ['name', 'address', 'zipcode'],
                        nltable_chunks=2,
                        nrtable_chunks=2,
                        compute=False,
                        scheduler=dask.get)

    def last_name_match(ltuple, rtuple):
        """Return True when the last name tokens of the two tuples differ.

        The last whitespace-separated token of ``name`` is treated as the
        last name, so names with middle names or a single token no longer
        raise ``ValueError`` (the previous two-variable unpacking required
        exactly two tokens). An empty name yields an empty last name.
        """
        l_tokens = ltuple['name'].split()
        r_tokens = rtuple['name'].split()
        l_last_name = l_tokens[-1] if l_tokens else ''
        r_last_name = r_tokens[-1] if r_tokens else ''
        return l_last_name != r_last_name

    # Stage 2: apply the black-box function to the candidate set C.
    bb = BlackBoxBlocker()
    bb.set_black_box_function(last_name_match)
    bb.set_ltable_attrs(['name'])
    bb.set_rtable_attrs(['name'])
    D = bb.block_candset(C,
                         A,
                         B,
                         'l_ID',
                         'r_ID',
                         "ID",
                         "ID",
                         nchunks=4,
                         compute=False,
                         scheduler=dask.get)

    # Stage 3: overlap blocker on 'name' with overlap_size=1.
    # NOTE(review): unlike the other stages, no scheduler is passed here --
    # confirm that relying on the default is intended.
    ob = OverlapBlocker()
    E = ob.block_candset(D,
                         A,
                         B,
                         "l_ID",
                         "r_ID",
                         "ID",
                         "ID",
                         'name',
                         'name',
                         nchunks=4,
                         overlap_size=1,
                         compute=False)

    # Stage 4: rule-based blocker over the remaining candidates.
    block_f = get_features_for_blocking(A, B)
    rb = RuleBasedBlocker()
    # Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
    _ = rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
    rb.set_table_attrs(['name'], ['name'])

    F = rb.block_candset(E,
                         A,
                         B,
                         'l_ID',
                         'r_ID',
                         "ID",
                         "ID",
                         nchunks=4,
                         compute=False,
                         scheduler=dask.get)

    return F
示例#3
0
# Load the right-hand table (A is expected to be loaded already).
B = pd.read_csv('../datasets/sample_dblp_100k.csv')

# Replace the index of both tables, in place, with a fresh default one.
A.reset_index(drop=True, inplace=True)
B.reset_index(drop=True, inplace=True)


# A1 / B1: copies of A / B with rows ordered by the length of 'title',
# re-indexed from zero after the reordering.
s = A.title.str.len().sort_values().index
A1 = A.reindex(s).reset_index(drop=True)

s = B.title.str.len().sort_values().index
B1 = B.reindex(s).reset_index(drop=True)

# Blocker plus the feature table, similarity functions and tokenizers.
rb = RuleBasedBlocker()
feature_table = get_features_for_blocking(A, B)
sim = get_sim_funs_for_blocking()
tok = get_tokenizers_for_blocking()

# The rule itself is built from the features of the length-sorted tables.
block_f = get_features_for_blocking(A1, B1)
_ = rb.add_rule(['title_title_lev_dist(ltuple, rtuple) > 6'], block_f)

rb.set_table_attrs(['title'], ['title'])

# Ordered mappings describing the input tables and the call arguments.
input_tables = OrderedDict([
    ('ltable', A1),
    ('rtable', B1),
])

input_args = OrderedDict([
    ('l_key', 'id'),
    ('r_key', 'id'),
    ('compute', True),
])