def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B, validate_inferred_attr_types=False)
    self.rb = em.RuleBasedBlocker()
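Example #2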
def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B)
    self.rb = em.RuleBasedBlocker()
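Both fixtures stop right after constructing the blocker. A typical next step, sketched here with an assumed feature name and threshold (not taken from the original tests), is to attach a rule and block the two tables:

# Sketch only: 'name_name_lev_sim' and the 0.8 cutoff are illustrative assumptions.
self.rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.8'], self.feature_table)
C = self.rb.block_tables(self.A, self.B, show_progress=False)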
Example #3
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
Example #4
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'record_id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'record_id')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                 rem_stop_words=True,
                                 l_output_attrs=['title', 'author', 'publisher', 'date'],
                                 r_output_attrs=['title', 'author', 'publisher', 'date'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
    except AssertionError:
        print("Dataset 'ebooks' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
Example #5
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                 l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                 r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.44'],
                         feature_table)
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
Example #6
# In[21]:

# Missing values per column in A (row count minus non-null count)
len(A.ID) - A.count()

# In[22]:

# Missing values per column in B
len(B.ID) - B.count()

# In[41]:

A

# In[79]:

block_f = em.get_features_for_blocking(A, B)

# In[80]:

block_f

# In[81]:

rb1 = em.RuleBasedBlocker()
rule1 = 'name_name_lev_sim(ltuple, rtuple) < 0.8'
rb1.add_rule([rule1], block_f)  # add_rule expects a list of conjuncts

rb2 = em.RuleBasedBlocker()
rule2 = 'address_address_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.9'
rb2.add_rule([rule2], block_f)
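A natural follow-up, sketched under the assumption that A and B are the two input tables, is to apply each rule blocker and union the surviving pairs:

# Sketch: keep pairs that survive either rule.
C1 = rb1.block_tables(A, B, show_progress=False)
C2 = rb2.block_tables(A, B, show_progress=False)
C = em.combine_blocker_outputs_via_union([C1, C2])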
Example #7
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'

A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# print(A.head())
feature_table = em.get_features_for_blocking(A, B)
print(feature_table.head())

sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), 
                            wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add feature to F
em.add_feature(feature_table, 'jac_ws_name_address', feature)
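Once added, the custom feature can be referenced by name in a blocking rule. A minimal sketch (the 0.5 threshold is an assumed value, not from the original):

# Sketch: drop pairs whose combined name+address Jaccard score is low.
rb = em.RuleBasedBlocker()
rb.add_rule(['jac_ws_name_address(ltuple, rtuple) < 0.5'], feature_table)
C = rb.block_tables(A, B, show_progress=False)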
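Example #8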
                     l_output_attrs=[
                         'Title', 'Genre', 'Score', 'Release Date', 'Rating',
                         'Directed By', 'Written By', 'Studio'
                     ],
                     r_output_attrs=[
                         'Title', 'Genre', 'Score', 'Release Date', 'Rating',
                         'Directed By', 'Written By', 'Studio'
                     ])

# Combine the outputs from attr. equivalence blocker and overlap blocker
# union, because even if the release date has an error, matching movies should still share their names
D = em.combine_blocker_outputs_via_union([C1, C2, C3])

# Rule based blocker after D
block_f = em.get_features_for_blocking(A,
                                       B,
                                       validate_inferred_attr_types=False)
rb = em.RuleBasedBlocker()
# print(block_f)
rb.add_rule(['Title_Title_lev_sim(ltuple, rtuple) < 0.4'], block_f)
C = rb.block_candset(D, show_progress=False)
print('Candidate Match set C Size: ', len(C))
print('Finish Blocking stage')

################################## Matcher Portion ##################################
# Open up our labeled data from the previous project.
path_G = '../data/G.csv'
G = em.read_csv_metadata(path_G,
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_ID',  # assumed foreign-key names,
                         fk_rtable='rtable_ID')  # following the other examples here
Example #9
                                  show_progress=True)

print(len(candidate_pairs))

candidate_pairs = ob.block_candset(candidate_pairs,
                                   'artist',
                                   'artist',
                                   word_level=True,
                                   overlap_size=1,
                                   show_progress=True)

print(len(candidate_pairs))

#em.to_csv_metadata(reduced_pairs,'C:/Users/Daniel/Documents/UW/838/Project/Stage3/data/pairs_after_ob_title_and_artist.csv')

# Feature table inferred automatically from the two tables
block_f = em.get_features_for_blocking(songs, tracks)

# Equivalent manual construction from the individual components:
# attribute correspondences, tokenizers, similarity functions, and
# attribute types for each table
block_c = em.get_attr_corres(songs, tracks)
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()

atypes1 = em.get_attr_types(songs)
atypes2 = em.get_attr_types(tracks)

block_f = em.get_features(songs, tracks, atypes1, atypes2, block_c, block_t,
                          block_s)

rb = em.RuleBasedBlocker()
rb.add_rule(["name_name_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.3"], block_f)

candidate_pairs = rb.block_candset(candidate_pairs, show_progress=True)
Example #10
                     'year',
                     l_output_attrs=['title', 'year'],
                     r_output_attrs=['title', 'year'])

C2 = ob.block_candset(C1,
                      'title',
                      'title',
                      word_level=True,
                      rem_stop_words=True,
                      overlap_size=1)
# C2 = ob.block_tables(sample_movies, sample_tracks, 'title', 'title', word_level=True, rem_stop_words=True,
#                      overlap_size=1, l_output_attrs=['title', 'year'],
#                      r_output_attrs=['title', 'year'],
#                      show_progress=False)

block_f = em.get_features_for_blocking(sample_movies, sample_tracks)

# rb is assumed to be an em.RuleBasedBlocker created in the elided code above
rb = em.RuleBasedBlocker()
# Both conjuncts must hold (they are ANDed) for a pair to be dropped
rb.add_rule([
    'title_title_cos_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.6',
    'title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.6'
], block_f)
C3 = rb.block_candset(C1, n_jobs=-1, show_progress=False)

# C3 = ab.block_candset(C1, l_block_attr='year', r_block_attr='year')

D = em.combine_blocker_outputs_via_union([C2, C3])
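One way to sanity-check the combined candidate set is py_entitymatching's blocker debugger; a sketch (output_size is an assumed value):

# Sketch: surface likely matches that blocking may have discarded.
dbg = em.debug_blocker(D, sample_movies, sample_tracks, output_size=200)
dbg.head()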

# C3 = C1
# Use block_tables to apply blocking over two input tables.

# corres = [('title', 'title'), ('year', 'year')]
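Example #11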
def workflow(path_A, path_B, path_labeled):

    # Load csv files as dataframes and set the key attribute in the dataframe
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Run attribute equivalence blocker on brand
    ab = em.AttrEquivalenceBlocker()
    C1 = ab.block_tables(A,
                         B,
                         'Brand',
                         'Brand',
                         l_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ],
                         r_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ])

    # Get features for rule based blocking
    block_f = em.get_features_for_blocking(A,
                                           B,
                                           validate_inferred_attr_types=False)

    # Run rule based blocker with rule for jaccard score on Clean Name column
    rb = em.RuleBasedBlocker()
    rb.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = rb.block_candset(C1)

    # Run black box blocker to compare screen size, ram, and hard drive capacity
    bb_screen = em.BlackBoxBlocker()
    bb_screen.set_black_box_function(screen_ram_hd_equal)
    C = bb_screen.block_candset(C2)

    # Load the labeled data
    L = em.read_csv_metadata(path_labeled,
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_ID',
                             fk_rtable='rtable_ID')

    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    feature_subset = feature_table.iloc[np.r_[4:10, 40:len(feature_table)], :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # Extract feature vectors
    feature_vectors_dev = em.extract_feature_vecs(L,
                                                  feature_table=feature_subset,
                                                  attrs_after='gold')

    # Impute feature vectors with the mean of the column values.
    feature_vectors_dev = em.impute_table(
        feature_vectors_dev,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    # Train using feature vectors from the labeled data
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                target_attr='gold')

    # Extract feature vectors for the rest of the data
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)

    # Impute feature vectors with the mean of the column values.
    feature_vectors = em.impute_table(
        feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')

    # Make predictions for the whole data set
    predictions = matcher.predict(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True,
        target_attr='predicted',
        inplace=False)
    predictions = predictions.loc[:, [
        '_id', 'ltable_ID', 'rtable_ID', 'predicted'
    ]]

    return predictions[predictions['predicted'] == 1]
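A hypothetical driver for this function (the file paths below are placeholders, not from the original):

# Hypothetical usage; replace the paths with real dataset locations.
matches = workflow('data/A.csv', 'data/B.csv', 'data/labeled.csv')
print('Predicted matches:', len(matches))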