def setup(self):
    """Load the bikes A/B tables, block them into a candidate set on
    city and model year, and register the benchmark's black-box
    predicate.

    Fixes: removed the unused local `p = mg.get_install_path()` (the
    module-level `datasets_path` already carries the install path) and
    collapsed the two identical output-attribute lists into one.
    """
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    # Both sides of the candidate set are projected onto the same columns.
    output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                    'color', 'model_year']
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'id')
        # Block on city first, then refine the candidate set on model year.
        C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                            output_attrs, output_attrs)
        self.D = ab.block_candset(C, 'model_year', 'model_year')
        bb.set_black_box_function(_bikes_function)
    except AssertionError:
        # read_csv_metadata / set_key assert when the dataset is absent.
        print("Dataset \'bikes\' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def _get_stop_words():
    """Return the entries of the bundled stop-words file as a set.

    The file is opened in binary mode, so the set holds bytes objects
    with trailing whitespace stripped from each line.
    """
    install_path = em.get_install_path()
    stop_words_path = os.sep.join([install_path, 'utils', 'stop_words.txt'])
    with open(stop_words_path, "rb") as handle:
        return {line.rstrip() for line in handle}
def setup(self):
    """Load the electronics A/B tables and register the benchmark's
    black-box predicate.

    Fixes: the read of table A previously happened OUTSIDE the try
    block, so a missing dataset raised an unhandled AssertionError
    instead of printing the download hint (every sibling fixture reads
    both tables inside the try). Also removed the unused local
    `p = mg.get_install_path()`.
    """
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    try:
        # Read both tables inside the try so a missing dataset is
        # reported with the friendly message below.
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.l_output_attrs = ['Brand', 'Amazon_Price']
        self.r_output_attrs = ['Brand', 'Price']
        bb.set_black_box_function(_electronics_function)
    except AssertionError:
        print("Dataset \'electronics\' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit
def setup(self):
    """Load the bikes A/B tables and record the blocking configuration.

    Fixes: removed the unused local `p = mg.get_install_path()`
    (`datasets_path` is already resolved at module level) and built the
    two identical output-attribute lists from one template.
    """
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'id')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'id')
    except AssertionError:
        # read_csv_metadata / set_key assert when the dataset is absent.
        print("Dataset \'bikes\' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'
    # Same projection on both tables; copy so the lists stay independent.
    attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
             'color', 'model_year']
    self.l_output_attrs = list(attrs)
    self.r_output_attrs = list(attrs)
def setup(self):
    """Load the bikes A/B tables and record the blocking configuration.

    Fixes: dropped the unused local `p = mg.get_install_path()` and
    deduplicated the identical left/right output-attribute lists.
    """
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'id')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'id')
    except AssertionError:
        # The reader asserts when the dataset files are missing.
        print("Dataset \'bikes\' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'
    # Identical projection for both tables; copies keep them un-aliased.
    output_attrs = [
        'bike_name', 'city_posted', 'km_driven', 'price', 'color',
        'model_year'
    ]
    self.l_output_attrs = list(output_attrs)
    self.r_output_attrs = list(output_attrs)
# Shared fixtures for attribute-equivalence blocker tests: paths to the
# small A/B test tables plus blocking/output attribute names and the
# candidate-set id pairs each blocking configuration is expected to yield.
import os
from nose.tools import *
import pandas as pd
import unittest
import py_entitymatching as em

# Locate the test datasets shipped with the py_entitymatching install.
p = em.get_install_path()
path_a = os.sep.join([p, 'tests', 'test_datasets', 'A.csv'])
path_b = os.sep.join([p, 'tests', 'test_datasets', 'B.csv'])

# Blocking attributes (same column names exist on both tables).
l_block_attr_1 = 'zipcode'
l_block_attr_2 = 'birth_year'
l_block_attr_3 = 'name'
r_block_attr_1 = 'zipcode'
r_block_attr_2 = 'birth_year'
r_block_attr_3 = 'name'

# Columns projected into the blocker output, and the column prefixes.
l_output_attrs = ['zipcode', 'birth_year']
r_output_attrs = ['zipcode', 'birth_year']
l_output_prefix = 'l_'
r_output_prefix = 'r_'

# attribute equivalence on [l|r]_block_attr_1
expected_ids_1 = [('a1', 'b1'), ('a1', 'b2'), ('a1', 'b6'), ('a2', 'b3'),
                  ('a2', 'b4'), ('a2', 'b5'), ('a3', 'b1'), ('a3', 'b2'),
                  ('a3', 'b6'), ('a4', 'b3'), ('a4', 'b4'), ('a4', 'b5'),
                  ('a5', 'b3'), ('a5', 'b4'), ('a5', 'b5')]

# attribute equivalence on [l|r]_block_attr_1 \intersection [l|r]_block_attr_2
expected_ids_2 = [('a2', 'b3'), ('a3', 'b2'), ('a5', 'b5')]
# Shared fixtures for overlap blocker tests: paths to the small A/B test
# tables plus overlap attribute names and the candidate-set id pairs each
# overlap configuration is expected to yield.
import os
from nose.tools import *
import pandas as pd
import unittest
import py_entitymatching as em

# Locate the test datasets shipped with the py_entitymatching install.
p = em.get_install_path()
path_a = os.sep.join([p, 'tests', 'test_datasets', 'A.csv'])
path_b = os.sep.join([p, 'tests', 'test_datasets', 'B.csv'])

# Overlap attributes (same column names exist on both tables).
l_overlap_attr_1 = 'name'
l_overlap_attr_2 = 'address'
r_overlap_attr_1 = 'name'
r_overlap_attr_2 = 'address'

# Columns projected into the blocker output, and the column prefixes.
l_output_attrs = ['name', 'address']
r_output_attrs = ['name', 'address']
l_output_prefix = 'l_'
r_output_prefix = 'r_'

# overlap on [r,l]_overlap_attr_1 with overlap_size=1
expected_ids_1 = [('a2', 'b3'), ('a2', 'b6'), ('a3', 'b2'), ('a5', 'b5')]

# overlap on [r,l]_overlap_attr_2 with overlap_size=4
expected_ids_2 = [('a2', 'b3'), ('a3', 'b2')]

# overlap on birth_year with q_val=3, overlap_size=2 (no padding) =6 (padding)
expected_ids_3 = [('a2', 'b3'), ('a3', 'b2'), ('a4', 'b1'), ('a4', 'b6'),
                  ('a5', 'b5')]

# block tables on [l|r]_overlap_attr_2, block candset on [l|r]overlap_attr_3
expected_ids_2_and_3 = [('a2', 'b3'), ('a3', 'b2')]
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
import os
import sys

import py_entitymatching as mg

p = mg.get_install_path()
datasets_path = os.sep.join([p, 'datasets', 'example_datasets'])
ab = mg.AttrEquivalenceBlocker()


class TimeBlockTablesAnime:
    """asv benchmark fixture for attribute-equivalence blocking on the
    anime example dataset."""

    def setup(self):
        """Read both anime tables, set their keys, and record the
        blocking configuration used by the timed benchmarks."""
        left_csv = os.sep.join([datasets_path, 'anime', 'A.csv'])
        right_csv = os.sep.join([datasets_path, 'anime', 'B.csv'])
        self.l_block_attr = 'Year'
        self.r_block_attr = 'Year'
        self.l_output_attrs = ['Title', 'Year', 'Episodes']
        self.r_output_attrs = ['Title', 'Year', 'Episodes']
        try:
            self.A = mg.read_csv_metadata(left_csv)
            mg.set_key(self.A, 'ID')
            self.B = mg.read_csv_metadata(right_csv)
            mg.set_key(self.B, 'ID')
        except AssertionError:
            # The reader asserts when the dataset files are missing.
            print("Dataset \'anime\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit
import os

# BUGFIX: `em` (py_entitymatching) is used throughout this script
# (em.get_install_path, em.read_csv_metadata) but was never imported,
# so the very first em.* call raised NameError.
import py_entitymatching as em
from dask import get
from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching
from dmagellan.matcher.dtmatcher import DTMatcher
from dmagellan.matcher.svmmatcher import SVMMatcher
from dmagellan.matcher.rfmatcher import RFMatcher
from dmagellan.matcher.logregmatcher import LogRegMatcher
from dmagellan.matcher.nbmatcher import NBMatcher
from dmagellan.matcher.linregmatcher import LinRegMatcher
from dmagellan.mlmatcherselection.mlmatcherselection import select_matcher

# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'
path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data, key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

# Split S into I and J
#!/bin/python
# Invoke this script from /root
import sys
sys.path.append('/magellan/py_entitymatching/py_entitymatching/')
import py_entitymatching as em
import pandas as pd
import os

# Paths to the Fodors/Zagats restaurant tables bundled with the package.
_restaurants_dir = (em.get_install_path() + os.sep + 'datasets' + os.sep
                    + 'end-to-end' + os.sep)
path_A = _restaurants_dir + 'restaurants/fodors.csv'
path_B = _restaurants_dir + 'restaurants/zagats.csv'

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

print(f'Number of tuples in A: {len(A)}')
print(f'Number of tuples in B: {len(B)}')
print(f'Number of tuples in A X B (i.e the cartesian product): {len(A) * len(B)}')

# Block on at least one shared token in the restaurant name.
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'name', 'name',
                    l_output_attrs=['name', 'addr', 'city', 'phone'],
                    r_output_attrs=['name', 'addr', 'city', 'phone'],
                    overlap_size=1, show_progress=False)
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
import os
import sys

import py_entitymatching as mg

p = mg.get_install_path()
datasets_path = os.sep.join([p, 'datasets', 'example_datasets'])
snb = mg.SortedNeighborhoodBlocker()


class TimeBlockTablesAnime:
    """asv benchmark fixture for sorted-neighborhood blocking on the
    anime example dataset."""

    def setup(self):
        """Read both anime tables, set their keys, and record the
        blocking configuration used by the timed benchmarks."""
        table_a_path = os.sep.join([datasets_path, 'anime', 'A.csv'])
        table_b_path = os.sep.join([datasets_path, 'anime', 'B.csv'])
        self.l_block_attr = 'Year'
        self.r_block_attr = 'Year'
        self.l_output_attrs = ['Title', 'Year', 'Episodes']
        self.r_output_attrs = ['Title', 'Year', 'Episodes']
        try:
            self.A = mg.read_csv_metadata(table_a_path)
            mg.set_key(self.A, 'ID')
            self.B = mg.read_csv_metadata(table_b_path)
            mg.set_key(self.B, 'ID')
        except AssertionError:
            # The reader asserts when the dataset files are missing.
            print("Dataset \'anime\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit