def setup(self):
    """Load the bikes example tables and pre-compute a blocked table.

    Reads ``A.csv`` and ``B.csv`` from ``datasets/example_datasets/bikes``
    under the directory returned by ``mg.get_install_path()``, sets ``'id'``
    as the key of each table, and blocks the two tables on ``'city_posted'``
    with an ``AttrEquivalenceBlocker``.

    Attributes set on ``self``:
        A, B: the tables returned by ``mg.read_csv_metadata``.
        ab: the ``AttrEquivalenceBlocker`` instance.
        C: the value returned by ``ab.block_tables`` for A and B.
        l_block_attr, r_block_attr: both ``'model_year'``.
    """
    # os.path.join (rather than os.sep.join) avoids doubled separators when
    # the install path already ends with one.
    bikes_dir = os.path.join(
        mg.get_install_path(), 'datasets', 'example_datasets', 'bikes')
    path_for_A = os.path.join(bikes_dir, 'A.csv')
    path_for_B = os.path.join(bikes_dir, 'B.csv')

    key_attr = 'id'
    self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(self.A, key_attr)
    self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(self.B, key_attr)

    city_attr = 'city_posted'
    output_attrs = (
        'bike_name', 'city_posted', 'km_driven',
        'price', 'color', 'model_year',
    )

    self.ab = mg.AttrEquivalenceBlocker()
    # Each side gets its own list object, as in the original two literals.
    self.C = self.ab.block_tables(self.A, self.B,
                                  city_attr, city_attr,
                                  list(output_attrs), list(output_attrs),
                                  verbose=False)

    self.l_block_attr = 'model_year'
    self.r_block_attr = 'model_year'
# Demo script: block two tables, train a decision-tree matcher on labeled
# pairs, evaluate its predictions, and pass the result to vis_debug_dt.
import sys

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): this machine-specific path is appended only after magellan
# has already been imported, so it cannot affect that import. Confirm whether
# it is still needed.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block the two tables on equal 'zipcode', keeping 'name' from each side.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Labeled pairs and the feature vectors derived from them.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)

# Columns excluded from the feature set in every matcher call below; each
# call receives its own list copy, as with the original separate literals.
EXCLUDE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted', append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)

# Parenthesized form prints the same text on Python 2 and Python 3; the bare
# print statement is a syntax error on Python 3.
print("Hi")
def setUp(self):
    """Prepare the fixture: keyed input tables and a blocker.

    Reads the tables at ``path_for_A`` and ``path_for_B`` with
    ``mg.read_csv_metadata``, stores them as ``self.A`` and ``self.B``,
    sets ``l_key`` / ``r_key`` as their respective keys, and stores a new
    ``AttrEquivalenceBlocker`` as ``self.ab``.
    """
    # Left table: read, store, then register its key.
    left_table = self.A = mg.read_csv_metadata(path_for_A)
    mg.set_key(left_table, l_key)

    # Right table: same steps with the right-hand path and key.
    right_table = self.B = mg.read_csv_metadata(path_for_B)
    mg.set_key(right_table, r_key)

    self.ab = mg.AttrEquivalenceBlocker()