# Notebook cell: inspect the feature table to confirm that the new feature
# was added to it.
feat_table

# In[32]:

# Notebook cell: run the newly added feature function on one tuple from A
# and one tuple from B.
# NOTE(review): `.ix` is deprecated in newer pandas releases; it is kept here
# because the index type of these tables is not visible from this script.
feat_table.ix[6, 'function'](A.ix[2], B.ix[3])

# In[33]:

# Notebook cell: extract feature vectors for the labeled candidate set.
# 'ltable.name' and 'rtable.name' are requested before the feature columns
# and 'gold_label' after them.
s_prime = mg.extract_feature_vecs(
    L,
    feature_table=feat_table,
    attrs_before=['ltable.name', 'rtable.name'],
    attrs_after=['gold_label'],
)

# In[34]:

# Notebook cell: display the feature vector table.
s_prime

# In[35]:

# Notebook cell: fitting/predicting -- create the set of matchers.
nb = mg.NBMatcher()  # naive Bayes
dt = mg.DTMatcher()  # decision tree
rf = mg.RFMatcher()  # random forest
"""Demo script: train a decision-tree matcher and open its visual debugger."""
import sys

# Extend sys.path *before* importing magellan: appending afterwards (as the
# script originally did) cannot influence how the imports below are resolved.
# Presumably this directory holds the magellan development checkout -- verify.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Columns that are not features: identifiers plus the gold label.
EXCLUDE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

# Load the two input tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on equal 'zipcode', carrying 'name' from both tables into the output.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Read the labeled pairs and turn them into feature vectors, keeping the
# 'gold' column after the features.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- TODO confirm.
S = mg.train_test_split(G, 8, 7)

# Train the decision tree, then append its predictions to the test split.
# A fresh list is passed on every call so that a callee which mutates its
# argument cannot affect the later calls.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted', append=True)

# Evaluate the 'predicted' column against 'gold' and launch the debugger.
d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)

# Parenthesized form prints the same text on Python 2 and is valid Python 3.
print("Hi")
"""Demo script: train a decision-tree matcher and open its visual debugger."""
import sys

# Extend sys.path *before* importing magellan: appending afterwards (as the
# script originally did) cannot influence how the imports below are resolved.
# Presumably this directory holds the magellan development checkout -- verify.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# Columns that are not features: identifiers plus the gold label.
EXCLUDE_ATTRS = ('_id', 'ltable.ID', 'rtable.ID', 'gold')

mg.init_jvm()

# Load the two input tables.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Block on equal 'zipcode', carrying 'name' from both tables into the output.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Read the labeled pairs and turn them into feature vectors, keeping the
# 'gold' column after the features.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# NOTE(review): 8 and 7 are presumably the train/test sizes -- TODO confirm.
S = mg.train_test_split(G, 8, 7)

# Train the decision tree, then append its predictions to the test split.
# A fresh list is passed on every call so that a callee which mutates its
# argument cannot affect the later calls.
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
           target_attr='predicted', append=True)

# Evaluate the 'predicted' column against 'gold' and launch the debugger.
d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=list(EXCLUDE_ATTRS),
             feat_table=feat_table)

# Parenthesized form prints the same text on Python 2 and is valid Python 3.
print("Hi")
"""Demo script: debug a decision-tree matcher on a single feature vector."""
# `os` is needed for os.sep below; the script previously used it without
# importing it, which raises NameError at the first path join.
import os

import magellan as mg
from magellan.debugmatcher.debug_gui_decisiontree_matcher import (
    _vis_debug_dt,
    vis_tuple_debug_dt_matcher,
)

# Columns that are not features: identifiers plus the label column.
EXCLUDE_ATTRS = ('_id', 'ltable_ID', 'rtable_ID', 'labels')

# Locate the candidate set shipped with the installed package.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Synthetic labels: seven 0s followed by eight 1s.
# Assumes C has exactly 15 rows -- the column assignment fails otherwise.
labels = [0] * 7 + [1] * 8
C['labels'] = labels

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(
    C, feature_table=feature_table, attrs_after='labels')

# A fresh list is passed on every call so that a callee which mutates its
# argument cannot affect the later call.
dt = mg.DTMatcher()
dt.fit(table=feature_vectors, exclude_attrs=list(EXCLUDE_ATTRS),
       target_attr='labels')

# `.loc` replaces the `.ix` indexer, which newer pandas no longer provides.
# Assumes feature_vectors keeps an integer index, where `.ix[0]` was a
# label lookup -- TODO confirm.
vis_tuple_debug_dt_matcher(dt, feature_vectors.loc[0],
                           exclude_attrs=list(EXCLUDE_ATTRS))
# In[31]:

# Notebook cell: inspect the feature table to confirm that the new feature
# was added to it.
feat_table

# In[32]:

# Notebook cell: run the newly added feature function on one tuple from A
# and one tuple from B.
# NOTE(review): `.ix` is deprecated in newer pandas releases; it is kept here
# because the index type of these tables is not visible from this script.
feat_table.ix[6, 'function'](A.ix[2], B.ix[3])

# In[33]:

# Notebook cell: extract feature vectors for the labeled candidate set.
# 'ltable.name' and 'rtable.name' are requested before the feature columns
# and 'gold_label' after them.
s_prime = mg.extract_feature_vecs(
    L,
    feature_table=feat_table,
    attrs_before=['ltable.name', 'rtable.name'],
    attrs_after=['gold_label'],
)

# In[34]:

# Notebook cell: display the feature vector table.
s_prime

# In[35]:

# Notebook cell: fitting/predicting -- create the set of matchers.
nb = mg.NBMatcher()  # naive Bayes
dt = mg.DTMatcher()  # decision tree
rf = mg.RFMatcher()  # random forest
# coding=utf-8
"""Demo script: debug a random-forest matcher on one pair of tuples."""
import os

import magellan as mg

# Columns that are not features: identifiers plus the label column.
# The same set is used for training and for debugging, so the label column
# is never handed to the matcher as a feature.
EXCLUDE_ATTRS = ('ltable_ID', 'rtable_ID', '_id', 'labels')

# Locate the candidate set shipped with the installed package.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

feature_table = mg.get_features_for_matching(A, B)

# Synthetic labels: seven 0s followed by eight 1s.
# Assumes C has exactly 15 rows -- the column assignment fails otherwise.
labels = [0] * 7 + [1] * 8
C['labels'] = labels

feature_vectors = mg.extract_feature_vecs(
    C, feature_table=feature_table, attrs_after='labels')

rf = mg.RFMatcher()

# Only the 'train' split is fitted; 'test' is kept for later inspection.
train_test = mg.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']

# A fresh list is passed on every call so that a callee which mutates its
# argument cannot affect the later call.
rf.fit(table=train, exclude_attrs=list(EXCLUDE_ATTRS), target_attr='labels')

# `.loc` replaces the `.ix` indexer, which newer pandas no longer provides.
# Assumes A and B keep an integer index, where `.ix[n]` was a label
# lookup -- TODO confirm.
mg.debug_randomforest_matcher(
    rf, A.loc[1], B.loc[2],
    feat_table=feature_table,
    fv_columns=feature_vectors.columns,
    exclude_attrs=list(EXCLUDE_ATTRS),
)

print('Hi')
# coding=utf-8
# Script: load the ACM/DBLP demo tables and their labeled pairs, then build
# feature vectors for the labeled candidate set.
import logging
import os

import magellan as mg

# Emit DEBUG-level log records for the steps below.
logging.basicConfig(level=logging.DEBUG)

# All three CSV files live in the same directory of the installed package.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets', 'matcherselector']
)
path_a, path_b, path_c = (
    os.sep.join([datasets_path, file_name])
    for file_name in (
        'ACM_demo.csv',
        'DBLP_demo.csv',
        'dblp_acm_demo_labels.csv',
    )
)

A = mg.read_csv_metadata(path_a, key='id')
B = mg.read_csv_metadata(path_b, key='id')

# NOTE(review): the labeled set uses B as the left table and A as the right
# table, while features below are generated for (A, B) -- confirm that this
# ordering is intended.
C = mg.read_csv_metadata(
    path_c,
    key='_id',
    ltable=B,
    rtable=A,
    fk_ltable='ltable.id',
    fk_rtable='rtable.id',
)

feature_table = mg.get_features_for_matching(A, B)

# Keep the 'gold' column after the generated feature columns.
feature_vectors = mg.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_after='gold',
    verbose=True,
)

# Matcher construction is currently disabled:
# dtmatcher = mg.DTMatcher()
# nbmatcher = mg.NBMatcher()
# rfmatcher = mg.RFMatcher()
# svmmatcher = mg.SVMMatcher()
# linregmatcher = mg.LinRegMatcher()
# logregmatcher = mg.LogRegMatcher()