def main():
    """Run the iterative top-K debug blocker on one CS784 dataset.

    Loads tableA/tableB plus the candidate set tableC for the selected
    dataset and writes results under its topK_results/ directory.
    (Commented-out alternatives in the original covered the magellan
    built-in tables, the books/books_full/products_test datasets, an
    AttrEquivalenceBlocker on 'pubYear', and the M_ganz CS784 dataset.)
    """
    dataset = 'S_hanli'
    base = '../datasets/CS784/' + dataset
    left = mg.read_csv(base + '/tableA.csv', key='id')
    right = mg.read_csv(base + '/tableB.csv', key='id')
    cand = mg.read_csv(base + '/tableC.csv', key='_id',
                       ltable=left, rtable=right)
    outdir = base + '/topK_results/'
    pred_table = iterative_topK_debug_blocker(left, right, cand, outdir)
def main():
    """Equivalence-block the built-in test tables on 'address' and print
    the debug-blocker output.

    (Commented-out alternatives in the original pointed at the
    books_test, products_test and small books datasets.)
    """
    base = './datasets/magellan_builtin_test'
    a_table = mg.read_csv(base + '/table_A.csv', key='ID')
    b_table = mg.read_csv(base + '/table_B.csv', key='ID')
    cand = mg.AttrEquivalenceBlocker().block_tables(
        a_table, b_table, 'address', 'address')
    result = debug_blocker(a_table, b_table, cand)
    print(result)
def load_dataset(filename, key=None):
    """Load a bundled dataset CSV from the package install path.

    Parameters
    ----------
    filename : str
        Base name of the CSV (without '.csv') under the installed
        'datasets' directory.
    key : str, optional
        Key column for the returned table.  The bundled 'table_A' /
        'table_B' samples default to 'ID'.

    Returns
    -------
    The table produced by read_csv.
    """
    p = get_install_path()
    p = os.sep.join([p, 'datasets', filename + '.csv'])
    # BUG FIX: the original condition was
    #   `if filename is 'table_A' or 'table_B':`
    # which (a) compares string *identity* with `is` and (b) is always
    # truthy because the bare 'table_B' operand is a non-empty string --
    # so EVERY dataset got key='ID'.  A membership test expresses the
    # clear intent.
    if filename in ('table_A', 'table_B'):
        key = 'ID'
    df = read_csv(p, key=key)
    return df
def main():
    """Run the iterative top-K debug blocker on the books_test pair with
    an empty candidate set.

    (Commented-out alternatives in the original covered the magellan
    built-in, books_full, products_test and books datasets, plus an
    AttrEquivalenceBlocker on 'pubYear'.)
    """
    books_dir = '../datasets/books_test'
    left = mg.read_csv(books_dir + '/bowker_final_custom_id.csv', key='id')
    right = mg.read_csv(books_dir + '/walmart_final_custom_id.csv', key='id')
    empty_cand = MTable()
    pred_table = iterative_topK_debug_blocker(left, right, empty_cand)
# Demo: train a decision-tree matcher on the bundled demo tables and
# open the visual debugger on its test-set predictions.
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')

# Equivalence-block on zipcode, carrying 'name' through from each side.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)

# Keep the id columns and the gold label out of the feature set.
excl = ['_id', 'ltable.ID', 'rtable.ID', 'gold']
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=excl, target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=excl,
           target_attr='predicted', append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=excl, feat_table=feat_table)
print("Hi")
# Exercise the magellan blocking and feature-utility APIs on the toy
# tables, then dump the built-in tokenizers / similarity functions /
# attribute correspondences / inferred attribute types.
import pandas as pd
import magellan as mg
import sys

sys.path.append('C:\Pradap\Research\Python-work\Saranam\magellan')

A = mg.read_csv('../magellan/data/toy/table_A.csv', key='ID')
B = mg.read_csv('../magellan/data/toy/table_B.csv', key='ID')

# Block on zipcode, carrying name/address/hourly_wage through.
carry = ['name', 'address', 'hourly_wage']
blocker = mg.AttrEquivalenceBlocker()
C = blocker.block_tables(A, B, 'zipcode', 'zipcode', carry, carry)
D = mg.block_union_combine([C, C])
S = mg.sample_one_table(D, 10)
# Labeling step (mg.label(S, 'gold_label')) left disabled, as in the
# original.

t = mg.get_single_arg_tokenizers()
print(t)
s = mg.get_sim_funs()
print(s)
corres = mg.get_attr_corres(A, B)
print(corres['corres'])
t_1 = mg.get_attr_types(A)
print(t_1)
t_2 = mg.get_attr_types(B)
# Train and evaluate a DecisionTree matcher on the bundled demo tables,
# then launch the decision-tree visual debugger on the test split.
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)

feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)

dt = mg.DTMatcher(name='DecisionTree')
# ids + label column must not be used as features
id_and_label = ['_id', 'ltable.ID', 'rtable.ID', 'gold']
dt.fit(table=S['train'], exclude_attrs=id_and_label, target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=id_and_label,
           target_attr='predicted', append=True)

d = mg.eval_matches(S['test'], 'gold', 'predicted')
vis_debug_dt(dt, d, S['test'], exclude_attrs=id_and_label,
             feat_table=feat_table)
print("Hi")
# Set up matcher + positive-trigger cross-validation on the
# walmart/bowker books data: read labeled pairs, build feature vectors
# from a hand-picked feature subset, and define a MatchTrigger that
# forces a positive prediction when the author similarity is exactly 1.
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

mg.init_jvm()

# Force id-like columns to strings so mixed ISBN/page values are not
# half-parsed as numbers.
# FIX: the original used `pd.np.str`, which relies on aliases that were
# removed (pandas dropped `pd.np` in 2.0; numpy dropped `np.str` in
# 1.24).  The builtin `str` is the exact equivalent dtype.
str_cols = {'isbn': str, 'pages': str, 'volume': str, 'editionNum': str}
# Read walmart books data
wal = mg.read_csv(mg.get_install_path()+'/datasets/books/walmart.csv',
                  dtype=str_cols, low_memory=False, key='id')
# Read bowker books data
bwk = mg.read_csv(mg.get_install_path()+'/datasets/books/bowker.csv',
                  dtype=str_cols, low_memory=False, key='id')

L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
feature_table = mg.get_features_for_matching(wal, bwk)
# FIX: DataFrame.ix was removed in pandas 1.0; .loc selects the same
# rows here (assumes the feature table keeps its default integer
# index -- TODO confirm against mg.get_features_for_matching).
f = feature_table.loc[[3, 7, 18, 26, 53]]
m = mg.DTMatcher()
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(G,
                    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                    strategy='most_frequent')

# Trigger: predict a match outright whenever the author Levenshtein
# similarity equals 1.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule('author_author_lev(ltuple, rtuple) == 1',
                          feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
# Cross-validate a linear-regression matcher (with no trigger) on
# labeled demo pairs, then run select_matcher on the same table with an
# f1 metric.
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
mg.init_jvm()

# Labels were produced once via mg.label_table(C, 'gold') and saved to
# label.csv (those lines are left disabled); reload them against the
# same base tables.
L = mg.read_csv('label.csv', ltable=A, rtable=B)
feature_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feature_table,
                            attrs_after='gold')

non_feature_attrs = ['_id', 'ltable.ID', 'rtable.ID', 'gold']
m = mg.LinRegMatcher()
t = cv_matcher_and_trigger(m, None, table=G,
                           exclude_attrs=non_feature_attrs,
                           target_attr='gold', k=5,
                           metric='precision', random_state=0)
res = mg.select_matcher([m], table=G,
                        exclude_attrs=non_feature_attrs,
                        target_attr='gold', k=5,
                        metric='f1', random_state=0)
# Fit a decision tree on labeled pairs of the bundled demo tables and
# score the same table (train == predict here; debugging scaffold for
# vis_tuple_debug_dt_matcher, left disabled below).
import magellan as mg
import pandas as pd

mg.init_jvm()
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
mg.init_jvm()  # not reqd -- init_jvm already ran above

ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    ['name', 'address'], ['name', 'address'])
L = mg.read_csv('label_ab_correct_labels.csv', ltable=A, rtable=B)

feat_table = mg.get_features_for_matching(A, B)
# FIX: DataFrame.ix was removed in pandas 1.0; .loc picks the same rows
# (assumes feat_table keeps its default integer index -- TODO confirm).
f = feat_table.loc[[9, 10, 17]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

dt = mg.DTMatcher()
dt.fit(table=G,
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
t = dt.predict(table=G,
               exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
               append=True, inplace=False, target_attr='predicted')
# ret_val, node_list = mg.vis_tuple_debug_dt_matcher(dt, G.ix[0],
#     exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], ensemble_flag=False)
# print ret_val
# print node_list
# Block the walmart/bowker books tables on ISBN, build feature vectors
# from a hand-picked subset of generated features, and instantiate DT
# and SVM matchers.
import sys
sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique/')
import magellan as mg
import pandas as pd

mg.init_jvm()

# Read id-like columns as plain strings so mixed values survive intact.
# FIX: the original used `pd.np.str`, which relies on removed aliases
# (pandas dropped `pd.np` in 2.0; numpy dropped `np.str` in 1.24).
# The builtin `str` is the exact equivalent dtype.
str_cols = {'isbn': str, 'pages': str, 'volume': str, 'editionNum': str}
wal = mg.read_csv(mg.get_install_path() + '/datasets/books/walmart.csv',
                  dtype=str_cols, low_memory=False, key='id')
bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype=str_cols, low_memory=False, key='id')

ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(wal, bwk, 'isbn', 'isbn',
                    ['title', 'author'], ['title', 'author'])
L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
print(len(L))

feat_table = mg.get_features_for_matching(wal, bwk)
# FIX: DataFrame.ix was removed in pandas 1.0; .loc selects the same
# rows (assumes default integer index -- TODO confirm).
f = feat_table.loc[[3, 7, 18, 26, 53]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

dt = mg.DTMatcher()
svm = mg.SVMMatcher()
# Fit a decision tree on labeled pairs of the bundled demo tables and
# score the same table (near-duplicate of the scaffold above; the
# vis_tuple_debug_dt_matcher call is left disabled).
import magellan as mg
import pandas as pd

mg.init_jvm()
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
mg.init_jvm()  # not reqd -- init_jvm already ran above

ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                    ['name', 'address'], ['name', 'address'])
L = mg.read_csv('label_ab_correct_labels.csv', ltable=A, rtable=B)

feat_table = mg.get_features_for_matching(A, B)
# FIX: DataFrame.ix was removed in pandas 1.0; .loc picks the same rows
# (assumes feat_table keeps its default integer index -- TODO confirm).
f = feat_table.loc[[9, 10, 17]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

dt = mg.DTMatcher()
dt.fit(table=G,
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
t = dt.predict(table=G,
               exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
               append=True, inplace=False, target_attr='predicted')
# ret_val, node_list = mg.vis_tuple_debug_dt_matcher(dt, G.ix[0],
#     exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], ensemble_flag=False)
# print ret_val