def setup(self):
    """Prepare the bikes example tables and a blocked candidate set.

    Reads ``A.csv`` and ``B.csv`` from the installed
    ``datasets/example_datasets/bikes`` directory, sets ``'id'`` as the key
    of each table, and blocks the two tables on ``'city_posted'`` with an
    ``AttrEquivalenceBlocker``.

    Attributes set on ``self``:
        A, B: the two tables that were read.
        ab: the ``AttrEquivalenceBlocker`` instance.
        C: the result of ``ab.block_tables`` on ``A`` and ``B``.
        l_block_attr, r_block_attr: both ``'model_year'``.
    """
    bikes_dir = os.sep.join(
        [mg.get_install_path(), 'datasets', 'example_datasets', 'bikes'])

    self.A = mg.read_csv_metadata(os.sep.join([bikes_dir, 'A.csv']))
    mg.set_key(self.A, 'id')
    self.B = mg.read_csv_metadata(os.sep.join([bikes_dir, 'B.csv']))
    mg.set_key(self.B, 'id')

    # The same columns are carried over from both input tables; each side
    # still receives its own list object, as before.
    carried_columns = ('bike_name', 'city_posted', 'km_driven',
                       'price', 'color', 'model_year')

    self.ab = mg.AttrEquivalenceBlocker()
    self.C = self.ab.block_tables(
        self.A, self.B,
        'city_posted', 'city_posted',
        list(carried_columns), list(carried_columns),
        verbose=False)

    # Attribute pair stored for later use (not consumed in this method).
    self.l_block_attr = 'model_year'
    self.r_block_attr = 'model_year'
def setup(self):
    """Load the bikes example tables and record the blocking configuration.

    Reads ``A.csv`` and ``B.csv`` from the installed
    ``datasets/example_datasets/bikes`` directory and sets ``'id'`` as the
    key of each table. No blocking is performed here; the blocking
    attributes, output attributes and an ``AttrEquivalenceBlocker`` are only
    stored on ``self``.
    """
    install_root = mg.get_install_path()

    def bikes_file(file_name):
        # Full path of one file inside the bikes example dataset directory.
        return os.sep.join(
            [install_root, 'datasets', 'example_datasets', 'bikes', file_name])

    self.A = mg.read_csv_metadata(bikes_file('A.csv'))
    mg.set_key(self.A, 'id')
    self.B = mg.read_csv_metadata(bikes_file('B.csv'))
    mg.set_key(self.B, 'id')

    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'

    # Left and right output attributes are equal but kept as separate lists.
    shared_output_attrs = ('bike_name', 'city_posted', 'km_driven',
                           'price', 'color', 'model_year')
    self.l_output_attrs = list(shared_output_attrs)
    self.r_output_attrs = list(shared_output_attrs)

    self.ab = mg.AttrEquivalenceBlocker()
import sys
import os
import magellan as mg
import jpype

# Locations of the two sample tables inside the installed package.
p = mg.get_install_path()
path_for_A = os.sep.join([p, 'datasets', 'table_A.csv'])
path_for_B = os.sep.join([p, 'datasets', 'table_B.csv'])

# Start the JVM. Candidate library paths are tried in this order:
#   1. the default path reported by jpype,
#   2. the same path with the 'client' and 'server' components swapped,
#   3. a path typed in by the user.
jvm_path = jpype.get_default_jvm_path()
if os.path.isfile(jvm_path):
    mg.init_jvm(jvm_path)
else:
    # Swap in BOTH directions. The earlier code assigned the 'server' ->
    # 'client' replacement to an unrelated name, so that direction was
    # silently skipped and the unchanged path was tested again.
    flipped = {'client': 'server', 'server': 'client'}
    x = [flipped.get(t, t) for t in jvm_path.split(os.sep)]
    jp = os.sep.join(x)
    if os.path.isfile(jp):
        mg.init_jvm(jp)
    else:
        jp = raw_input('Give path to jvm library (i.e libjvm.so in linux) : ')
        if os.path.isfile(jp):
            mg.init_jvm(jp)
        else:
            # Single-argument call form prints the same text on Python 2 and 3.
            print('Invalid path; cannot run tests; exiting')
# coding=utf-8
import os

import magellan as mg
from magellan.debugmatcher.debug_gui_decisiontree_matcher import (
    _vis_debug_dt,
    vis_tuple_debug_dt_matcher,
)

# Inputs: the two bundled tables plus the candidate set C.csv from the
# installed test datasets.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])

A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Label column: seven 0s followed by eight 1s (15 values in total).
labels = [0] * 7 + [1] * 8
C['labels'] = labels

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(
    C, feature_table=feature_table, attrs_after='labels')

# Columns that are excluded both when fitting and when debugging.
non_feature_columns = ('_id', 'ltable_ID', 'rtable_ID', 'labels')

dt = mg.DTMatcher()
dt.fit(table=feature_vectors,
       exclude_attrs=list(non_feature_columns),
       target_attr='labels')

# Debug the fitted tree on the first feature vector.
# NOTE(review): `.ix` is deprecated/removed in newer pandas; confirm the
# index type before moving to `.loc` / `.iloc`.
vis_tuple_debug_dt_matcher(dt, feature_vectors.ix[0],
                           exclude_attrs=list(non_feature_columns))

# feature_table = mg.get_features_for_matching(A, B)
#
# labels = [0]*7
# labels.extend([1]*8)
import os

import magellan as mg
from magellan.utils.generic_helper import del_files_in_dir

# Build the path of the 'sandbox' directory under the installed test
# datasets and hand it to del_files_in_dir (presumably this removes the
# files it contains -- confirm against generic_helper).
sandbox_parts = (mg.get_install_path(), 'datasets', 'test_datasets', 'sandbox')
p = os.sep.join(sandbox_parts)
del_files_in_dir(p)
from magellan.evaluation.matcher_and_trigger_crossvalidation import cv_matcher_and_trigger
import magellan as mg
import pandas as pd

mg.init_jvm()

# Read walmart books data.
# The listed columns are forced to the builtin `str`; this is the object the
# old `pd.np.str` alias pointed to, and both `pandas.np` and `numpy.str` are
# deprecated/removed in current releases.
wal = mg.read_csv(mg.get_install_path()+'/datasets/books/walmart.csv',
                  dtype={'isbn': str, 'pages': str,
                         'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

# Read bowker books data (same column types as above).
bwk = mg.read_csv(mg.get_install_path()+'/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str,
                         'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

# Labeled pairs, linked back to the two input tables.
L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)

# Generate the features, then keep a hand-picked subset of rows.
# NOTE(review): `.ix` is deprecated/removed in newer pandas; confirm the
# index type of the feature table before moving to `.loc` / `.iloc`.
feature_table = mg.get_features_for_matching(wal, bwk)
f = feature_table.ix[[3, 7, 18, 26, 53]]

m = mg.DTMatcher()

# Feature vectors for the labeled pairs; missing values outside the
# id/label columns are imputed with the 'most_frequent' strategy.
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')
G = mg.impute_table(G,
                    exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                    strategy='most_frequent')

# Trigger built around the rule `author_author_lev(ltuple, rtuple) == 1`.
# Presumably: when the rule evaluates to True the prediction is set to 1 --
# confirm against the MatchTrigger documentation.
pos_trigger = mg.MatchTrigger()
pos_trigger.add_cond_rule('author_author_lev(ltuple, rtuple) == 1',
                          feature_table=feature_table)
pos_trigger.add_cond_status(True)
pos_trigger.add_action(1)
import os

import magellan as mg
from magellan.utils.generic_helper import del_files_in_dir

# Locate <install path>/datasets/test_datasets/sandbox in two steps, then
# pass it to del_files_in_dir.
test_datasets_dir = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets'])
p = os.sep.join([test_datasets_dir, 'sandbox'])
del_files_in_dir(p)
import sys
# NOTE(review): machine-specific search path; kept because removing it would
# change which `magellan` gets imported on the original author's machine.
sys.path.append('/Users/pradap/Documents/Research/Python-Package/enrique/')

import magellan as mg
import pandas as pd

mg.init_jvm()

# Load the two book tables. The listed columns are forced to the builtin
# `str`; this is the object the old `pd.np.str` alias pointed to, and both
# `pandas.np` and `numpy.str` are deprecated/removed in current releases.
wal = mg.read_csv(mg.get_install_path() + '/datasets/books/walmart.csv',
                  dtype={'isbn': str, 'pages': str,
                         'volume': str, 'editionNum': str},
                  low_memory=False, key='id')
bwk = mg.read_csv(mg.get_install_path() + '/datasets/books/bowker.csv',
                  dtype={'isbn': str, 'pages': str,
                         'volume': str, 'editionNum': str},
                  low_memory=False, key='id')

# Block the two tables on equal 'isbn', carrying title/author from each side.
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(wal, bwk, 'isbn', 'isbn',
                    ['title', 'author'], ['title', 'author'])

# Labeled pairs, linked back to the two input tables.
L = mg.read_csv('label_ab_correct_books.csv', ltable=wal, rtable=bwk)
# Call form prints the same value on Python 2 and Python 3.
print(len(L))

# Generate the features, then keep a hand-picked subset of rows.
# NOTE(review): `.ix` is deprecated/removed in newer pandas; confirm the
# index type of the feature table before moving to `.loc` / `.iloc`.
feat_table = mg.get_features_for_matching(wal, bwk)
f = feat_table.ix[[3, 7, 18, 26, 53]]
G = mg.extract_feature_vecs(L, feature_table=f, attrs_after='gold')

# Matchers created here; nothing is fitted in this part of the script.
dt = mg.DTMatcher()
svm = mg.SVMMatcher()
import magellan as mg
import pandas as pd
import os
from PyQt4 import QtCore

# All three input files live in the installed test-datasets directory.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets'])
path_a, path_b, path_c = (
    os.sep.join([datasets_path, file_name])
    for file_name in ('A.csv', 'B.csv', 'C.csv')
)

# A is read without an explicit key, B with key='ID'; C is linked to both.
A = mg.read_csv_metadata(path_a)
B = mg.read_csv_metadata(path_b, key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Label the candidate set into a column named 'label' and show the result.
D = mg.label_table(C, 'label')
print(D)

# Disabled snippet: a 2-second QTimer wired to mg._viewapp.quit.
# timer = QtCore.QTimer()
# timer.setInterval(2000)  # 2 seconds
# mg._viewapp.loadFinished.connect(timer.start)
# timer.timeout.connect(mg._viewapp.quit)
# coding=utf-8
import logging
import os

import magellan as mg

logging.basicConfig(level=logging.DEBUG)

# Input files of the 'matcherselector' test dataset.
datasets_path = os.sep.join(
    [mg.get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])
path_a = os.sep.join([datasets_path, 'ACM_demo.csv'])
path_b = os.sep.join([datasets_path, 'DBLP_demo.csv'])
path_c = os.sep.join([datasets_path, 'dblp_acm_demo_labels.csv'])

A = mg.read_csv_metadata(path_a, key='id')
B = mg.read_csv_metadata(path_b, key='id')

# Metadata for the labeled candidate set.
# NOTE(review): the left table here is B and the right table is A, while the
# features below are generated for (A, B) -- confirm this is intended.
candset_metadata = dict(ltable=B, rtable=A,
                        fk_ltable='ltable.id', fk_rtable='rtable.id',
                        key='_id')
C = mg.read_csv_metadata(path_c, **candset_metadata)

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(
    C, feature_table=feature_table, attrs_after='gold', verbose=True)

# dtmatcher = mg.DTMatcher()
# nbmatcher = mg.NBMatcher()
# rfmatcher = mg.RFMatcher()
# svmmatcher = mg.SVMMatcher()
# linregmatcher = mg.LinRegMatcher()
# logregmatcher = mg.LogRegMatcher()