示例#1
0
# Build the path to C.csv inside the magellan install's test datasets.
datasets_path = os.path.join(mg.get_install_path(), 'datasets', 'test_datasets')
path_c = os.path.join(datasets_path, 'C.csv')

# Load both input tables keyed on 'ID', then read C.csv with A and B
# registered as its left/right tables.
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

# Hand-made labels: seven 0s followed by eight 1s (15 values in total).
# NOTE(review): this assumes C has exactly 15 rows -- confirm against C.csv.
labels = [0] * 7 + [1] * 8
C['labels'] = labels

# Generate features for the A/B pair and compute feature vectors for C,
# requesting the 'labels' column via attrs_after.
feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')

# Train a decision-tree matcher; the id columns and the label itself are
# excluded from the training attributes.
dt = mg.DTMatcher()
dt.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
       target_attr='labels')

# Debug the matcher on the first feature vector.  `.iloc[0]` replaces the
# `.ix[0]` indexer, which was deprecated in pandas 0.20 and removed in 1.0.
vis_tuple_debug_dt_matcher(dt, feature_vectors.iloc[0],
                           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

# feature_table = mg.get_features_for_matching(A, B)
#
# labels = [0]*7
# labels.extend([1]*8)
# C['labels'] = labels
#
# feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
#                                          attrs_after='labels')
#
#
示例#2
0
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

# NOTE(review): developer-specific absolute path.  It is appended only after
# `magellan` has already been imported above, so it cannot affect that import.
sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

# Load the two input tables and block them on equal 'zipcode', requesting the
# 'name' attribute from each side in the output.
A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])

# Read the labelled pairs and turn them into feature vectors, requesting the
# 'gold' label column via attrs_after.
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')

# Split into train/test sets and fit a decision-tree matcher on the train
# part; id columns and the gold label are excluded from the attributes used.
S = mg.train_test_split(G, 8, 7)
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')

# Predict on the test part; predictions go to the 'predicted' column
# (append=True), which eval_matches then compares against 'gold'.
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

# Open the decision-tree debugging GUI on the evaluation result.
vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)

# Parenthesised form prints the same text on Python 2 and is valid on
# Python 3, where the bare `print "Hi"` statement is a syntax error.
print("Hi")
示例#3
0
# Compute feature vectors for L using feat_table; attrs_before/attrs_after
# name the extra columns requested alongside the features.
s_prime = mg.extract_feature_vecs(
    L,
    attrs_before=['ltable.name', 'rtable.name'],
    feature_table=feat_table,
    attrs_after=['gold_label'])

# In[34]:

# Show the feature-vector table (notebook cell output).
s_prime

# In[35]:

# Fitting/Predicting
# Candidate matchers: naive Bayes, decision tree, random forest.
nb, dt, rf = mg.NBMatcher(), mg.DTMatcher(), mg.RFMatcher()

# In[36]:

# Pick a matcher by cross validation (k=5), using the feature columns as
# inputs and 'gold_label' as the target.
m = mg.select_matcher(
    [nb, dt, rf],
    x=s_prime[list(feat_table['feature_name'])],
    y=s_prime['gold_label'],
    k=5)

# In[37]:

# Show which matcher was selected, along with its stats (notebook cell output).
m