Example #1
# Check that the feature was indeed added to the feature table
feat_table


# In[32]:

# Try executing the newly added function over tuples from A and B
feat_table.ix[6, 'function'](A.ix[2], B.ix[3])


# In[33]:

# Extract feature vectors for the labeled candidate set. We ask for 'ltable.name' and 'rtable.name'
# to be included before the feature vectors and 'gold_label' after them
s_prime = mg.extract_feature_vecs(L, attrs_before=['ltable.name', 'rtable.name'], feature_table=feat_table, attrs_after=['gold_label'])


# In[34]:

# display feature vector table
s_prime


# In[35]:

# Fitting/Predicting
# Create a set of matchers
nb = mg.NBMatcher() # naive bayes 
dt = mg.DTMatcher() # decision tree
rf = mg.RFMatcher() # random forest
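
The cells that follow would typically train one of these matchers and score it. A minimal sketch, assuming the train_test_split / fit / predict / eval_matches API used in the later examples and that s_prime carries a '_id' key column alongside 'ltable.name', 'rtable.name', and 'gold_label':

# Sketch only (not part of the original notebook); the column names in
# exclude_attrs are assumptions based on the extract_feature_vecs call above.
splits = mg.train_test_split(s_prime)
dt.fit(table=splits['train'],
       exclude_attrs=['_id', 'ltable.name', 'rtable.name', 'gold_label'],
       target_attr='gold_label')
dt.predict(table=splits['test'],
           exclude_attrs=['_id', 'ltable.name', 'rtable.name', 'gold_label'],
           target_attr='predicted',
           append=True)
eval_summary = mg.eval_matches(splits['test'], 'gold_label', 'predicted')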
Example #2
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='gold')
dt.predict(table=S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

vis_debug_dt(dt, d, S['test'], exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'], feat_table=feat_table)
print "Hi"
Example #3
import sys
import magellan as mg
from magellan.gui.debug_gui_base import vis_debug_dt

sys.path.append('/Users/Pradap/Documents/Research/Python-Package/enrique/')
mg.init_jvm()

A = mg.load_dataset('table_A')
B = mg.load_dataset('table_B')
ab = mg.AttrEquivalenceBlocker()
C = ab.block_tables(A, B, 'zipcode', 'zipcode', ['name'], ['name'])
L = mg.read_csv('label_demo.csv', ltable=A, rtable=B)
feat_table = mg.get_features_for_matching(A, B)
G = mg.extract_feature_vecs(L, feature_table=feat_table, attrs_after='gold')
S = mg.train_test_split(G, 8, 7)
dt = mg.DTMatcher(name='DecisionTree')
dt.fit(table=S['train'],
       exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
       target_attr='gold')
dt.predict(table=S['test'],
           exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
           target_attr='predicted',
           append=True)
d = mg.eval_matches(S['test'], 'gold', 'predicted')

vis_debug_dt(dt,
             d,
             S['test'],
             exclude_attrs=['_id', 'ltable.ID', 'rtable.ID', 'gold'],
             feat_table=feat_table)
print "Hi"
Example #4
import os

import magellan as mg
from magellan.debugmatcher.debug_gui_decisiontree_matcher import _vis_debug_dt, \
    vis_tuple_debug_dt_matcher

datasets_path = os.sep.join([mg.get_install_path(), 'datasets', 'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)

labels = [0] * 7
labels.extend([1] * 8)
C['labels'] = labels

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')

dt = mg.DTMatcher()
dt.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
       target_attr='labels')
vis_tuple_debug_dt_matcher(dt, feature_vectors.ix[0],
                           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])

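
To decide which pairs are worth stepping through in the debugger, one would usually predict over the whole feature-vector table first and look at the rows the tree gets wrong; a sketch under the same column-name assumptions as above, with 'predicted' as a column name chosen here:

# Sketch only: predict on the (training) table itself, find disagreements with
# the labels, and debug the first one. As in Example #2, append=True is expected
# to add the 'predicted' column to the table.
dt.predict(table=feature_vectors,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='predicted', append=True)
mismatches = feature_vectors[feature_vectors['labels'] != feature_vectors['predicted']]
if len(mismatches) > 0:
    vis_tuple_debug_dt_matcher(dt, mismatches.iloc[0],
                               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID',
                                              'labels', 'predicted'])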
Example #5
# In[31]:

# Check that the feature was indeed added to the feature table
feat_table

# In[32]:

# Try executing the newly added function over tuples from A and B
feat_table.ix[6, 'function'](A.ix[2], B.ix[3])

# In[33]:

# Extract feature vectors for the labeled candidate set. We ask for 'ltable.name' and 'rtable.name'
# to be included before the feature vectors and 'gold_label' after them
s_prime = mg.extract_feature_vecs(L,
                                  attrs_before=['ltable.name', 'rtable.name'],
                                  feature_table=feat_table,
                                  attrs_after=['gold_label'])

# In[34]:

# display feature vector table
s_prime

# In[35]:

# Fitting/Predicting
# Create a set of matchers
nb = mg.NBMatcher()  # naive bayes
dt = mg.DTMatcher()  # decision tree
rf = mg.RFMatcher()  # random forest
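
For reference, the cell that added the custom feature being checked in In[31] would have looked roughly like this; get_feature_fn, add_feature, and the tokenizer/similarity helpers are assumptions carried over from magellan's successor py_entitymatching, and the feature string is only an illustration:

# Assumed API (as in py_entitymatching): compile a declarative feature string
# into a feature dict and append it to feat_table under a chosen name.
sim = mg.get_sim_funs_for_matching()
tok = mg.get_tokenizers_for_matching()
feature = mg.get_feature_fn('jaccard(wspace(ltuple["name"]), wspace(rtuple["name"]))',
                            sim, tok)
mg.add_feature(feat_table, 'name_name_jac_wspace', feature)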
Example #6
# coding=utf-8
import os
import magellan as mg

datasets_path = os.sep.join([mg.get_install_path(), 'datasets', 'test_datasets'])
path_c = os.sep.join([datasets_path, 'C.csv'])
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)
feature_table = mg.get_features_for_matching(A, B)

labels = [0]*7
labels.extend([1]*8)
C['labels'] = labels

feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table,
                                          attrs_after='labels')


rf = mg.RFMatcher()
train_test = mg.train_test_split(feature_vectors)

train = train_test['train']
test = train_test['test']

rf.fit(table=train, exclude_attrs=['ltable_ID', 'rtable_ID', '_id'], target_attr='labels')
mg.debug_randomforest_matcher(rf, A.ix[1], B.ix[2], feat_table=feature_table,
                              fv_columns=feature_vectors.columns,
                              exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
print('Hi')
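
A natural follow-up to the single-pair debugging call is to score the forest on the held-out split, reusing the predict/eval_matches pattern from Example #2; only the 'predicted' column name is new here:

# Sketch: predict on the test split and evaluate against the labels.
rf.predict(table=test,
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
           target_attr='predicted', append=True)
print(mg.eval_matches(test, 'labels', 'predicted'))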
Example #7
# coding=utf-8
import logging
import os

import magellan as mg

logging.basicConfig(level=logging.DEBUG)
datasets_path = os.sep.join([mg.get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])
path_a = os.sep.join([datasets_path, 'ACM_demo.csv'])
path_b = os.sep.join([datasets_path, 'DBLP_demo.csv'])
path_c = os.sep.join([datasets_path, 'dblp_acm_demo_labels.csv'])

A = mg.read_csv_metadata(path_a, key='id')
B = mg.read_csv_metadata(path_b, key='id')
C = mg.read_csv_metadata(path_c, ltable=B, rtable=A, fk_ltable='ltable.id', fk_rtable='rtable.id', key='_id')

feature_table = mg.get_features_for_matching(A, B)
feature_vectors = mg.extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold', verbose=True)
# dtmatcher = mg.DTMatcher()
# nbmatcher = mg.NBMatcher()
# rfmatcher = mg.RFMatcher()
# svmmatcher = mg.SVMMatcher()
# linregmatcher = mg.LinRegMatcher()
# logregmatcher = mg.LogRegMatcher()
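
Given the 'matcherselector' dataset and the commented-out matcher constructors, the next step here is presumably cross-validated matcher selection; a hedged sketch, assuming magellan provides a select_matcher similar to py_entitymatching's (the keyword names and the returned 'selected_matcher' key are assumptions; the column names come from the read_csv_metadata call above):

# Sketch only: select_matcher and its keyword arguments are assumptions based on
# py_entitymatching's API.
matchers = [mg.DTMatcher(), mg.RFMatcher(), mg.LogRegMatcher()]
result = mg.select_matcher(matchers,
                           table=feature_vectors,
                           exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                           target_attr='gold',
                           k=5)
print(result['selected_matcher'])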