Exemplo n.º 1
0
    def test_ml_matcher_inplace_false_predict(self):
        """Check predict(inplace=False, append=True) returns a separate table.

        The 'ltable.id' and 'rtable.id' columns are dropped from both splits
        and 'gold' is dropped from the test split, so only '_id' has to be
        excluded when fitting and predicting. The result must be a different
        object from ``test``, have as many rows, contain every column of
        ``test`` and end with the 'predicted' column.
        """
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.train_test_split(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')
        train.drop(['ltable.id', 'rtable.id'], axis=1, inplace=True)
        test.drop(['ltable.id', 'rtable.id', 'gold'], axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(table=test,
                                 exclude_attrs='_id',
                                 target_attr='predicted',
                                 inplace=False,
                                 append=True)

        # inplace=False must hand back a distinct object, not ``test`` itself.
        self.assertIsNot(predictions, test)
        self.assertEqual(len(predictions), len(test))
        self.assertTrue(set(test.columns).issubset(predictions.columns))
        # The appended prediction column is expected to be the last one.
        self.assertEqual(predictions.columns[-1], 'predicted')
Exemplo n.º 2
0
 def test_ml_matcher_invalid_df_predict(self):
     """Call predict with an empty string where a table is expected.

     A decision-tree matcher is fitted on the training split first, then
     ``predict`` receives ``table=""``.

     NOTE(review): nothing in this method asserts an outcome; presumably an
     expected-exception decorator or wrapper outside this snippet checks
     that the call fails -- confirm.
     """
     A = read_csv_metadata(fpath_a, key='id')
     B = read_csv_metadata(fpath_b, key='id')
     feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
     train = mu.train_test_split(feature_vectors)['train']
     dt = DTMatcher(name='DecisionTree')
     dt.fit(table=train,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='gold')
     # The return value is irrelevant here, so it is not bound to a name.
     dt.predict(table="",
                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                target_attr='predicted',
                append=True)
Exemplo n.º 3
0
 def test_ml_matcher_invalid_df_predict(self):
     """Fit a matcher, then pass a string instead of a table to predict.

     NOTE(review): the method contains no assertion; presumably the failure
     expectation is declared outside the visible lines (decorator or test
     harness) -- confirm before relying on this test.
     """
     A = read_csv_metadata(fpath_a, key='id')
     B = read_csv_metadata(fpath_b, key='id')
     feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
     train = mu.train_test_split(feature_vectors)['train']
     dt = DTMatcher(name='DecisionTree')
     dt.fit(table=train,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='gold')
     # Only the call itself matters; its result is intentionally discarded.
     dt.predict(
         table="",
         exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
         target_attr='predicted',
         append=True)
Exemplo n.º 4
0
    def test_ml_matcher_valid_1(self):
        """Fit and predict on tables using list-valued ``exclude_attrs``.

        The matcher is fitted with 'gold' as the target and predicts into a
        'predicted' column with ``append=True``. The result must have one row
        per test row, introduce no column that ``test`` does not have, and end
        with the 'predicted' column.
        """
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.train_test_split(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')
        dt.fit(table=train,
               exclude_attrs=['ltable.id', 'rtable.id', '_id'],
               target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        # NOTE(review): this only holds if 'predicted' also shows up in
        # ``test``, i.e. presumably predict appends in place by default.
        self.assertTrue(set(predictions.columns).issubset(test.columns))
        self.assertEqual(predictions.columns[-1], 'predicted')
Exemplo n.º 5
0
    def test_ml_matcher_valid_2(self):
        """Fit and predict through the ``x``/``y`` call form.

        Feature columns are all columns of the feature-vector table except
        its key, its two foreign keys (looked up through ``cm``) and 'gold'.
        The test checks that one prediction is returned per test row.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(
            fpath_f, ltable=ltable, rtable=rtable)
        split = mu.train_test_split(feature_vectors)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        all_columns = list(feature_vectors.columns)
        non_feature_columns = [
            cm.get_key(feature_vectors),
            cm.get_fk_ltable(feature_vectors),
            cm.get_fk_rtable(feature_vectors),
            'gold',
        ]
        feature_columns = list_diff(all_columns, non_feature_columns)
        train_features = train[feature_columns]
        train_labels = train['gold']

        matcher.fit(x=train_features, y=train_labels)
        predictions = matcher.predict(test[feature_columns])
        self.assertEqual(len(predictions), len(test))
Exemplo n.º 6
0
    def test_ml_matcher_append_false_predict(self):
        """Predict on a table with ``append=False`` and check the row count.

        'ltable.id' and 'rtable.id' are removed from both splits and 'gold'
        from the test split, leaving '_id' as the only excluded attribute.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(
            fpath_f, ltable=ltable, rtable=rtable)
        split = mu.train_test_split(feature_vectors)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        # Same drop order as before: train columns first, then test columns.
        for frame in (train, test):
            for column in ('ltable.id', 'rtable.id'):
                frame.drop(column, axis=1, inplace=True)
        test.drop('gold', axis=1, inplace=True)

        matcher.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = matcher.predict(
            table=test,
            exclude_attrs='_id',
            target_attr='predicted',
            append=False)

        self.assertEqual(len(predictions), len(test))
Exemplo n.º 7
0
    def test_ml_matcher_valid_with_id_in_y(self):
        """Fit via ``x``/``y`` when ``y`` carries '_id' next to 'gold'.

        Only the two foreign keys (looked up through ``cm``) and 'gold' are
        removed from the feature columns; the labels are passed as a
        two-column selection ['_id', 'gold']. The test checks that one
        prediction is returned per test row.
        """
        ltable = read_csv_metadata(fpath_a, key='id')
        rtable = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(
            fpath_f, ltable=ltable, rtable=rtable)
        split = mu.train_test_split(feature_vectors)
        train = split['train']
        test = split['test']
        matcher = DTMatcher(name='DecisionTree')

        all_columns = list(feature_vectors.columns)
        removed_columns = [
            cm.get_fk_ltable(feature_vectors),
            cm.get_fk_rtable(feature_vectors),
            'gold',
        ]
        feature_columns = list_diff(all_columns, removed_columns)
        train_features = train[feature_columns]
        train_labels = train[['_id', 'gold']]

        matcher.fit(x=train_features, y=train_labels)
        predictions = matcher.predict(test[feature_columns])
        self.assertEqual(len(predictions), len(test))
Exemplo n.º 8
0
    def test_ml_matcher_target_attr_present_in_ex_attrs(self):
        """Fit with the target attribute also listed in ``exclude_attrs``.

        'gold' is both the ``target_attr`` and a member of ``exclude_attrs``
        in the fit call. Prediction then appends a 'predicted' column; the
        result must have one row per test row, no column missing from
        ``test``, and 'predicted' as its last column.
        """
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.train_test_split(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')
        dt.fit(table=train,
               exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
               target_attr='gold')
        predictions = dt.predict(
            table=test,
            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
            target_attr='predicted',
            append=True)

        self.assertEqual(len(predictions), len(test))
        # Compare against the empty set (not a length) so that a failure
        # reports which columns are unexpectedly absent from ``test``.
        extra_columns = set(predictions.columns).difference(test.columns)
        self.assertEqual(extra_columns, set())
        self.assertEqual(predictions.columns[-1], 'predicted')
Exemplo n.º 9
0
    def test_ml_matcher_ex_attrs_not_list(self):
        """Fit with ``exclude_attrs`` given as a plain string, not a list.

        'ltable.id' and 'rtable.id' are dropped from both splits up front, so
        fit only needs to exclude '_id' (passed as a string). Prediction uses
        a list of exclusions and appends a 'predicted' column; the result
        must have one row per test row, no column missing from ``test``, and
        'predicted' as its last column.
        """
        A = read_csv_metadata(fpath_a, key='id')
        B = read_csv_metadata(fpath_b, key='id')
        feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
        train_test = mu.train_test_split(feature_vectors)
        train, test = train_test['train'], train_test['test']
        dt = DTMatcher(name='DecisionTree')
        train.drop(['ltable.id', 'rtable.id'], axis=1, inplace=True)
        test.drop(['ltable.id', 'rtable.id'], axis=1, inplace=True)
        dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
        predictions = dt.predict(table=test,
                                 exclude_attrs=['_id', 'gold'],
                                 target_attr='predicted',
                                 append=True)

        self.assertEqual(len(predictions), len(test))
        # Compare against the empty set (not a length) so that a failure
        # reports which columns are unexpectedly absent from ``test``.
        extra_columns = set(predictions.columns).difference(test.columns)
        self.assertEqual(extra_columns, set())
        self.assertEqual(predictions.columns[-1], 'predicted')
Exemplo n.º 10
0
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Directory holding the demo CSV files, located under the install path.
feat_datasets_path = os.sep.join(
    [get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])

# Full paths of the individual files inside that directory.
fpath_a, fpath_b, fpath_c, fpath_f = (
    os.sep.join([feat_datasets_path, file_name])
    for file_name in ('DBLP_demo.csv',
                      'ACM_demo.csv',
                      'dblp_acm_demo_labels.csv',
                      'feat_vecs.csv'))

# Load both input tables, then the feature vectors that reference them.
A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

train_test = mu.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']

# Fit a decision-tree matcher on the training split and predict on the
# test split, appending the result as a 'predicted' column.
dt = DTMatcher(name='DecisionTree')
dt.fit(
    table=train,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='gold')
predictions = dt.predict(
    table=test,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='predicted',
    append=True)
print('Done')
Exemplo n.º 11
0
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Directory holding the demo CSV files, located under the install path.
feat_datasets_path = os.sep.join(
    [get_install_path(), 'datasets', 'test_datasets', 'matcherselector'])

# Full paths of the individual files inside that directory.
fpath_a, fpath_b, fpath_c, fpath_f = (
    os.sep.join([feat_datasets_path, file_name])
    for file_name in ('DBLP_demo.csv',
                      'ACM_demo.csv',
                      'dblp_acm_demo_labels.csv',
                      'feat_vecs.csv'))

# Load both input tables, then the feature vectors that reference them.
A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

train_test = mu.train_test_split(feature_vectors)
train = train_test['train']
test = train_test['test']

# Fit a decision-tree matcher on the training split and predict on the
# test split, appending the result as a 'predicted' column.
dt = DTMatcher(name='DecisionTree')
dt.fit(
    table=train,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='gold')
predictions = dt.predict(
    table=test,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='predicted',
    append=True)
print('Done')
Exemplo n.º 12
0
 def test_ml_invalid_predict_sign(self):
     """Call predict with no arguments on a freshly built matcher.

     NOTE(review): no assertion is made here; presumably the expected
     failure is declared outside the visible lines -- confirm.
     """
     matcher = DTMatcher(name='DecisionTree')
     matcher.predict()
Exemplo n.º 13
0
 def test_ml_invalid_predict_sign(self):
     """Invoke predict without any arguments on an unfitted matcher.

     NOTE(review): the method asserts nothing itself; presumably a
     decorator or harness outside this snippet checks the outcome -- confirm.
     """
     matcher = DTMatcher(name='DecisionTree')
     matcher.predict()