Пример #1
0
import os
from nose.tools import *
import unittest
import pandas as pd
import six

from magellan.utils.generic_helper import get_install_path
from magellan.io.parsers import read_csv_metadata
from magellan.feature.simfunctions import get_sim_funs_for_matching
from magellan.feature.tokenizers import get_tokenizers_for_matching
from magellan.feature.autofeaturegen import get_features_for_matching
from magellan.feature.addfeatures import add_feature, add_blackbox_feature, get_feature_fn, parse_feat_str, create_feature_table

import magellan.catalog.catalog_manager as cm

# Fixture directory under get_install_path() and the two CSV tables read from it.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
path_a = datasets_path + os.sep + 'A.csv'
path_b = datasets_path + os.sep + 'B.csv'


class AddFeaturesTestCases(unittest.TestCase):
    """Test cases for the add-feature helpers; the catalog is deleted around each test."""
    def setUp(self):
        """Call cm.del_catalog() before every test."""
        cm.del_catalog()

    def tearDown(self):
        """Call cm.del_catalog() after every test."""
        cm.del_catalog()

    def test_add_features_valid_1(self):
        # A is read without an explicit key; B is read with key='ID'.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        # Build the feature table for the (A, B) pair.
        # NOTE(review): no assertion follows in the visible part of this test.
        feature_table = get_features_for_matching(A, B)
Пример #2
0
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Locate the 'matcherselector' fixture files under get_install_path().
feat_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'matcherselector'))
fpath_a = os.sep.join((feat_datasets_path, 'DBLP_demo.csv'))
fpath_b = os.sep.join((feat_datasets_path, 'ACM_demo.csv'))
fpath_c = os.sep.join((feat_datasets_path, 'dblp_acm_demo_labels.csv'))
fpath_f = os.sep.join((feat_datasets_path, 'feat_vecs.csv'))

# Read both input tables keyed on 'id', then the feature vectors that
# reference them as ltable/rtable.
A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

# train_test_split returns a mapping with 'train' and 'test' entries.
train_test = mu.train_test_split(feature_vectors)
train, test = (train_test[part] for part in ('train', 'test'))

# Fit on the 'gold' column, leaving the id columns and the label itself out
# of the feature set.
dt = DTMatcher(name='DecisionTree')
dt.fit(
    table=train,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='gold',
)

# Predict on the held-out part; results go to a 'predicted' column.
predictions = dt.predict(
    table=test,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='predicted',
    append=True,
)
print('Done')
import os
from nose.tools import *
import unittest
import pandas as pd

from magellan.utils.generic_helper import get_install_path
from magellan.io.parsers import read_csv_metadata
from magellan.feature.simfunctions import get_sim_funs_for_matching
from magellan.feature.tokenizers import get_tokenizers_for_matching

import magellan.feature.autofeaturegen as afg
import magellan.catalog.catalog_manager as cm

# Fixture directories under get_install_path() and the CSV tables used below.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
bc_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'blockercombiner'))
path_a = datasets_path + os.sep + 'A.csv'
path_b = datasets_path + os.sep + 'B.csv'
path_c = datasets_path + os.sep + 'C.csv'

class AutoFeatureGenerationTestCases(unittest.TestCase):
    """Test cases for magellan.feature.autofeaturegen; the catalog is deleted around each test."""
    def setUp(self):
        """Call cm.del_catalog() before every test."""
        cm.del_catalog()

    def tearDown(self):
        """Call cm.del_catalog() after every test."""
        cm.del_catalog()

    def test_get_features_valid(self):
        # A is read without an explicit key; B is read with key='ID'.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        # Attribute types are computed separately for each table.
        # NOTE(review): no assertion follows in the visible part of this test.
        l_attr_types = afg.get_attr_types(A)
        r_attr_types = afg.get_attr_types(B)
import os
from nose.tools import *
import unittest
import pandas as pd

from magellan.utils.generic_helper import get_install_path
from magellan.io.parsers import read_csv_metadata

from magellan.feature.extractfeatures import extract_feature_vecs
from magellan.feature.autofeaturegen import get_features_for_matching
import magellan.catalog.catalog_manager as cm

# Fixture directory under get_install_path() and the three CSV tables in it.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
path_a = os.sep.join((datasets_path, 'A.csv'))
path_b = os.sep.join((datasets_path, 'B.csv'))
path_c = os.sep.join((datasets_path, 'C.csv'))

class ExtractFeaturesTestCases(unittest.TestCase):
    """Test cases for extract_feature_vecs on the A/B/C fixture tables."""

    def test_extract_feature_vecs_valid_1(self):
        # Load both input tables and the candidate set C that refers to them
        # as its ltable/rtable.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        # Append a constant-zero 'label' column as the last column of C so it
        # can be passed through via attrs_after.
        col_pos = len(C.columns)
        C.insert(col_pos, 'label', [0]*len(C))
        feature_table = get_features_for_matching(A, B)
        F = extract_feature_vecs(C, attrs_before=['ltable_name', 'rtable_name'], feature_table=feature_table,
                                 attrs_after='label')
        # assertIsInstance reports the actual type on failure, unlike
        # comparing the isinstance() result against True.
        self.assertIsInstance(F, pd.DataFrame)
        # The output must start with '_id' followed by the two foreign-key
        # columns recorded in the catalog for C.
        self.assertEqual(F.columns[0], '_id')
        self.assertEqual(F.columns[1], cm.get_fk_ltable(C))
        self.assertEqual(F.columns[2], cm.get_fk_rtable(C))
Пример #5
0
# coding=utf-8
from __future__ import unicode_literals

import os
import unittest
import pandas as pd
from nose.tools import *

from magellan.io.pickles import load_object, load_table, save_object, save_table
from magellan.io.parsers import read_csv_metadata
from magellan.blocker.rule_based_blocker import RuleBasedBlocker
from magellan.feature.autofeaturegen import get_features_for_blocking
from magellan.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists
import magellan.catalog.catalog_manager as cm

# Fixture directory under get_install_path(), its three CSV tables, and the
# 'sandbox' sub-directory next to them.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
path_a = os.sep.join((datasets_path, 'A.csv'))
path_b = os.sep.join((datasets_path, 'B.csv'))
path_c = os.sep.join((datasets_path, 'C.csv'))
sndbx_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'sandbox'))


class SaveObjectTestCases(unittest.TestCase):
    """Negative tests for save_object; each test is decorated to expect AssertionError."""
    # The test passes only if an AssertionError is raised inside it.
    @raises(AssertionError)
    def test_invalid_path_1(self):
        # Target file inside the sandbox directory; the object saved is the int 10.
        p = os.sep.join([sndbx_path, 'A_saved.pkl'])
        save_object(p, 10)

    @raises(AssertionError)
    def test_invalid_path_2(self):
        # NOTE(review): only the path is built here; the call expected to
        # raise AssertionError is not visible in this part of the file.
        p = os.sep.join([sndbx_path, 'A_saved.pkl'])
Пример #6
0
#
#
# mg.to_csv_metadata(C, './C.csv')
# print 'Hi'
import logging
import os
import pandas as pd

from magellan.io.parsers import read_csv_metadata
from magellan.utils.generic_helper import get_install_path
import magellan.catalog.catalog_manager as cm

# Install the default root logging handler so log records are emitted.
logging.basicConfig()

# Directory with the 'io' fixture files under get_install_path().
io_datasets_path = os.sep.join(
    [get_install_path(), 'datasets', 'test_datasets', 'io'])
# Disabled earlier experiment (kept for reference): read A, B and C and show
# C's catalog properties. Only path_b below is still live code.
# path_a = os.sep.join([io_datasets_path, 'A.csv'])
path_b = os.sep.join([io_datasets_path, 'B.csv'])
# path_c = os.sep.join([io_datasets_path, 'C.csv'])
#
# A = read_csv_metadata(path_a)
# B = read_csv_metadata(path_b)
#
# C = mg.read_csv_metadata(path_c, ltable=A, rtable=B)
#
# cm.show_properties(C)

# Current experiment: read A_fk1.csv and set its 'key' property to 'ID'
# explicitly in the catalog.
# NOTE(review): p_C and path_b are not used in the visible part of the script.
p_C = os.sep.join([io_datasets_path, 'C.csv'])
p_A = os.sep.join([io_datasets_path, 'A_fk1.csv'])
A = read_csv_metadata(p_A)
cm.set_property(A, 'key', 'ID')
import os
from nose.tools import *
import unittest
import pandas as pd

from magellan.utils.generic_helper import get_install_path
from magellan.io.parsers import read_csv_metadata
from magellan.feature.simfunctions import get_sim_funs_for_matching
from magellan.feature.tokenizers import get_tokenizers_for_matching

import magellan.feature.autofeaturegen as afg
import magellan.catalog.catalog_manager as cm

# Fixture directories under get_install_path() and the CSV tables used below.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
bc_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'blockercombiner'))
path_a = datasets_path + os.sep + 'A.csv'
path_b = datasets_path + os.sep + 'B.csv'
path_c = datasets_path + os.sep + 'C.csv'


class AutoFeatureGenerationTestCases(unittest.TestCase):
    """Test cases for magellan.feature.autofeaturegen; the catalog is deleted around each test."""
    def setUp(self):
        """Call cm.del_catalog() before every test."""
        cm.del_catalog()

    def tearDown(self):
        """Call cm.del_catalog() after every test."""
        cm.del_catalog()

    def test_get_features_valid(self):
        # A is read without an explicit key; B is read with key='ID'.
        # NOTE(review): no assertion follows in the visible part of this test.
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
Пример #8
0
import os

import magellan.matcher.matcherutils as mu
from magellan.io.parsers import read_csv_metadata
from magellan.matcher.dtmatcher import DTMatcher
from magellan.utils.generic_helper import get_install_path

# Locate the 'matcherselector' fixture files under get_install_path().
feat_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'matcherselector'))
fpath_a = feat_datasets_path + os.sep + 'DBLP_demo.csv'
fpath_b = feat_datasets_path + os.sep + 'ACM_demo.csv'
fpath_c = feat_datasets_path + os.sep + 'dblp_acm_demo_labels.csv'
fpath_f = feat_datasets_path + os.sep + 'feat_vecs.csv'

# Read both input tables keyed on 'id', then the feature vectors that
# reference them as ltable/rtable.
A = read_csv_metadata(fpath_a, key='id')
B = read_csv_metadata(fpath_b, key='id')
feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)

# train_test_split returns a mapping with 'train' and 'test' entries.
train_test = mu.train_test_split(feature_vectors)
train, test = (train_test[part] for part in ('train', 'test'))

# Fit on the 'gold' column, leaving the id columns and the label itself out
# of the feature set.
dt = DTMatcher(name='DecisionTree')
dt.fit(
    table=train,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='gold',
)

# Predict on the held-out part; results go to a 'predicted' column.
predictions = dt.predict(
    table=test,
    exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
    target_attr='predicted',
    append=True,
)
print('Done')
Пример #9
0
# coding=utf-8
from __future__ import unicode_literals

import os
import unittest
import pandas as pd
from nose.tools import *

from magellan.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file
from magellan.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists
import magellan.catalog.catalog_manager as cm
# Fixture directories under get_install_path(): the shared test tables, the
# 'io' fixtures, and the 'sandbox' sub-directory.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
io_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'io'))
path_a = os.sep.join((datasets_path, 'A.csv'))
path_b = os.sep.join((datasets_path, 'B.csv'))
path_c = os.sep.join((datasets_path, 'C.csv'))
sndbx_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'sandbox'))


class ReadCSVMetadataTestCases(unittest.TestCase):
    def test_valid_path_wi_valid_metadata(self):
        # Reading A.csv through read_csv_metadata must yield the same frame as
        # plain pandas.read_csv, and the catalog must report 'ID' as its key.
        cm.del_catalog()
        A = read_csv_metadata(path_a)
        pd_A = pd.read_csv(path_a)
        # assertTrue states the intent directly instead of comparing to True.
        self.assertTrue(A.equals(pd_A))
        self.assertEqual(cm.get_key(A), 'ID')

    def test_valid_path_candset_wi_valid_metadata(self):
import pandas as pd
import six

from magellan.utils.generic_helper import get_install_path, list_diff
from magellan.io.parsers import read_csv_metadata
from magellan.matcherselector.mlmatcherselection import select_matcher
from magellan.matcher.dtmatcher import DTMatcher
from magellan.matcher.linregmatcher import LinRegMatcher
from magellan.matcher.logregmatcher import LogRegMatcher
from magellan.matcher.nbmatcher import NBMatcher
from magellan.matcher.rfmatcher import RFMatcher
from magellan.matcher.svmmatcher import SVMMatcher

import magellan.catalog.catalog_manager as cm

# 'matcherselector' fixture directory under get_install_path() and its files.
datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'matcherselector'))
path_a = datasets_path + os.sep + 'DBLP_demo.csv'
path_b = datasets_path + os.sep + 'ACM_demo.csv'
path_c = datasets_path + os.sep + 'dblp_acm_demo_labels.csv'
path_f = datasets_path + os.sep + 'feat_vecs.csv'


class MLMatcherSelectionTestCases(unittest.TestCase):
    def setUp(self):
        """Call cm.del_catalog() before every test."""
        cm.del_catalog()

    def tearDown(self):
        """Call cm.del_catalog() after every test."""
        cm.del_catalog()

    # @nottest
    def test_select_matcher_valid_1(self):
Пример #11
0
# coding=utf-8
from __future__ import unicode_literals

import os
import unittest
import pandas as pd
from nose.tools import *

from magellan.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file
from magellan.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists
import magellan.catalog.catalog_manager as cm
# Fixture directories under get_install_path(): the shared test tables, the
# 'io' fixtures, and the 'sandbox' sub-directory.
datasets_path = os.sep.join((get_install_path(), 'datasets', 'test_datasets'))
io_datasets_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'io'))
path_a = os.sep.join((datasets_path, 'A.csv'))
path_b = os.sep.join((datasets_path, 'B.csv'))
path_c = os.sep.join((datasets_path, 'C.csv'))
sndbx_path = os.sep.join(
    (get_install_path(), 'datasets', 'test_datasets', 'sandbox'))

class ReadCSVMetadataTestCases(unittest.TestCase):
    def test_valid_path_wi_valid_metadata(self):
        # Reading A.csv through read_csv_metadata must yield the same frame as
        # plain pandas.read_csv, and the catalog must report 'ID' as its key.
        cm.del_catalog()
        A = read_csv_metadata(path_a)
        pd_A = pd.read_csv(path_a)
        # assertTrue states the intent directly instead of comparing to True.
        self.assertTrue(A.equals(pd_A))
        self.assertEqual(cm.get_key(A), 'ID')

    def test_valid_path_candset_wi_valid_metadata(self):
        cm.del_catalog()
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID') # not initializing with ID will raise key_error
        C = read_csv_metadata(path_c, ltable=A, rtable=B)