def _get_stop_words():
    """Read the bundled stop-words file and return its entries.

    Returns
    -------
    set
        One element per line of ``utils/stop_words.txt``; the file is
        opened in binary mode, so entries are bytes with trailing
        whitespace stripped.
    """
    words_path = os.sep.join([get_install_path(), 'utils', 'stop_words.txt'])
    with open(words_path, "rb") as handle:
        return {line.rstrip() for line in handle}
Пример #2
0
def _get_stop_words():
    """Return the stop-word entries shipped under ``utils/stop_words.txt``.

    Each line of the file becomes one element of the returned set, as
    bytes (binary read) with trailing whitespace removed.
    """
    base = get_install_path()
    path = os.sep.join([base, 'utils', 'stop_words.txt'])
    collected = set()
    with open(path, "rb") as src:
        collected.update(word.rstrip() for word in src)
    return collected
import pandas as pd

import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.matcher.matcherutils as mu
from py_entitymatching.debugmatcher.debug_gui_randomforest_matcher import _vis_debug_rf, \
    vis_tuple_debug_rf_matcher
from py_entitymatching.debugmatcher.debug_randomforest_matcher import debug_randomforest_matcher

from py_entitymatching.feature.autofeaturegen import get_features_for_matching
from py_entitymatching.feature.extractfeatures import extract_feature_vecs
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.utils.generic_helper import get_install_path

# Paths to the packaged test datasets used by the test cases below.
# NOTE(review): paths are built with os.sep.join; assumes get_install_path()
# returns the package root without a trailing separator — confirm.
datasets_path = os.sep.join([get_install_path(), 'datasets', 'test_datasets'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


class VisRFDebugMatcherTestCases(unittest.TestCase):
    def setUp(self):
        # Clear the py_entitymatching catalog so each test starts with no
        # stale table metadata from a previous test.
        cm.del_catalog()

    def tearDown(self):
        # Drop all catalog entries registered during the test.
        cm.del_catalog()

    def test_vis_debug_matcher_rf_valid_1(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
Пример #4
0
import six

from py_entitymatching.utils.generic_helper import get_install_path, list_diff
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.matcherselector.mlmatcherselection import select_matcher
from py_entitymatching.matcher.dtmatcher import DTMatcher
from py_entitymatching.matcher.linregmatcher import LinRegMatcher
from py_entitymatching.matcher.logregmatcher import LogRegMatcher
from py_entitymatching.matcher.nbmatcher import NBMatcher
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.matcher.svmmatcher import SVMMatcher

import py_entitymatching.catalog.catalog_manager as cm

# Locations of the matcher-selector demo datasets bundled with the tests.
datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'matcherselector'])
path_a = os.sep.join([datasets_path, 'DBLP_demo.csv'])
path_b = os.sep.join([datasets_path, 'ACM_demo.csv'])
path_c = os.sep.join([datasets_path, 'dblp_acm_demo_labels.csv'])
path_f = os.sep.join([datasets_path, 'feat_vecs.csv'])


class MLMatcherSelectionTestCases(unittest.TestCase):
    def setUp(self):
        # Reset the shared catalog before every test so no metadata leaks
        # between test cases.
        cm.del_catalog()

    def tearDown(self):
        # Remove everything the test added to the catalog.
        cm.del_catalog()

    # @nottest
    def test_select_matcher_valid_1(self):
import pandas as pd
import six
from contextlib import contextmanager

from py_entitymatching.utils.generic_helper import get_install_path
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching
from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching

import py_entitymatching.feature.autofeaturegen as afg
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.feature.simfunctions as simfuncs
import py_entitymatching.feature.tokenizers as toks

# Bundled test-dataset locations used by the feature-generation tests.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
bc_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'blockercombiner'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])

# proxy for user input. Since we use six, the way to proxy the input is to use the moves module.
# Specifically, for input we had to replace input with a function that will return the desired output
@contextmanager
def mockInput(mock):
    """Temporarily make ``six.moves.input`` return *mock*.

    Parameters
    ----------
    mock : object
        The value every ``input()`` call returns while the context is
        active.

    Notes
    -----
    Fixes two defects in the original: the real ``input`` was not
    restored when the with-body raised (no try/finally around the
    yield), and the replacement ``lambda _: mock`` required exactly one
    positional argument, so a bare ``input()`` call (no prompt) raised
    TypeError.
    """
    original_input = six.moves.input
    # Accept any call signature, mirroring builtin input([prompt]).
    six.moves.input = lambda *args, **kwargs: mock
    try:
        yield
    finally:
        # Always restore, even if the code under test raises.
        six.moves.input = original_input

class AutoFeatureGenerationTestCases(unittest.TestCase):
# coding=utf-8
import sys
import py_entitymatching
import os
from nose.tools import *
import unittest
import pandas as pd
import six

from py_entitymatching.utils.generic_helper import get_install_path
from py_entitymatching.sampler.down_sample import _inv_index, _probe_index, down_sample, _get_str_cols_list
import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.io.parsers import read_csv_metadata

# Restaurant test tables used by the down-sampling tests below.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
path_a = os.sep.join([datasets_path, 'restA.csv'])
path_b = os.sep.join([datasets_path, 'restB.csv'])


class DownSampleTestCases(unittest.TestCase):
    """Tests for py_entitymatching's down_sample on the restaurant tables."""
    def setUp(self):
        # Load fresh copies of both input tables (keyed on 'ID') for each test.
        self.A = read_csv_metadata(path_a, key='ID')
        self.B = read_csv_metadata(path_b, key='ID')

    def tearDown(self):
        # Release the tables so each test's data can be garbage-collected.
        del self.A
        del self.B

    def test_down_sample_table_valid_1(self):
        # down_sample(A, B, 100, 10) must yield a table D of exactly 100 rows.
        # NOTE(review): D is presumably the sampled B-side table — confirm
        # against the down_sample API documentation.
        C, D = down_sample(self.A, self.B, 100, 10)
        self.assertEqual(len(D), 100)
Пример #7
0
import pandas as pd
import six
from contextlib import contextmanager

from py_entitymatching.utils.generic_helper import get_install_path
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching
from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching

import py_entitymatching.feature.autofeaturegen as afg
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.feature.simfunctions as simfuncs
import py_entitymatching.feature.tokenizers as toks

# Bundled test-dataset locations used by the tests in this module.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
bc_datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'blockercombiner'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])


# proxy for user input. Since we use six, the way to proxy the input is to use the moves module.
# Specifically, for input we had to replace input with a function that will return the desired output
@contextmanager
def mockInput(mock):
    """Temporarily make ``six.moves.input`` return *mock*.

    Parameters
    ----------
    mock : object
        The value every ``input()`` call returns inside the context.

    Notes
    -----
    The original did not restore the real ``input`` when the with-body
    raised, and its ``lambda _: mock`` replacement required exactly one
    positional argument so ``input()`` with no prompt crashed. Both are
    fixed with a try/finally and a variadic lambda.
    """
    original_input = six.moves.input
    # Match builtin input([prompt]): accept any arguments.
    six.moves.input = lambda *args, **kwargs: mock
    try:
        yield
    finally:
        # Restore unconditionally so a failing test cannot leave the
        # global input() patched.
        six.moves.input = original_input
import py_entitymatching as em
from py_entitymatching.utils.generic_helper import get_install_path
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.io.parsers import read_csv_metadata

#import sys
#sys.path.insert(0, '../debugblocker')
#import debugblocker as db
import py_entitymatching.debugblocker.debugblocker as db

from operator import itemgetter
from array import array


# Test-dataset locations for the debug-blocker tests.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
catalog_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'catalog'])
debugblocker_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'debugblocker'])

path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])

class DebugblockerTestCases(unittest.TestCase):
    def test_validate_types_1(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_ID',
                fk_rtable='rtable_ID', key = '_id')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
# coding=utf-8
from __future__ import unicode_literals

import os
import unittest
import pandas as pd
from nose.tools import raises

from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file
from py_entitymatching.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists
import py_entitymatching.catalog.catalog_manager as cm
# Test-dataset locations for the CSV-metadata I/O tests; sndbx_path is a
# scratch directory for files the tests write.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
io_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets',
                                'io'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])
sndbx_path = os.sep.join([os.sep.join([get_install_path(), 'tests',
                                       'test_datasets']), 'sandbox'])

class ReadCSVMetadataTestCases(unittest.TestCase):
    def test_valid_path_wi_valid_metadata(self):
        """Loading A.csv must match a plain pandas read and register key 'ID'."""
        cm.del_catalog()
        loaded = read_csv_metadata(path_a)
        plain = pd.read_csv(path_a)
        self.assertEqual(loaded.equals(plain), True)
        self.assertEqual(cm.get_key(loaded), 'ID')

    def test_valid_path_candset_wi_valid_metadata(self):
        cm.del_catalog()
        A = read_csv_metadata(path_a)
# coding=utf-8
from __future__ import unicode_literals

import os
import unittest
import pandas as pd
from nose.tools import raises

from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata, _get_metadata_from_file
from py_entitymatching.utils.generic_helper import get_install_path, del_files_in_dir, creat_dir_ifnot_exists
import py_entitymatching.catalog.catalog_manager as cm
# Test-dataset locations for the CSV-metadata I/O tests; sndbx_path is a
# scratch directory for files the tests write.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
io_datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'io'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])
sndbx_path = os.sep.join(
    [os.sep.join([get_install_path(), 'tests', 'test_datasets']), 'sandbox'])


class ReadCSVMetadataTestCases(unittest.TestCase):
    def test_valid_path_wi_valid_metadata(self):
        """read_csv_metadata(path_a) equals pd.read_csv and sets key 'ID'."""
        cm.del_catalog()
        em_frame = read_csv_metadata(path_a)
        pd_frame = pd.read_csv(path_a)
        self.assertEqual(em_frame.equals(pd_frame), True)
        self.assertEqual(cm.get_key(em_frame), 'ID')

    def test_valid_path_candset_wi_valid_metadata(self):
        cm.del_catalog()
import py_entitymatching as em
from py_entitymatching.utils.generic_helper import get_install_path
import py_entitymatching.catalog.catalog_manager as cm
import py_entitymatching.utils.catalog_helper as ch
from py_entitymatching.io.parsers import read_csv_metadata

#import sys
#sys.path.insert(0, '../debugblocker')
#import debugblocker as db
import py_entitymatching.debugblocker.debugblocker as db

from operator import itemgetter
from array import array


datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
catalog_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'catalog'])
debugblocker_datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets', 'debugblocker'])

path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])

class DebugblockerTestCases(unittest.TestCase):
    def test_validate_types_1(self):
        A = read_csv_metadata(path_a, key='ID')
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_ID',
                fk_rtable='rtable_ID', key = '_id')
        A_key = em.get_key(A)
        B_key = em.get_key(B)
Пример #12
0
import six

from py_entitymatching.matcher.dtmatcher import DTMatcher
from py_entitymatching.matcher.linregmatcher import LinRegMatcher
from py_entitymatching.matcher.logregmatcher import LogRegMatcher
from py_entitymatching.matcher.nbmatcher import NBMatcher
from py_entitymatching.matcher.rfmatcher import RFMatcher
from py_entitymatching.matcher.svmmatcher import SVMMatcher

from py_entitymatching.io.parsers import read_csv_metadata
import py_entitymatching.matcher.matcherutils as mu
import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.utils.generic_helper import get_install_path, list_diff

# General test tables (A/B/C) plus the matcher-selector demo tables used to
# build feature vectors for the ML-matcher tests.
datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
path_a = os.sep.join([datasets_path, 'A.csv'])
path_b = os.sep.join([datasets_path, 'B.csv'])
path_c = os.sep.join([datasets_path, 'C.csv'])

feat_datasets_path = os.sep.join(
    [get_install_path(), 'tests', 'test_datasets', 'matcherselector'])
fpath_a = os.sep.join([feat_datasets_path, 'DBLP_demo.csv'])
fpath_b = os.sep.join([feat_datasets_path, 'ACM_demo.csv'])
fpath_c = os.sep.join([feat_datasets_path, 'dblp_acm_demo_labels.csv'])
fpath_f = os.sep.join([feat_datasets_path, 'feat_vecs.csv'])


class MLMatcherTestCases(unittest.TestCase):
    def test_valid_names_for_matchers(self):
        matchers1 = {
Пример #13
0
# Write the benchmarking functions here.                                        
# See "Writing benchmarks" in the asv docs for more information.

import os

import py_entitymatching as em
from py_entitymatching.utils.generic_helper import get_install_path

import sys

# Python 2 only: force UTF-8 as the default string encoding.
# reload(sys) re-exposes setdefaultencoding(), which site.py removes at
# startup; sys.version begins with the major version digit (e.g. "2.7...").
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Root of the installed package and the example datasets shipped with it.
PATH = get_install_path()
DATASET_PATH = os.sep.join([PATH, 'datasets', 'example_datasets'])


class TimeDownSampleRestaurants:
    def setup(self):
        # asv benchmark fixture: load the 'restaurants' example tables and
        # the parameters used by the timed down-sample runs.
        path_for_a = os.sep.join([DATASET_PATH, 'restaurants', 'A.csv'])
        path_for_b = os.sep.join([DATASET_PATH, 'restaurants', 'B.csv'])
        try:
            self.A = em.read_csv_metadata(path_for_a)
            self.B = em.read_csv_metadata(path_for_b)
            # Down-sampling parameters consumed by the benchmark methods.
            self.size = 500
            self.y_param = 2
        except AssertionError:
            # read_csv_metadata raises AssertionError when the file is
            # missing; tell the user where to get the data and abort.
            print("Dataset \'restaurants\' not found. Please visit the project website to download the dataset.")
            raise SystemExit