def test_persist(self):
    """Concepts analyzed once must be served from the persisted cache.

    A second analyzer pointed at the same persist_dir should return the
    original counts even after the input files are overwritten with
    unrelated text.
    """
    # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
    docs = ['Lorem ipsum',
            'Lorem Lorem ipsum Dolor sit AMET',
            'consectetur adipisici elit']
    thesaurus = {
        '13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'],
                    'related': ['0r'], 'narrower': ['0n'],
                    'altLabel': []},
        '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'],
                    'related': ['1r'], 'narrower': ['1n'],
                    'altLabel': ['amet']},
    }
    vocabulary = {'13542-1': 1, '13542-4': 0}
    tempdir_data = mkdtemp()
    fnames = []
    for doc in docs:
        # Close each temp file after writing so its content is flushed to
        # disk before the analyzer reads it (leaving it open risks an
        # empty read due to buffering).
        with NamedTemporaryFile(mode='w', delete=False,
                                dir=tempdir_data) as file:
            fnames.append(file.name)
            print(doc, file=file)
    tempdir = mkdtemp()
    cf = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir,
                         input='filename', file_path=tempdir_data)
    for fname in fnames:
        cf.analyze(fname)
    cf.persistence_file.close()

    # Second analyzer must hit the persisted results: overwrite the files
    # with garbage so any re-analysis would yield different counts.
    cf2 = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir,
                          input='filename', file_path=tempdir_data)
    for fname in fnames:
        with open(fname, mode='w') as file:
            print('bullshit', file=file)
    counter = CountVectorizer(analyzer=cf2.analyze, vocabulary=vocabulary)
    res = counter.fit_transform(fnames).todense()
    print(res)
    np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
def test_speed(self):
    """Print the mean wall-clock time of repeated analyze() calls.

    Builds one large document, then times 10 analyze() runs against it
    using the full econ62k thesaurus. NOTE(review): this is a benchmark,
    not an assertion-based test.
    """
    doc = self.text() * 200
    tempdir_data = mkdtemp()
    # Close the temp file so the document is flushed to disk before
    # analyze() reads it (the original left the handle open).
    with NamedTemporaryFile(mode='w', delete=False,
                            dir=tempdir_data) as file:
        print(doc, file=file)
    _, _, tr = load_dataset({
        'econ62k': {
            "X": "../../../Resources/Goldstandard/formatted_econbiz-annotation-62k-titles.csv",
            "y": "../../../Resources/Goldstandard/econbiz-stw9-formatted.csv",
            "thes": "../../../Resources/Ontologies/stw.json"
        }
    })
    cf = ConceptAnalyzer(tr.thesaurus, persist=True, persist_dir=mkdtemp(),
                         input='filename', file_path=tempdir_data)
    times = []
    for _ in range(10):
        start = default_timer()
        cf.analyze(file.name)
        times.append(default_timer() - start)
    print(np.round(np.mean(times), decimals=2))
def test_simple_transform(self):
    """analyze() maps matched labels (pref and alt) to their concept ids."""
    # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
    doc = 'Lorem Lorem ipsum dolor sit amet'
    thesaurus = {
        '00': {'prefLabel': ['ipsum'], 'broader': ['0b'],
               'related': ['0r'], 'narrower': ['0n'],
               'altLabel': []},
        '01': {'prefLabel': ['dolor'], 'broader': ['1b'],
               'related': ['1r'], 'narrower': ['1n'],
               'altLabel': ['amet']},
    }
    cf = ConceptAnalyzer(thesaurus)
    # 'dolor' and 'amet' both resolve to '01' (amet is an altLabel).
    expected = ['00', '01', '01']
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(expected, cf.analyze(doc))
def test_persist(self):
    """Concepts analyzed once must be served from the persisted cache.

    A second analyzer pointed at the same persist_dir should return the
    original counts even after the input files are overwritten with
    unrelated text.
    """
    # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
    docs = [
        'Lorem ipsum',
        'Lorem Lorem ipsum Dolor sit AMET',
        'consectetur adipisici elit'
    ]
    thesaurus = {
        '13542-1': {
            'prefLabel': ['ipsum'],
            'broader': ['0b'],
            'related': ['0r'],
            'narrower': ['0n'],
            'altLabel': []
        },
        '13542-4': {
            'prefLabel': ['dolor'],
            'broader': ['1b'],
            'related': ['1r'],
            'narrower': ['1n'],
            'altLabel': ['amet']
        },
    }
    vocabulary = {'13542-1': 1, '13542-4': 0}
    tempdir_data = mkdtemp()
    fnames = []
    for doc in docs:
        # Close each temp file after writing so its content is flushed to
        # disk before the analyzer reads it (leaving it open risks an
        # empty read due to buffering).
        with NamedTemporaryFile(mode='w', delete=False,
                                dir=tempdir_data) as file:
            fnames.append(file.name)
            print(doc, file=file)
    tempdir = mkdtemp()
    cf = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir,
                         input='filename', file_path=tempdir_data)
    for fname in fnames:
        cf.analyze(fname)
    cf.persistence_file.close()

    # Second analyzer must hit the persisted results: overwrite the files
    # with garbage so any re-analysis would yield different counts.
    cf2 = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir,
                          input='filename', file_path=tempdir_data)
    for fname in fnames:
        with open(fname, mode='w') as file:
            print('bullshit', file=file)
    counter = CountVectorizer(analyzer=cf2.analyze, vocabulary=vocabulary)
    res = counter.fit_transform(fnames).todense()
    print(res)
    np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
def test_vocabulary_with_entity_ids(self):
    """Counts from the analyzer line up with a vocabulary keyed by entity ids.

    The vocabulary maps concept ids to column indices, so matches for
    '13542-4' land in column 0 and matches for '13542-1' in column 1.
    """
    corpus = ['Lorem ipsum',
              'Lorem Lorem ipsum Dolor sit AMET',
              'consectetur adipisici elit']
    concepts = {
        '13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'],
                    'related': ['0r'], 'narrower': ['0n'],
                    'altLabel': []},
        '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'],
                    'related': ['1r'], 'narrower': ['1n'],
                    'altLabel': ['amet']},
    }
    vocab = {'13542-1': 1, '13542-4': 0}
    analyzer = ConceptAnalyzer(concepts)
    vectorizer = CountVectorizer(analyzer=analyzer.analyze, vocabulary=vocab)
    matrix = vectorizer.fit_transform(corpus).todense()
    np.testing.assert_array_almost_equal(matrix, [[0, 1], [2, 1], [0, 0]])
def test_read_files(self):
    """With input='filename', analyzer and vectorizer read docs from disk."""
    docs = [
        'Lorem ipsum',
        'Lorem Lorem ipsum Dolor sit AMET',
        'consectetur adipisici elit'
    ]
    thesaurus = {
        '13542-1': {
            'prefLabel': ['ipsum'],
            'broader': ['0b'],
            'related': ['0r'],
            'narrower': ['0n'],
            'altLabel': []
        },
        '13542-4': {
            'prefLabel': ['dolor'],
            'broader': ['1b'],
            'related': ['1r'],
            'narrower': ['1n'],
            'altLabel': ['amet']
        },
    }
    vocabulary = {'13542-1': 1, '13542-4': 0}
    fnames = []
    for doc in docs:
        # Close each temp file after writing so its content is flushed to
        # disk before the vectorizer reads it (leaving it open risks an
        # empty read due to buffering).
        with NamedTemporaryFile(mode='w', delete=False) as file:
            fnames.append(file.name)
            print(doc, file=file)
    cf = ConceptAnalyzer(thesaurus, input='filename')
    counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary,
                              input='filename')
    res = counter.fit_transform(fnames).todense()
    np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
def test_speed(self):
    """Print the mean wall-clock time of repeated analyze() calls.

    Builds one large document, then times 10 analyze() runs against it
    using the full econ62k thesaurus. NOTE(review): this is a benchmark,
    not an assertion-based test.
    """
    doc = self.text() * 200
    tempdir_data = mkdtemp()
    # Close the temp file so the document is flushed to disk before
    # analyze() reads it (the original left the handle open).
    with NamedTemporaryFile(mode='w', delete=False,
                            dir=tempdir_data) as file:
        print(doc, file=file)
    _, _, tr = load_dataset({'econ62k': {
        "X": "../../../Resources/Goldstandard/formatted_econbiz-annotation-62k-titles.csv",
        "y": "../../../Resources/Goldstandard/econbiz-stw9-formatted.csv",
        "thes": "../../../Resources/Ontologies/stw.json"
    }})
    cf = ConceptAnalyzer(tr.thesaurus, persist=True, persist_dir=mkdtemp(),
                         input='filename', file_path=tempdir_data)
    times = []
    for _ in range(10):
        start = default_timer()
        cf.analyze(file.name)
        times.append(default_timer() - start)
    print(np.round(np.mean(times), decimals=2))
def test_simple_transform(self):
    """analyze() maps matched labels (pref and alt) to their concept ids."""
    # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
    doc = 'Lorem Lorem ipsum dolor sit amet'
    thesaurus = {
        '00': {
            'prefLabel': ['ipsum'],
            'broader': ['0b'],
            'related': ['0r'],
            'narrower': ['0n'],
            'altLabel': []
        },
        '01': {
            'prefLabel': ['dolor'],
            'broader': ['1b'],
            'related': ['1r'],
            'narrower': ['1n'],
            'altLabel': ['amet']
        },
    }
    cf = ConceptAnalyzer(thesaurus)
    # 'dolor' and 'amet' both resolve to '01' (amet is an altLabel).
    expected = ['00', '01', '01']
    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(expected, cf.analyze(doc))