def test_persist(self):
    """Persisted analysis results are reused by a second analyzer instance.

    Docs are written to files and analyzed with a persisting analyzer; the
    files are then overwritten with garbage.  A second analyzer pointed at
    the same persist_dir must still yield the original counts, proving it
    served the cached results rather than re-reading the mangled files.
    """
    # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
    docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
    thesaurus = {'13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
                             'narrower': ['0n'], 'altLabel': []},
                 '13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
                             'narrower': ['1n'], 'altLabel': ['amet']},
                 }
    vocabulary = {'13542-1': 1, '13542-4': 0}
    tempdir_data = mkdtemp()
    fnames = []
    for doc in docs:
        # delete=False keeps the file on disk after the handle is closed;
        # closing (via `with`) flushes the buffered doc so the analyzer
        # actually sees the content when it reads the file back.
        with NamedTemporaryFile(mode='w', delete=False, dir=tempdir_data) as file:
            fnames.append(file.name)
            print(doc, file=file)
    tempdir = mkdtemp()
    cf = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir, input='filename', file_path=tempdir_data)
    for fname in fnames:
        cf.analyze(fname)
    # Flush the persistence store so cf2 can read the cached results.
    cf.persistence_file.close()
    cf2 = ConceptAnalyzer(thesaurus, persist=True, persist_dir=tempdir, input='filename', file_path=tempdir_data)
    # Mangle the source files: correct counts can now only come from the cache.
    for fname in fnames:
        with open(fname, mode='w') as file:
            print('bullshit', file=file)
    counter = CountVectorizer(analyzer=cf2.analyze, vocabulary=vocabulary)
    res = counter.fit_transform(fnames).todense()
    print(res)
    np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
 def test_speed(self):
     doc = self.text() * 200
     tempdir_data = mkdtemp()
     file = NamedTemporaryFile(mode='w', delete=False, dir=tempdir_data)
     print(doc, file=file)
     _, _, tr = load_dataset({
         'econ62k': {
             "X":
             "../../../Resources/Goldstandard/formatted_econbiz-annotation-62k-titles.csv",
             "y":
             "../../../Resources/Goldstandard/econbiz-stw9-formatted.csv",
             "thes": "../../../Resources/Ontologies/stw.json"
         }
     })
     cf = ConceptAnalyzer(tr.thesaurus,
                          persist=True,
                          persist_dir=mkdtemp(),
                          input='filename',
                          file_path=tempdir_data)
     times = []
     for _ in range(10):
         start = default_timer()
         cf.analyze(file.name)
         times.append(default_timer() - start)
     print(np.round(np.mean(times), decimals=2))
 def test_simple_transform(self):
     # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
     doc = 'Lorem Lorem ipsum dolor sit amet'
     thesaurus = {'00': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
                         'narrower': ['0n'], 'altLabel': []},
                  '01': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
                         'narrower': ['1n'], 'altLabel': ['amet']},
                  }
     cf = ConceptAnalyzer(thesaurus)
     expected = ['00', '01', '01']
     self.assertEquals(expected, cf.analyze(doc))
 def test_persist(self):
     # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
     docs = [
         'Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET',
         'consectetur adipisici elit'
     ]
     thesaurus = {
         '13542-1': {
             'prefLabel': ['ipsum'],
             'broader': ['0b'],
             'related': ['0r'],
             'narrower': ['0n'],
             'altLabel': []
         },
         '13542-4': {
             'prefLabel': ['dolor'],
             'broader': ['1b'],
             'related': ['1r'],
             'narrower': ['1n'],
             'altLabel': ['amet']
         },
     }
     vocabulary = {'13542-1': 1, '13542-4': 0}
     tempdir_data = mkdtemp()
     fnames = []
     for doc in docs:
         file = NamedTemporaryFile(mode='w', delete=False, dir=tempdir_data)
         fnames.append(file.name)
         print(doc, file=file)
     tempdir = mkdtemp()
     cf = ConceptAnalyzer(thesaurus,
                          persist=True,
                          persist_dir=tempdir,
                          input='filename',
                          file_path=tempdir_data)
     for fname in fnames:
         cf.analyze(fname)
     cf.persistence_file.close()
     cf2 = ConceptAnalyzer(thesaurus,
                           persist=True,
                           persist_dir=tempdir,
                           input='filename',
                           file_path=tempdir_data)
     for fname in fnames:
         with open(fname, mode='w') as file:
             print('bullshit', file=file)
     counter = CountVectorizer(analyzer=cf2.analyze, vocabulary=vocabulary)
     res = counter.fit_transform(fnames).todense()
     print(res)
     np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
 def test_vocabulary_with_entity_ids(self):
     docs = [
         'Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET',
         'consectetur adipisici elit'
     ]
     thesaurus = {
         '13542-1': {
             'prefLabel': ['ipsum'],
             'broader': ['0b'],
             'related': ['0r'],
             'narrower': ['0n'],
             'altLabel': []
         },
         '13542-4': {
             'prefLabel': ['dolor'],
             'broader': ['1b'],
             'related': ['1r'],
             'narrower': ['1n'],
             'altLabel': ['amet']
         },
     }
     vocabulary = {'13542-1': 1, '13542-4': 0}
     cf = ConceptAnalyzer(thesaurus)
     counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary)
     res = counter.fit_transform(docs).todense()
     np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
 def test_read_files(self):
     docs = [
         'Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET',
         'consectetur adipisici elit'
     ]
     thesaurus = {
         '13542-1': {
             'prefLabel': ['ipsum'],
             'broader': ['0b'],
             'related': ['0r'],
             'narrower': ['0n'],
             'altLabel': []
         },
         '13542-4': {
             'prefLabel': ['dolor'],
             'broader': ['1b'],
             'related': ['1r'],
             'narrower': ['1n'],
             'altLabel': ['amet']
         },
     }
     vocabulary = {'13542-1': 1, '13542-4': 0}
     fnames = []
     for doc in docs:
         file = NamedTemporaryFile(mode='w', delete=False)
         fnames.append(file.name)
         print(doc, file=file)
     cf = ConceptAnalyzer(thesaurus, input='filename')
     counter = CountVectorizer(analyzer=cf.analyze,
                               vocabulary=vocabulary,
                               input='filename')
     res = counter.fit_transform(fnames).todense()
     np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
 def test_speed(self):
     doc = self.text() * 200
     tempdir_data = mkdtemp()
     file = NamedTemporaryFile(mode='w', delete=False, dir=tempdir_data)
     print(doc, file=file)
     _, _, tr = load_dataset({'econ62k': {
         "X": "../../../Resources/Goldstandard/formatted_econbiz-annotation-62k-titles.csv",
         "y": "../../../Resources/Goldstandard/econbiz-stw9-formatted.csv",
         "thes": "../../../Resources/Ontologies/stw.json"
     }})
     cf = ConceptAnalyzer(tr.thesaurus, persist=True, persist_dir=mkdtemp(), input='filename',
                          file_path=tempdir_data)
     times = []
     for _ in range(10):
         start = default_timer()
         cf.analyze(file.name)
         times.append(default_timer() - start)
     print(np.round(np.mean(times), decimals=2))
 def test_simple_transform(self):
     # alphabetic: adipisici amet consectetur dolor elit ipsum lorem sit
     doc = 'Lorem Lorem ipsum dolor sit amet'
     thesaurus = {
         '00': {
             'prefLabel': ['ipsum'],
             'broader': ['0b'],
             'related': ['0r'],
             'narrower': ['0n'],
             'altLabel': []
         },
         '01': {
             'prefLabel': ['dolor'],
             'broader': ['1b'],
             'related': ['1r'],
             'narrower': ['1n'],
             'altLabel': ['amet']
         },
     }
     cf = ConceptAnalyzer(thesaurus)
     expected = ['00', '01', '01']
     self.assertEquals(expected, cf.analyze(doc))