def test_e001(self):
    """ Test: loading 'not_exist.xlsx' raises ValueError carrying the E001 message """
    with pytest.raises(ValueError) as raised:
        loader = ExcelCy()
        loader.load(file_path='not_exist.xlsx')
    # The captured exception is inspected only after the context manager exits.
    message = str(raised.value)
    assert message == Errors.E001
def test_readme_04(self):
    """ Test: code snippet found in README.rst (same gold item, two workbooks) """
    # First workbook: gold item 1.1 of train row 1 is 'Himalayas' tagged PRODUCT.
    first_workbook = self.get_test_data_path(fs_path='test_data_05.xlsx')
    excelcy = ExcelCy.execute(file_path=first_workbook)
    gold = excelcy.storage.train.items.get('1').items.get('1.1')
    assert gold.subtext == 'Himalayas'
    assert gold.entity == 'PRODUCT'

    # Second workbook: the same gold item is expected to be tagged FAC.
    second_workbook = self.get_test_data_path(fs_path='test_data_05a.xlsx')
    excelcy = ExcelCy.execute(file_path=second_workbook)
    gold = excelcy.storage.train.items.get('1').items.get('1.1')
    assert gold.subtext == 'Himalayas'
    assert gold.entity == 'FAC'
def prepare_excelcy_data():
    """Configure an ExcelCy storage, run discover and prepare, and save it.

    The storage is written to ``train_model.xlsx`` under
    ``constants.MODEL_DATA_DIR`` with the phases discover, prepare, train and
    retest registered and ``prepare_enabled`` switched off.
    """
    excelcy = ExcelCy()
    add_stopwords(excelcy.nlp)

    storage = excelcy.storage
    storage.config = Config(nlp_base='en_core_web_lg', train_iteration=20, train_drop=0.2)
    storage.base_path = str(constants.MODEL_DATA_DIR)

    # Discover from the single training text source.
    storage.source.add(kind='textract', value='[base_path]/source/training_text.txt')
    excelcy.discover()

    # Register every prepare workbook in the original order, then run prepare.
    prepare_files = (
        '[base_path]/prepare/pers.xlsx',
        '[base_path]/prepare/orgs.xlsx',
        '[base_path]/prepare/locs.xlsx',
        '[base_path]/prepare/ships.xlsx',
        '[base_path]/prepare/misc.xlsx',
    )
    for prepare_file in prepare_files:
        excelcy.storage.prepare.add(kind='file', value=prepare_file, entity='')
    excelcy.prepare()

    for phase_name in ('discover', 'prepare', 'train', 'retest'):
        excelcy.storage.phase.add(phase_name)

    excelcy.storage.config.prepare_enabled = False
    target = constants.MODEL_DATA_DIR / 'train_model.xlsx'
    excelcy.save_storage(str(target))
def test_readme_02(self):
    """ Test: code snippet found in README.rst (first entity of the sample is ORG) """
    workbook = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=workbook)
    parsed = excelcy.nlp('Google rebrands its business apps')
    first_entity = parsed.ents[0]
    assert first_entity.label_ == 'ORG'
def test_save(self):
    """ Test: save training, then load the saved file back """
    source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=source_path)

    # Write to the tmp location and read the same file again.
    saved_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
    excelcy.save(file_path=saved_path)
    excelcy.load(file_path=saved_path)
def assert_training(self, file_path: str, entity_tests: dict = None):
    """Execute the workbook and check predictions against its gold data.

    For every train row, each (text, label) entity found by the pipeline
    must also appear among that row's gold (subtext, entity) pairs.
    ``entity_tests`` is accepted but not used by this check.
    """
    excelcy = ExcelCy.execute(file_path=file_path)
    nlp = excelcy.nlp
    for _, train in excelcy.storage.train.items.items():
        gold_pairs = {(gold.subtext, gold.entity) for _, gold in train.items.items()}
        doc = nlp(train.text)
        predicted_pairs = {(ent.text, ent.label_) for ent in doc.ents}
        # Any prediction that is not part of the gold data fails the check.
        unexpected = predicted_pairs - gold_pairs
        assert not unexpected
def test_readme_03(self):
    """ Test: code snippet found in README.rst (discover, prepare, train from code) """
    excelcy = ExcelCy()
    storage = excelcy.storage
    storage.base_path = self.test_data_path
    storage.config = Config(nlp_base='en_core_web_sm', train_iteration=2, train_drop=0.2)
    storage.source.add(kind='textract', value='source/source_01.txt')
    storage.prepare.add(kind='phrase', value='Uber', entity='ORG')

    # Run the three phases in order.
    excelcy.discover()
    excelcy.prepare()
    excelcy.train()

    doc = excelcy.nlp('Uber blew through $1 million a week')
    assert doc.ents[0].label_ == 'ORG'
def test_save(self):
    """ Test: storage extracted before saving equals storage extracted after reloading """
    source_path = self.get_test_data_path(fs_path='test_data_01.xlsx')
    excelcy = ExcelCy.execute(file_path=source_path)

    saved_path = self.get_test_tmp_path(fs_path='test_data_01.xlsx')
    excelcy.save_storage(file_path=saved_path)
    before_reload = self.extract_storage(storage=excelcy.storage)

    # Load the file that was just written and extract again.
    excelcy.load(file_path=saved_path)
    after_reload = self.extract_storage(storage=excelcy.storage)

    assert before_reload == after_reload
def test_matcher(self):
    """ Test: Matcher (phrase and regex patterns both label as PRODUCT) """
    excelcy = ExcelCy()
    excelcy.storage.config = Config(nlp_base='en_core_web_sm', train_iteration=2, train_drop=0.2)
    nlp = excelcy.create_nlp()

    phrase_pattern = {'kind': 'phrase', 'value': 'thisisrandom', 'entity': 'PRODUCT'}
    regex_pattern = {'kind': 'regex', 'value': 'thatis(.+)', 'entity': 'PRODUCT'}
    matcher = MatcherPipe(nlp=nlp, patterns=[phrase_pattern, regex_pattern])
    nlp.add_pipe(matcher)

    doc = nlp('thisisrandom thatisrandom')
    # Checked one after the other, in the same order as the original compound assert.
    assert doc.ents[0].label_ == 'PRODUCT'
    assert doc.ents[1].label_ == 'PRODUCT'
def assert_training(self, file_path: str, entity_tests: dict = None):
    """Execute the workbook and check that gold entities are predicted.

    For every train row, the row's gold (subtext, entity) pairs must all be
    among the (text, label) pairs the pipeline finds in the row text. When
    ``entity_tests`` has an entry for the row index, that entry must also
    be a subset of the predictions.
    """
    excelcy = ExcelCy.execute(file_path=file_path)
    nlp = excelcy.nlp
    extra_expectations = entity_tests or {}
    for idx, train in excelcy.storage.train.items.items():
        gold_pairs = {(gold.subtext, gold.entity) for _, gold in train.items.items()}
        doc = nlp(train.text)
        predicted_pairs = {(ent.text, ent.label_) for ent in doc.ents}

        # Verify against the workbook's own gold data.
        assert gold_pairs <= predicted_pairs

        # Verify against caller-supplied expectations (empty set when none given).
        expected_extra = extra_expectations.get(idx, set())
        assert expected_extra <= predicted_pairs
def train_excelcy(save=False):
    """Execute the training workbook and print SHIP and PERSON entities of book 1.

    When ``save`` is truthy the pipeline is also written to
    ``constants.MODEL_DIR`` via ``save_nlp``.
    """
    excelcy = ExcelCy()
    add_stopwords(excelcy.nlp)

    workbook = constants.MODEL_DATA_DIR / 'train_model.xlsx'
    # NOTE(review): the return value of execute() is discarded here; if execute
    # is a factory that returns a new ExcelCy, the instance used below is not
    # the executed one. Confirm against the ExcelCy API.
    excelcy.execute(str(workbook))
    if save:
        excelcy.save_nlp(str(constants.MODEL_DIR))

    book_text = load_book_by_nr(1).content()
    doc = excelcy.nlp(book_text)

    # Ship names are collected with every 'the ' / 'The ' occurrence removed.
    ships = {re.sub('[tT]he ', '', ent.text) for ent in doc.ents if ent.label_ == 'SHIP'}
    persons = {ent.text for ent in doc.ents if ent.label_ == 'PERSON'}
    print(ships)
    print(persons)
from excelcy import ExcelCy
from excelcy.storage import Config

# Scratch experiment kept for reference (disabled):
# test_string = 'Android Pay expands to Canada'
# excelcy = ExcelCy()
# excelcy.storage.config = Config(nlp_base='en_core_web_sm', train_iteration=50, train_drop=0.2)
# doc = excelcy.nlp(test_string)
# # showing no ORG
# print([(ent.label_, ent.text) for ent in doc.ents])
# excelcy.storage.source.add(kind='text', value=test_string)
# excelcy.discover()
# excelcy.storage.prepare.add(kind='phrase', value='Android Pay', entity='PRODUCT')
# excelcy.prepare()
# excelcy.train()
# doc = excelcy.nlp(test_string)
# print([(ent.label_, ent.text) for ent in doc.ents])

# Recorded test failures:
# FAILED tests/test_excelcy.py::ExcelCyTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_pipe.py::PipeTestCase::test_execute - AssertionError: assert ('$1', 'MONEY') in {('$1 million', 'MONEY'), ('Uber', 'ORG')}
# FAILED tests/test_readme.py::ReadmeTestCase::test_readme_04 - AssertionError: assert ('China' == 'Himalayas'

sentence = 'Android Pay expands to Canada'

# Entities from a freshly constructed ExcelCy.
excelcy = ExcelCy()
doc = excelcy.nlp(sentence)
print([(ent.label_, ent.text) for ent in doc.ents])

# Entities after executing the test_data_03 workbook.
excelcy = ExcelCy.execute(file_path='tests/data/test_data_03.xlsx')
doc = excelcy.nlp(sentence)
print([(ent.label_, ent.text) for ent in doc.ents])
def main(argv: list = None):
    """Quick CLI entry point.

    :param argv: argument list laid out like ``sys.argv`` (program name first,
        then the command, then its arguments). Falls back to ``sys.argv``
        when omitted or empty.
    :return: None

    Supported command: ``execute <file_path>``, which runs
    ``ExcelCy.execute`` on the given file. Any other command is ignored.
    A missing command or missing file path prints a usage line to stderr
    instead of raising IndexError.
    """
    args = argv or sys.argv
    program = args[0] if args else 'prog'
    usage = 'usage: %s execute <file_path>' % program

    # No command given: nothing to dispatch on.
    if len(args) < 2:
        print(usage, file=sys.stderr)
        return

    if args[1] == 'execute':
        # 'execute' needs the path of the file to run.
        if len(args) < 3:
            print(usage, file=sys.stderr)
            return
        ExcelCy.execute(file_path=args[2])