def test_big_vocab_single_expect_true(self):
    """Run the vocabulary accumulator and check its output file is listed.

    Calls ``vocabulary_accumulator`` with
    ``params_big_files.tok_kwargs_single_vocab`` and then asserts that
    ``'VOCAB_single.txt'`` appears in the listing of
    ``params_big_files.tok_write``.
    """
    accumulator = txtPY.big_text_files()
    single_vocab_kwargs = params_big_files.tok_kwargs_single_vocab
    accumulator.vocabulary_accumulator(**single_vocab_kwargs)  # saves to folder
    written_names = os.listdir(params_big_files.tok_write)
    assert 'VOCAB_single.txt' in written_names
def test_big_parser_expect_true(self):
    """Run the big-text parser and check both batch files are listed.

    Calls ``big_text_parser`` with ``params_big_files.tok_kwargs_parse`` and
    then asserts that ``batch1.txt`` and ``batch2.txt`` both appear in the
    listing of ``params_big_files.tok_write``.
    """
    parser = txtPY.big_text_files()
    parse_kwargs = params_big_files.tok_kwargs_parse
    parser.big_text_parser(**parse_kwargs)  # saves to folder
    written_names = os.listdir(params_big_files.tok_write)
    expected_batches = ('batch1.txt', 'batch2.txt')
    for batch_name in expected_batches:
        assert batch_name in written_names
def test_big_splitter_error_handling(self):
    """Check ``big_text_splitter`` raises the expected error for each case.

    For every index of ``params_big_files.tok_kwargs_error_handling`` the
    kwargs stored at that index are passed to ``big_text_splitter``. The call
    must raise, and the text of the raised exception must contain the entry
    of ``params_big_files.list_of_error_messages`` at the same index.
    """
    splitter = txtPY.big_text_files()
    case_count = len(params_big_files.tok_kwargs_error_handling)
    for case_no in range(case_count):
        with pytest.raises(Exception) as raised:
            splitter.big_text_splitter(
                **params_big_files.tok_kwargs_error_handling[case_no])
        # The message check lives after the ``with`` block: anything placed
        # after the raising call inside the block would never be reached.
        expected_message = params_big_files.list_of_error_messages[case_no]
        assert expected_message in str(raised.value)
def test_big_vocab_error_handling(self):
    """Check ``vocabulary_accumulator`` raises the expected error per case.

    For every index of ``params_big_files.tok_kwg_VOCAB_error_handling`` the
    kwargs stored at that index are passed to ``vocabulary_accumulator``. The
    call must raise, and the text of the raised exception must contain the
    entry of ``params_big_files.list_of_error_messages_tok_VOCAB`` at the
    same index.
    """
    accumulator = txtPY.big_text_files()
    case_count = len(params_big_files.tok_kwg_VOCAB_error_handling)
    for case_no in range(case_count):
        with pytest.raises(Exception) as raised:
            accumulator.vocabulary_accumulator(
                **params_big_files.tok_kwg_VOCAB_error_handling[case_no])
        # The message check lives after the ``with`` block: anything placed
        # after the raising call inside the block would never be reached.
        expected_message = (
            params_big_files.list_of_error_messages_tok_VOCAB[case_no])
        assert expected_message in str(raised.value)
def test_big_tokenizer_expect_true(self):
    """Run the big-text tokenizer in three configurations and check outputs.

    First the tokenizer is called with ``tok_kwargs_bigf`` and then with
    ``tok_kwargs_bigf_userdef``; afterwards ``batch1.txt`` and ``batch2.txt``
    must be listed in ``params_big_files.tok_write``. The tokenizer is then
    called with ``tok_kwargs_bigf_vocab`` and the same two names must be
    listed in ``params_big_files.tok_vocab``.
    """
    expected_batches = ('batch1.txt', 'batch2.txt')
    tokenizer = txtPY.big_text_files()

    # Default and user-defined runs; both are checked against tok_write.
    tokenizer.big_text_tokenizer(
        **params_big_files.tok_kwargs_bigf)  # saves to folder
    tokenizer.big_text_tokenizer(
        **params_big_files.tok_kwargs_bigf_userdef)  # saves to folder
    written_names = os.listdir(params_big_files.tok_write)
    for batch_name in expected_batches:
        assert batch_name in written_names

    # Vocabulary run; checked against the tok_vocab directory instead.
    tokenizer.big_text_tokenizer(
        **params_big_files.tok_kwargs_bigf_vocab)  # saves to folder
    vocab_names = os.listdir(params_big_files.tok_vocab)
    for batch_name in expected_batches:
        assert batch_name in vocab_names