import os
import shutil
import tempfile

# NOTE: The exact import paths below are assumptions inferred from how the
# tests use these helpers; adjust them to the actual package layout.
from expanda.ext import namuwiki, wikipedia
from expanda.tokenization import train_tokenizer, tokenize_corpus
from expanda.utils import random_filename, random_filenames


def test_splitting_sentences_namuwiki():
    # Use a temporary directory since mocking is hard to apply to
    # `_tokenize_sentences_worker`.
    input_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Write a dummy document to `input_file`.
    dummy = '안녕하세요. 반갑습니다! 어떠신가요? 괜찮습니다...ㅎㅎ\n\n'
    with open(input_file, 'w') as fp:
        fp.write(dummy)

    # Split the sentences.
    namuwiki._tokenize_sentences_worker(
        input_file, output_file, min_len=0, max_len=100)

    # Check that the sentences are split correctly.
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert len(lines) == 4
        assert ' '.join([line.strip() for line in lines]) == dummy.strip()

    # Check that splitting into chunks works as well.
    namuwiki._tokenize_sentences_worker(
        input_file, output_file, 0, 100, split_sent=False)
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert lines[0].strip() == dummy.strip()

    # Remove created temporary files.
    os.remove(input_file)
    os.remove(output_file)
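

# For reference, a minimal sketch of the sentence-splitting step the test
# above exercises, assuming the Korean splitting is backed by the `kss`
# package (an assumption; the real logic lives in
# `namuwiki._tokenize_sentences_worker`). The helper name is hypothetical
# and illustrative only.
def _split_korean_sentences_sketch(input_file, output_file):
    import kss

    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        for paragraph in src:
            # `kss.split_sentences` returns a list of sentence strings;
            # for the dummy document above it should yield the 4 sentences
            # the test asserts.
            for sentence in kss.split_sentences(paragraph.strip()):
                dst.write(sentence + '\n')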


def test_tokenizing_corpus_well():
    # Use a temporary directory since `tokenizers` does not support
    # mocking.
    input_file = random_filename(tempfile.gettempdir())
    vocab_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Copy the dummy corpus file to `input_file`.
    shutil.copyfile('tests/res/wikipedia.plain.txt', input_file)

    # First, train the tokenizer that will tokenize the corpus.
    train_tokenizer(input_file, vocab_file, tempfile.gettempdir(),
                    vocab_size=100)

    # Next, tokenize the given corpus with the trained tokenizer.
    tokenize_corpus(input_file, output_file, vocab_file)

    # Check that the corpus is tokenized well: tokenization is applied per
    # line, so the number of lines must be preserved.
    with open(output_file, 'r') as output, \
            open('tests/res/wikipedia.plain.txt', 'r') as dummy:
        assert (output.read().strip().count('\n')
                == dummy.read().strip().count('\n'))

    # Remove created temporary files.
    os.remove(input_file)
    os.remove(vocab_file)
    os.remove(output_file)
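

# The line-count assertion above holds because corpus tokenization works
# line by line. A minimal sketch of that behaviour, assuming a WordPiece
# vocabulary as produced by `train_tokenizer` and the `tokenizers` package
# (the helper name is hypothetical):
def _tokenize_lines_sketch(input_file, output_file, vocab_file):
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer(vocab_file)
    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        for line in src:
            # Each input line maps to exactly one output line of
            # space-separated subword tokens.
            tokens = tokenizer.encode(line.strip()).tokens
            dst.write(' '.join(tokens) + '\n')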


def test_splitting_sentences_wikipedia():
    # Use a temporary directory since mocking is hard to apply to
    # `_tokenize_sentences_worker`.
    input_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Write a dummy document to `input_file`.
    dummy = 'Nice to meet you Dr. John. Welcome! How are you?\n\n'
    with open(input_file, 'w') as fp:
        fp.write(dummy)

    # Split the sentences.
    wikipedia._prepare_tokenizing_sentences('en')
    wikipedia._tokenize_sentences_worker(input_file, output_file, 'en',
                                         0, 100)

    # Check that the sentences are split correctly. Note that the
    # abbreviation in `Dr. John.` must not terminate a sentence.
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert len(lines) == 3
        assert ' '.join([line.strip() for line in lines]) == dummy.strip()

    # Check that splitting into chunks works as well.
    wikipedia._tokenize_sentences_worker(input_file, output_file, 'en',
                                         0, 100, split_sent=False)
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert lines[0].strip() == dummy.strip()

    # Remove created temporary files.
    os.remove(input_file)
    os.remove(output_file)
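

# A minimal sketch of the English splitting behaviour asserted above,
# assuming it is backed by NLTK's punkt model (an assumption; the real
# logic lives in `wikipedia._tokenize_sentences_worker`). Punkt knows
# common abbreviations, so `Dr.` stays inside a single sentence.
def _split_english_sentences_sketch(text):
    import nltk

    nltk.download('punkt', quiet=True)
    # For the dummy document above this is expected to return
    # ['Nice to meet you Dr. John.', 'Welcome!', 'How are you?'].
    return nltk.tokenize.sent_tokenize(text, language='english')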


def test_generate_correct_filenames():
    # Check for single generation.
    assert len(random_filename('')) == 16
    assert random_filename('parent').startswith('parent')

    # Check for multiple generations.
    filenames = random_filenames('', n=4)
    assert len(filenames) == 4
    for name in filenames:
        assert len(name) == 16

    filenames = random_filenames('parent', n=4)
    for name in filenames:
        assert name.startswith('parent')
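

# A minimal sketch of what `random_filename` presumably does, consistent
# with the assertions above: a 16-character random basename joined to the
# given parent directory (the implementation below is an assumption, not
# the library's actual code).
def _random_filename_sketch(parent):
    import random
    import string

    basename = ''.join(random.choices(string.ascii_lowercase, k=16))
    # `os.path.join('', basename)` is just `basename`, which is why the
    # empty-parent case yields a 16-character name.
    return os.path.join(parent, basename)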


def test_training_tokenizer_well():
    # Use a temporary directory since `tokenizers` does not support
    # mocking.
    input_file = random_filename(tempfile.gettempdir())
    vocab_file = random_filename(tempfile.gettempdir())

    # Copy the dummy corpus file to `input_file`.
    shutil.copyfile('tests/res/wikipedia.plain.txt', input_file)

    # Train the tokenizer with the dummy corpus file.
    train_tokenizer(input_file, vocab_file, tempfile.gettempdir(),
                    vocab_size=100)

    # Check that the tokenizer is trained well: the vocabulary file should
    # contain exactly `vocab_size` tokens, one per line.
    with open(vocab_file, 'r') as fp:
        assert len(fp.readlines()) == 100

    # Remove created temporary files.
    os.remove(input_file)
    os.remove(vocab_file)
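

# A minimal sketch of the training step the test above relies on, assuming
# `train_tokenizer` wraps a WordPiece trainer from the `tokenizers` package
# (an assumption; argument handling is simplified here). The saved
# vocabulary has one token per line, which is why the test can assert
# exactly 100 lines for `vocab_size=100`.
def _train_tokenizer_sketch(input_file, vocab_file, temp_dir, vocab_size):
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[input_file], vocab_size=vocab_size)
    # `save_model` writes `vocab.txt` into `temp_dir`; move it to the
    # requested vocabulary path afterwards.
    tokenizer.save_model(temp_dir)
    shutil.move(os.path.join(temp_dir, 'vocab.txt'), vocab_file)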