def test_no_readline():
    class InvalidSource:
        def tell(self):
            return 0
        def seek(self, value):
            pass
    with pytest.raises(TypeError):
        list(woosh.tokenize(InvalidSource()))

def test_no_seek():
    class InvalidSource:
        def tell(self):
            return 0
        def readline(self, size=-1):
            return b''
    with pytest.raises(TypeError):
        list(woosh.tokenize(InvalidSource()))

def test_readline_incorrect_type(invalid):
    class InvalidSource:
        def tell(self):
            return 0
        def readline(self, size=-1):
            return invalid
        def seek(self, value):
            pass
    with pytest.raises(TypeError) as exinfo:
        list(woosh.tokenize(InvalidSource()))

def test_readline_error(error):
    class InvalidSource:
        def tell(self):
            return 0
        def readline(self, size=-1):
            raise error
        def seek(self, value):
            pass
    with pytest.raises(type(error)) as exinfo:
        list(woosh.tokenize(InvalidSource()))
    assert exinfo.value is error

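# The `invalid` and `error` arguments above are injected by the test harness;
# the decorators are not shown in this excerpt. A minimal sketch of how they
# might be parametrized (the values below are illustrative assumptions, not
# the project's actual ones):
#
# @pytest.mark.parametrize('invalid', [None, 1.0, 'not-bytes', ['list']])
# def test_readline_incorrect_type(invalid): ...
#
# @pytest.mark.parametrize('error', [ValueError('boom'), OSError('io failed')])
# def test_readline_error(error): ...
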
def test_invalid_args():
    with pytest.raises(TypeError):
        woosh.tokenize()
    # continue_on_error is keyword-only, so passing it positionally fails
    with pytest.raises(TypeError):
        woosh.tokenize(b'', True)
    # source is positional-only, so passing it by keyword fails
    with pytest.raises(TypeError):
        woosh.tokenize(source=b'', continue_on_error=True)

def test_weird_readline(good_line_count, weird):
    class InvalidSource:
        def __init__(self):
            # some number of well-formed lines, then a weird value
            self.q = ([b'123'] * good_line_count) + [weird]
        def tell(self):
            return 0
        def readline(self, size=-1):
            return self.q.pop(0)
        def seek(self, value):
            pass
    with pytest.raises(TypeError) as exinfo:
        list(woosh.tokenize(InvalidSource()))

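# `good_line_count` and `weird` are likewise harness-injected. A plausible
# (assumed) parametrization would mix several prefix lengths with several
# non-bytes values, so the TypeError is exercised both on the first read and
# mid-stream:
#
# @pytest.mark.parametrize('good_line_count', [0, 1, 10])
# @pytest.mark.parametrize('weird', [None, 1.0, 'str'])
# def test_weird_readline(good_line_count, weird): ...
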
def test_no_cycle():
    tokenizer = woosh.tokenize(b'hello world')
    weak_tokenizer = weakref.ref(tokenizer)
    token = next(tokenizer)
    weak_token = weakref.ref(token)
    weak_type = weakref.ref(token.type)
    # the tokenizer must not keep the tokens it yields alive...
    del token
    gc.collect()
    assert weak_token() is None
    # ...though the shared token type object outlives the token itself
    assert weak_type() is not None
    # nor may a yielded token keep the tokenizer alive
    token = next(tokenizer)
    del tokenizer
    gc.collect()
    assert weak_tokenizer() is None

def test_source_tokenizer_readline_cycle():
    class Source:
        def tell(self):
            return 0
        def seek(self, index):
            pass

    class CycleReadline:
        def __init__(self):
            self.data = [b'', b'hello world']
        def __call__(self, bytes=0):
            try:
                return self.data.pop(0)
            except IndexError:
                return b''

    # build a reference cycle: source -> readline -> tokenizer -> source
    source = Source()
    source.readline = CycleReadline()
    tokenizer = woosh.tokenize(source)
    source.readline.tokenizer = tokenizer
    del source

    weak_tokenizer = weakref.ref(tokenizer)
    token = next(tokenizer)
    weak_token = weakref.ref(token)
    weak_type = weakref.ref(token.type)
    del token
    gc.collect()
    assert weak_token() is None
    assert weak_type() is not None
    # despite the cycle, the garbage collector must be able to reclaim the
    # tokenizer once the last external reference is dropped
    token = next(tokenizer)
    del tokenizer
    gc.collect()
    assert weak_tokenizer() is None

SAMPLE_DIR = ROOT / 'sample'

test_files = []
for directory, _, files in os.walk(SAMPLE_DIR):
    directory = pathlib.Path(directory)
    for sample_file_name in files:
        sample_file = (directory / sample_file_name).resolve()
        rel = len(str(
            pathlib.PurePosixPath(directory.relative_to(ROOT))
        ).split('/'))
        sample_file_relative_sample = pathlib.PurePosixPath(
            sample_file.relative_to(SAMPLE_DIR)
        )
        if sample_file.suffix != '.py':
            continue
        with open(sample_file, 'rb') as f:
            tokens = list(woosh.tokenize(f))
        expected = '\n'.join(
            f'    woosh.Token(woosh.{token.type}, {token.value!r}, '
            f'{token.start_line}, {token.start_column}, '
            f'{token.end_line}, {token.end_column}),'
            for token in tokens
        )
        template = textwrap.dedent(f"""
            # this file was generated using test/python/sample/generate.py

            # python
            import io
            import pathlib
            # pytest
            import pytest
            # woosh
            import woosh

            def tokenize_file_like(source):
# this script is used as the data set to generate the profile data for profile
# guided optimization of the c extension

# woosh
import woosh
# python
import io
import os
import pathlib

DATA = (pathlib.Path(__file__).parent.absolute() / 'sample').resolve()

for directory, _, files in os.walk(DATA):
    if directory.endswith('contrived'):
        continue
    directory = pathlib.Path(directory)
    for file in files:
        if not file.endswith('.py'):
            continue
        data_file = directory / file
        with open(data_file, 'rb') as f:
            source_bytes = f.read()
        source_file_like = io.BytesIO(source_bytes)
        print(data_file.relative_to(DATA))
        # exercise both input paths so each is covered by the profile
        list(woosh.tokenize(source_bytes))
        list(woosh.tokenize(source_file_like))

def tokenize_bytes(source, continue_on_error=False):
    return list(woosh.tokenize(source, continue_on_error=continue_on_error))

def tokenize_file_like(source, continue_on_error=False):
    return list(woosh.tokenize(
        io.BytesIO(source),
        continue_on_error=continue_on_error,
    ))

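# Illustrative usage of the two helpers above (the source literal is an
# assumption): generated sample tests call both, so the bytes path and the
# file-like path are checked against the same expected token list.
#
# tokens = tokenize_bytes(b'hello world\n')
# tokens_via_file = tokenize_file_like(b'hello world\n')
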
def _(source, source_file):
    # drain the tokenizer; the tokens themselves are discarded
    for token in tokenize(source_file):
        pass

def test_incorrect_type(bad_source):
    with pytest.raises(TypeError):
        list(woosh.tokenize(bad_source))

def tokenize_file_like(source):
    return list(woosh.tokenize(io.BytesIO(source)))

def tokenize_bytes(source):
    return list(woosh.tokenize(source))

def tokenize_bytes(source):
    return list(woosh.tokenize(source, continue_on_error=True))