def test_matched_query_text_tokenizer_works_with_spdx_ids(self):
    """matched_query_text_tokenizer() yields (is_token, text) pairs for
    SPDX-License-Identifier lines, splitting identifiers on punctuation
    while preserving every non-token character of the input.
    """
    # Two SPDX tag lines as they would appear in a source-file comment.
    text = (
        u' * SPDX-License-Identifier: GPL-2.0+ BSD-3-Clause\n'
        u' * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)\n'
        u' '
    )
    expected = [
        (False, u' * '),
        (True, u'SPDX'),
        (False, u'-'),
        (True, u'License'),
        (False, u'-'),
        (True, u'Identifier'),
        (False, u': '),
        (True, u'GPL'),
        (False, u'-'),
        (True, u'2'),
        (False, u'.'),
        (True, u'0+'),
        (False, u' '),
        (True, u'BSD'),
        (False, u'-'),
        (True, u'3'),
        (False, u'-'),
        (True, u'Clause'),
        (False, u'\n * '),
        (True, u'SPDX'),
        (False, u'-'),
        (True, u'License'),
        (False, u'-'),
        (True, u'Identifier'),
        (False, u': ('),
        (True, u'BSD'),
        (False, u'-'),
        (True, u'3'),
        (False, u'-'),
        (True, u'Clause'),
        (False, u' '),
        (True, u'OR'),
        (False, u' '),
        (True, u'EPL'),
        (False, u'-'),
        (True, u'1'),
        (False, u'.'),
        (True, u'0'),
        (False, u' '),
        (True, u'OR'),
        (False, u' '),
        (True, u'Apache'),
        (False, u'-'),
        (True, u'2'),
        (False, u'.'),
        (True, u'0'),
        (False, u' '),
        (True, u'OR'),
        (False, u' '),
        (True, u'MIT'),
        (False, u')\n '),
    ]
    result = list(matched_query_text_tokenizer(text))
    assert result == expected

    # Concatenating every matched group must rebuild the input verbatim:
    # the tokenizer may not drop or alter a single character.
    reassembled = u''.join(
        itertools.chain.from_iterable(
            [value for value in match.groupdict().values() if value]
            for match in tokens_and_non_tokens(text)
        )
    )
    assert reassembled == text
def test_tokens_and_non_tokens_yield_properly_all_texts_with_plus_and_unicode(self):
    """tokens_and_non_tokens() handles '+'-suffixed tokens, embedded '+'
    punctuation and non-ASCII (Unicode) words, yielding alternating
    token/punct groupdicts that cover the input exactly.

    NOTE(review): renamed from test_tokens_and_non_tokens_yield_properly_all_texts
    because another method later in this class carried the identical name;
    the later definition shadowed this one, so this test never ran.
    """
    # Exercises: trailing '+', leading '+', mid-word '+', digits, and the
    # Unicode word 'İrəli', plus a tab/newline/CR tail.
    text = (
        u'Redistribution+ ;and use in! + 2003 source and +binary forms,\n'
        u' ()with or without modifi+cation, are permitted with İrəli .\t\n\n \r'
    )
    result = [m.groupdict() for m in tokens_and_non_tokens(text)]
    expected = [
        {'punct': None, 'token': 'Redistribution+'},
        {'punct': ' ;', 'token': None},
        {'punct': None, 'token': 'and'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'use'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'in'},
        {'punct': '! + ', 'token': None},
        {'punct': None, 'token': '2003'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'source'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'and'},
        {'punct': ' +', 'token': None},
        {'punct': None, 'token': 'binary'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'forms'},
        {'punct': ',\n ()', 'token': None},
        {'punct': None, 'token': 'with'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'or'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'without'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'modifi+cation'},
        {'punct': ', ', 'token': None},
        {'punct': None, 'token': 'are'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'permitted'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'with'},
        {'punct': ' ', 'token': None},
        {'punct': None, 'token': 'İrəli'},
        {'punct': ' .\t\n\n \r', 'token': None},
    ]
    assert result == expected

    # Concatenating every matched group must rebuild the input verbatim.
    result_as_text = u''.join(itertools.chain.from_iterable(
        [v for v in m.groupdict().values() if v]
        for m in tokens_and_non_tokens(text)))
    assert result_as_text == text
def test_tokens_and_non_tokens_yield_properly_all_texts(self):
    """tokens_and_non_tokens() must alternate token and punctuation
    matches whose groupdicts, when concatenated, rebuild the input exactly.
    """
    text = (
        u'Redistribution+ ;and use in! 2003 source and binary forms, \n'
        u' ()with or without modification, are permitted.\t\n\n \r'
    )
    # Each pair is (token, punct); exactly one of the two is non-None.
    pairs = [
        (u'Redistribution+', None),
        (None, u' ;'),
        (u'and', None),
        (None, u' '),
        (u'use', None),
        (None, u' '),
        (u'in', None),
        (None, u'! '),
        (u'2003', None),
        (None, u' '),
        (u'source', None),
        (None, u' '),
        (u'and', None),
        (None, u' '),
        (u'binary', None),
        (None, u' '),
        (u'forms', None),
        (None, u', \n ()'),
        (u'with', None),
        (None, u' '),
        (u'or', None),
        (None, u' '),
        (u'without', None),
        (None, u' '),
        (u'modification', None),
        (None, u', '),
        (u'are', None),
        (None, u' '),
        (u'permitted', None),
        (None, u'.\t\n\n \r'),
    ]
    expected = [{u'punct': punct, u'token': token} for token, punct in pairs]

    result = [match.groupdict() for match in tokens_and_non_tokens(text)]
    assert result == expected

    # Joining every non-empty group must reproduce the input verbatim.
    reassembled = u''.join(itertools.chain.from_iterable(
        [value for value in match.groupdict().values() if value]
        for match in tokens_and_non_tokens(text)))
    assert reassembled == text