示例#1
0
    def test_matched_query_text_tokenizer_works_with_spdx_ids(self):
        """
        Check matched_query_text_tokenizer on two SPDX-License-Identifier lines.

        The tokenizer output is compared against (flag, text) pairs: in the
        expected data below the flag is True for word-like fragments and
        False for the punctuation and whitespace between them. The test then
        checks that joining every non-empty group yielded by
        tokens_and_non_tokens gives back the input text unchanged.
        """
        text = u''' * SPDX-License-Identifier: GPL-2.0+    BSD-3-Clause
         * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
        '''
        expected = [
            # first line: two identifiers separated by plain whitespace
            (False, u' * '),
            (True, u'SPDX'),
            (False, u'-'),
            (True, u'License'),
            (False, u'-'),
            (True, u'Identifier'),
            (False, u': '),
            (True, u'GPL'),
            (False, u'-'),
            (True, u'2'),
            (False, u'.'),
            (True, u'0+'),
            (False, u'    '),
            (True, u'BSD'),
            (False, u'-'),
            (True, u'3'),
            (False, u'-'),
            (True, u'Clause'),
            # second line: a parenthesized OR expression
            (False, u'\n         * '),
            (True, u'SPDX'),
            (False, u'-'),
            (True, u'License'),
            (False, u'-'),
            (True, u'Identifier'),
            (False, u': ('),
            (True, u'BSD'),
            (False, u'-'),
            (True, u'3'),
            (False, u'-'),
            (True, u'Clause'),
            (False, u' '),
            (True, u'OR'),
            (False, u' '),
            (True, u'EPL'),
            (False, u'-'),
            (True, u'1'),
            (False, u'.'),
            (True, u'0'),
            (False, u' '),
            (True, u'OR'),
            (False, u' '),
            (True, u'Apache'),
            (False, u'-'),
            (True, u'2'),
            (False, u'.'),
            (True, u'0'),
            (False, u' '),
            (True, u'OR'),
            (False, u' '),
            (True, u'MIT'),
            (False, u')\n        '),
        ]
        assert list(matched_query_text_tokenizer(text)) == expected

        # Round trip: every character of the input must show up in exactly
        # one non-empty group of the yielded matches.
        fragments = []
        for match in tokens_and_non_tokens(text):
            fragments.extend(
                value for value in match.groupdict().values() if value)
        assert u''.join(fragments) == text
    def test_tokens_and_non_tokens_yield_properly_all_texts(self):
        """
        Check the groups yielded by tokens_and_non_tokens on a text that mixes
        '+' signs, punctuation, non-ASCII letters and assorted whitespace.

        Each yielded match is compared through its groupdict(), which is
        expected to hold a 'token' and a 'punct' key with exactly one of them
        set. The test also checks that concatenating all the non-empty groups
        rebuilds the original text.
        """
        def token(value):
            # expected groupdict of a match that captured a token
            return {'punct': None, 'token': value}

        def punct(value):
            # expected groupdict of a match that captured a non-token
            return {'punct': value, 'token': None}

        text = u'''Redistribution+ ;and use in! + 2003 source and +binary forms,
        ()with or without modifi+cation, are permitted with İrəli .\t\n
        \r'''
        expected = [
            token('Redistribution+'),
            punct(' ;'),
            token('and'),
            punct(' '),
            token('use'),
            punct(' '),
            token('in'),
            punct('! + '),
            token('2003'),
            punct(' '),
            token('source'),
            punct(' '),
            token('and'),
            punct(' +'),
            token('binary'),
            punct(' '),
            token('forms'),
            punct(',\n        ()'),
            token('with'),
            punct(' '),
            token('or'),
            punct(' '),
            token('without'),
            punct(' '),
            token('modifi+cation'),
            punct(', '),
            token('are'),
            punct(' '),
            token('permitted'),
            punct(' '),
            token('with'),
            punct(' '),
            token('İrəli'),
            punct(' .\t\n\n        \r'),
        ]
        groups = [match.groupdict() for match in tokens_and_non_tokens(text)]
        assert groups == expected

        # Round trip: joining the non-empty groups must give the input back.
        rebuilt = u''.join(
            value
            for match in tokens_and_non_tokens(text)
            for value in match.groupdict().values()
            if value
        )
        assert rebuilt == text
    def test_tokens_and_non_tokens_yield_properly_all_texts(self):
        """
        Check the groups yielded by tokens_and_non_tokens on a short license
        sentence followed by tab, newline and carriage return characters.

        Each yielded match is compared through its groupdict(), and the
        non-empty groups are then joined to verify that the original text is
        rebuilt exactly.
        """
        def token(value):
            # expected groupdict of a match that captured a token
            return {u'punct': None, u'token': value}

        def punct(value):
            # expected groupdict of a match that captured a non-token
            return {u'punct': value, u'token': None}

        # NOTE: the first line of this literal ends with a significant space.
        text = u'''Redistribution+ ;and use in! 2003 source and binary forms, 
        ()with or without modification, are permitted.\t\n
        \r'''
        expected = [
            token(u'Redistribution+'),
            punct(u' ;'),
            token(u'and'),
            punct(u' '),
            token(u'use'),
            punct(u' '),
            token(u'in'),
            punct(u'! '),
            token(u'2003'),
            punct(u' '),
            token(u'source'),
            punct(u' '),
            token(u'and'),
            punct(u' '),
            token(u'binary'),
            punct(u' '),
            token(u'forms'),
            punct(u', \n        ()'),
            token(u'with'),
            punct(u' '),
            token(u'or'),
            punct(u' '),
            token(u'without'),
            punct(u' '),
            token(u'modification'),
            punct(u', '),
            token(u'are'),
            punct(u' '),
            token(u'permitted'),
            punct(u'.\t\n\n        \r'),
        ]
        groups = [match.groupdict() for match in tokens_and_non_tokens(text)]
        assert expected == groups

        # Round trip: joining the non-empty groups must give the input back.
        non_empty_groups = (
            [value for value in match.groupdict().values() if value]
            for match in tokens_and_non_tokens(text)
        )
        rebuilt = u''.join(itertools.chain.from_iterable(non_empty_groups))
        assert text == rebuilt
    def test_tokens_and_non_tokens_yield_properly_all_texts(self):
        """
        Check the groups yielded by tokens_and_non_tokens on a short license
        sentence followed by tab, newline and carriage return characters.

        In the expected data every token is followed by exactly one non-token
        run, so the expectation is written as (token, punct) pairs and
        expanded into the groupdict() form before comparing. The non-empty
        groups are then joined to verify the original text is rebuilt.
        """
        # NOTE: the first line of this literal ends with a significant space.
        text = u'''Redistribution+ ;and use in! 2003 source and binary forms, 
        ()with or without modification, are permitted.\t\n
        \r'''
        token_then_punct = [
            (u'Redistribution+', u' ;'),
            (u'and', u' '),
            (u'use', u' '),
            (u'in', u'! '),
            (u'2003', u' '),
            (u'source', u' '),
            (u'and', u' '),
            (u'binary', u' '),
            (u'forms', u', \n        ()'),
            (u'with', u' '),
            (u'or', u' '),
            (u'without', u' '),
            (u'modification', u', '),
            (u'are', u' '),
            (u'permitted', u'.\t\n\n        \r'),
        ]
        expected = []
        for word, separator in token_then_punct:
            expected.append({u'punct': None, u'token': word})
            expected.append({u'punct': separator, u'token': None})

        groups = [match.groupdict() for match in tokens_and_non_tokens(text)]
        assert expected == groups

        # Round trip: joining the non-empty groups must give the input back.
        fragments = []
        for match in tokens_and_non_tokens(text):
            for value in match.groupdict().values():
                if value:
                    fragments.append(value)
        assert text == u''.join(fragments)