def test_rule_tokenizer_can_handle_long_text(self):
    """A long unicode (German) license text tokenizes to the expected stream."""
    expected = [
        u'ist', u'freie', u'software', u'sie', u'k\xf6nnen', u'es', u'unter',
        u'den', u'bedingungen', u'der', u'gnu', u'general', u'n', u'public',
        u'license', u'wie', u'von', u'der', u'free', u'software',
        u'foundation', u'ver\xf6ffentlicht', u'weitergeben', u'und', u'oder',
        u'n', u'modifizieren', u'entweder', u'gem\xe4\xdf', u'version', u'3',
        u'der', u'lizenz', u'oder', u'nach', u'ihrer', u'option', u'jeder',
        u'sp\xe4teren', u'n', u'version', u'n', u'n', u'die',
        u'ver\xf6ffentlichung', u'von', u'erfolgt', u'in', u'der',
        u'hoffnung', u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n',
        u'sein', u'wird', u'aber', u'ohne', u'irgendeine', u'garantie',
        u'sogar', u'ohne', u'die', u'implizite', u'garantie', u'der',
        u'marktreife', u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr',
        u'einen', u'bestimmten', u'zweck', u'details', u'finden', u'sie',
        u'in', u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit', u'n',
        u'erhalten', u'haben', u'falls', u'nicht', u'schreiben', u'sie',
        u'an', u'die', u'free', u'software', u'foundation', u'n', u'inc',
        u'51', u'franklin', u'st', u'fifth', u'floor', u'boston', u'ma',
        u'02110', u'usa',
    ]
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        tokens = list(rule_tokenizer(stream.read()))
    assert tokens == expected
def test_rule_tokenizer_can_process_multiple_templatized_parts_strip_multiple_contig_templates_and_leading_and_trailing(self):
    """Contiguous, leading and trailing {{templates}} are all stripped."""
    sample = u'''{{nexb}}{{nexb}}ab{{nexb Company}}{{nexb}}cd {{second}} {{nexb}} {{nexb}} {{nexb}}ef {{nexb}} '''
    tokens = list(rule_tokenizer(sample))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_can_parse_ill_formed_template_from_file(self):
    """Tokenizing an ill-formed template file matches the expected JSON fixture."""
    import json
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with codecs.open(test_file, 'rb', encoding='utf-8') as text:
        tokens = list(rule_tokenizer(text.read()))
    expected_file = self.get_test_loc('tokenize/ill_formed_template/expected.json')
    # flip regen to True to regenerate the expected JSON from the current result
    regen = False
    if regen:
        with codecs.open(expected_file, 'wb', encoding='utf-8') as out:
            json.dump(tokens, out, indent=2, separators=(',', ': '))
    with codecs.open(expected_file, encoding='utf-8') as inp:
        expected = json.load(inp, object_pairs_hook=OrderedDict)
    assert expected == tokens
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with codecs.open(tf, 'rb', encoding='utf-8') as text:
        content = text.read()
    # every tokenizer must finish well under the 5 seconds ceiling
    for tokenize in (rule_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def tokens(self, lower=True):
    """
    Return an iterable of token strings for this rule. Length is recomputed
    as a side effect. Tokens inside double curly braces (eg. {{ignored}})
    are skipped and ignored.
    """
    length = 0
    # fetch the text once: the original called self.text() a second time
    # below for tokenization; assumes text() is pure — TODO confirm
    text = self.text()
    stripped = text.strip()
    # FIXME: this is weird:
    # We tag this rule as being a bare URL if it starts with a scheme and is
    # on one line: this is used to determine a matching approach
    if stripped.startswith(('http://', 'https://', 'ftp://')) and '\n' not in stripped[:1000]:
        self.minimum_coverage = 100
    for token in rule_tokenizer(text, lower=lower):
        length += 1
        yield token
    self.length = length
    self.compute_relevance()
def tokens(self, lower=True):
    """
    Return an iterable of tokens and keep track of gaps by position.
    Gaps and length are recomputed. Tokens inside gaps are tracked but
    not part of the returned stream.
    """
    gap_positions = set()
    # Note: we track the position ourselves instead of enumerating because
    # positions abstract away gaps: only real tokens advance the position.
    next_pos = 0
    token_count = 0
    for tok in rule_tokenizer(self.text(), lower=lower):
        if tok is None:
            # a gap is recorded at the position of the preceding token
            gap_positions.add(next_pos - 1)
            continue
        token_count += 1
        yield tok
        next_pos += 1
    self.length = token_count
    self.gaps = gap_positions
    self.gaps_count = len(gap_positions)
def test_rule_tokenizer_handles_empty_templates(self):
    """An empty {{}} template is dropped from the token stream."""
    tokens = list(rule_tokenizer(u'ab{{}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_handles_template_with_spaces(self):
    """A template whose body is padded with spaces is still stripped."""
    tokens = list(rule_tokenizer(u'ab{{ 10 }}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(rule_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_merges_contiguous_gaps(self):
    """Back-to-back templates collapse into a single gap."""
    tokens = list(rule_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(rule_tokenizer('')) == []
def test_rule_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(rule_tokenizer('')) == []
def test_rule_tokenizer_handles_empty_templates(self):
    """An empty {{}} template is dropped from the token stream."""
    tokens = list(rule_tokenizer(u'ab{{}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_does_not_throw_exception_for_illegal_pystache_templates(self):
    """Ill-formed pystache-style template markup must not raise."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    tokens = list(rule_tokenizer(text))
    assert tokens
def test_rule_tokenizer_merges_contiguous_gaps(self):
    """Back-to-back templates collapse into a single gap."""
    tokens = list(rule_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_does_not_return_leading_and_trailing_gaps(self):
    """Templates before the first and after the last token leave no trace."""
    tokens = list(rule_tokenizer(u'{{xzy}}{{xzy}}abc{{temp}}def{{xzy}}{{xzy}}'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_can_split_templates(self):
    """Templates mixed with plain text are skipped while tokens are split."""
    tokens = list(rule_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(rule_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_handles_blank_lines(self):
    """Whitespace-only input yields no tokens."""
    assert list(rule_tokenizer(' \n\t ')) == []
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates_2(self):
    """Stray braces around a well-formed template leave only plain tokens."""
    tokens = list(rule_tokenizer(u'}}{{{{abcd}}ddd}}{{'))
    assert tokens == [u'ddd']
def test_rule_tokenizer_can_handle_long_text(self):
    """A long unicode (German) license text tokenizes to the expected stream."""
    expected = [
        u'ist', u'freie', u'software', u'sie', u'k\xf6nnen', u'es', u'unter',
        u'den', u'bedingungen', u'der', u'gnu', u'general', u'n', u'public',
        u'license', u'wie', u'von', u'der', u'free', u'software',
        u'foundation', u'ver\xf6ffentlicht', u'weitergeben', u'und', u'oder',
        u'n', u'modifizieren', u'entweder', u'gem\xe4\xdf', u'version', u'3',
        u'der', u'lizenz', u'oder', u'nach', u'ihrer', u'option', u'jeder',
        u'sp\xe4teren', u'n', u'version', u'n', u'n', u'die',
        u'ver\xf6ffentlichung', u'von', u'erfolgt', u'in', u'der',
        u'hoffnung', u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n',
        u'sein', u'wird', u'aber', u'ohne', u'irgendeine', u'garantie',
        u'sogar', u'ohne', u'die', u'implizite', u'garantie', u'der',
        u'marktreife', u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr',
        u'einen', u'bestimmten', u'zweck', u'details', u'finden', u'sie',
        u'in', u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit', u'n',
        u'erhalten', u'haben', u'falls', u'nicht', u'schreiben', u'sie',
        u'an', u'die', u'free', u'software', u'foundation', u'n', u'inc',
        u'51', u'franklin', u'st', u'fifth', u'floor', u'boston', u'ma',
        u'02110', u'usa',
    ]
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        tokens = list(rule_tokenizer(stream.read()))
    assert tokens == expected
def test_rule_tokenizer_handles_empty_lines(self):
    """Newline-only input yields no tokens."""
    assert list(rule_tokenizer(u'\n\n')) == []
def test_rule_and_query_tokenizer_have_the_same_behavior4(self):
    """Both tokenizers agree on mixed '+' and '-' separators."""
    text = 'license_Dual+BSD-GPL'
    expected = ['license_dual+bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def test_rule_tokenizer_handles_multi_word_templates(self):
    """A template containing several words is dropped as one gap."""
    tokens = list(rule_tokenizer(u'ab{{10 nexb Company}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_and_query_tokenizer_have_the_same_behavior2(self):
    """Both tokenizers agree on a full GPL-2.0 rule file."""
    rule_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(rule_file, encoding='utf-8') as stream:
        content = stream.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_rule_and_query_tokenizer_have_the_same_behavior3(self):
    """Both tokenizers agree on '=' and '/' separators."""
    text = 'license=Dual BSD/GPL'
    expected = ['license', 'dual', 'bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def test_rule_tokenizer_handles_template_with_spaces(self):
    """A template whose body is padded with spaces is still stripped."""
    tokens = list(rule_tokenizer(u'ab{{ 10 }}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_and_query_tokenizer_have_the_same_behavior_from_file(self):
    """Both tokenizers agree on a full GPL-2.0 rule file."""
    rule_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(rule_file, encoding='utf-8') as stream:
        content = stream.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_rule_tokenizer_can_process_multiple_templatized_parts(self):
    """Several templates in one text each become a skipped gap."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd {{second}}ef'))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_handles_blank_lines(self):
    """Whitespace-only input yields no tokens."""
    assert list(rule_tokenizer(' \n\t ')) == []
def test_rule_tokenizer_can_process_multiple_templatized_parts_with_default_gap_and_custom_gaps(self):
    """Default and sized (e.g. {{12 ...}}) templates are all skipped."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd{{12 second}}ef{{12 second}}gh'))
    assert tokens == [u'ab', u'cd', u'ef', u'gh']
def test_rule_tokenizer_can_split_templates(self):
    """Templates mixed with plain text are skipped while tokens are split."""
    tokens = list(rule_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_handles_empty_lines(self):
    """Newline-only input yields no tokens."""
    assert list(rule_tokenizer(u'\n\n')) == []
def test_rule_tokenizer_does_not_return_leading_and_trailing_gaps(self):
    """Templates before the first and after the last token leave no trace."""
    tokens = list(rule_tokenizer(u'{{xzy}}{{xzy}}abc{{temp}}def{{xzy}}{{xzy}}'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_can_parse_simple_line(self):
    """A plain sentence with an embedded template tokenizes without the gap."""
    tokens = list(rule_tokenizer(u'Licensed by {{12 nexB}} to you '))
    assert tokens == [u'licensed', u'by', u'to', u'you']
def test_rule_tokenizer_handles_multi_word_templates(self):
    """A template containing several words is dropped as one gap."""
    tokens = list(rule_tokenizer(u'ab{{10 nexb Company}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_does_not_throw_exception_for_illegal_pystache_templates(self):
    """Ill-formed pystache-style template markup must not raise."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    tokens = list(rule_tokenizer(text))
    assert tokens
def test_rule_tokenizer_can_process_multiple_templatized_parts(self):
    """Several templates in one text each become a skipped gap."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd {{second}}ef'))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode rule text must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        list(rule_tokenizer(stream.read()))
def test_rule_tokenizer_can_process_multiple_templatized_parts_with_default_gap_and_custom_gaps(self):
    """Default and sized (e.g. {{12 ...}}) templates are all skipped."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd{{12 second}}ef{{12 second}}gh'))
    assert tokens == [u'ab', u'cd', u'ef', u'gh']
def test_rule_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode rule text must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        list(rule_tokenizer(stream.read()))
def test_rule_tokenizer_can_parse_simple_line(self):
    """A plain sentence with an embedded template tokenizes without the gap."""
    tokens = list(rule_tokenizer(u'Licensed by {{12 nexB}} to you '))
    assert tokens == [u'licensed', u'by', u'to', u'you']
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # per-license-key message accumulators
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)
    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)
    for key, lic in licenses.items():
        # bound-method shorthands to record a message against this key
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append
        # required metadata presence checks
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        if not lic.owner:
            warn('No owner')
        # cross-license references must resolve within the mapping
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later and lic.base_license and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')
        # URLS dedupe and consistency
        if no_dupe_urls:
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')
            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')
            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')
            # NOTE(review): this list holds URLs, not licenses; name is misleading
            all_licenses = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_licenses.append(url)
            if not len(all_licenses) == len(set(all_licenses)):
                warn('Some duplicated URLs')
        # local text consistency: the two tokenizers differ only over
        # templated {{...}} regions, so a mismatch flags a templated text
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe
            by_text[license_qtokens].append(key + ': TEXT')
        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)
    # global SPDX consistency: an SPDX key should map to a single license
    multiple_spdx_keys_used = {k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append('SPDX key: ' + k + ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))
    # global text dedupe: identical token streams across licenses are errors
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append('Duplicate texts in multiple licenses:' + ', '.join(sorted(msgs)))
    # drop keys that accumulated no messages
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}
    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))
    return errors, warnings, infos
def test_rule_tokenizer_does_not_crash_with_non_well_formed_templatized_parts(self):
    """An unterminated {{ template degrades to plain tokens."""
    tokens = list(rule_tokenizer(u'abcd{{ddd'))
    assert tokens == [u'abcd', u'ddd']
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates_2(self):
    """Stray braces around a well-formed template leave only plain tokens."""
    tokens = list(rule_tokenizer(u'}}{{{{abcd}}ddd}}{{'))
    assert tokens == [u'ddd']
def test_rule_tokenizer_does_not_crash_with_non_well_formed_templatized_parts(self):
    """An unterminated {{ template degrades to plain tokens."""
    tokens = list(rule_tokenizer(u'abcd{{ddd'))
    assert tokens == [u'abcd', u'ddd']
def test_rule_tokenizer_can_parse_ill_formed_template(self):
    """An ill-formed template file still yields the full token count."""
    path = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with codecs.open(path, 'rb', encoding='utf-8') as text:
        tokens = list(rule_tokenizer(text.read()))
    assert len(tokens) == 3875
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates(self):
    """A valid template followed by a stray }} keeps surrounding tokens."""
    tokens = list(rule_tokenizer(u'ab{{c}}d}}ef'))
    assert tokens == [u'ab', u'd', u'ef']
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates(self):
    """A valid template followed by a stray }} keeps surrounding tokens."""
    tokens = list(rule_tokenizer(u'ab{{c}}d}}ef'))
    assert tokens == [u'ab', u'd', u'ef']
def test_rule_and_query_tokenizer_have_the_same_behavior1(self):
    """Both tokenizers agree on C-like source text."""
    text = 'MODULE_LICENSE("Dual BSD/GPL");'
    expected = ['module_license', 'dual', 'bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # per-license-key message accumulators
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)
    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)
    for key, lic in licenses.items():
        # bound-method shorthands to record a message against this key
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append
        # names
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        # consistency fix: the sibling validate() also checks for a missing owner
        if not lic.owner:
            warn('No owner')
        # cross-license references must resolve within the mapping
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later and lic.base_license and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')
        # URLS dedupe and consistency
        if no_dupe_urls:
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')
            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')
            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')
            # collect every URL of this license to detect duplicates
            all_urls = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_urls.append(url)
            if not len(all_urls) == len(set(all_urls)):
                warn('Some duplicated URLs')
        # local text consistency: the two tokenizers differ only over
        # templated {{...}} regions, so a mismatch flags a templated text
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe
            by_text[license_qtokens].append(key + ': TEXT')
        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)
    # global SPDX consistency: an SPDX key should map to a single license
    multiple_spdx_keys_used = {k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append('SPDX key: ' + k + ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))
    # global text dedupe: identical token streams across licenses are errors
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append('Duplicate texts in multiple licenses:' + ', '.join(sorted(msgs)))
    # drop keys that accumulated no messages
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}
    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))
    return errors, warnings, infos