def test_query_tokenizer_behavior_from_file(self, regen=False):
    """
    Tokenize the FreeRTOS GPL-2.0 rule text and compare the tokens to the
    expected JSON file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc(
        'tokenize/freertos/gpl-2.0-freertos.RULE')
    with io.open(test_file, encoding='utf-8') as test:
        text = test.read()
    result = list(query_tokenizer(text))
    expected_file = test_file + '.json'
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)

    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)

    assert expected == list(query_tokenizer(text))
def test_query_tokenizer_lines_on_html_like_texts_2(
    self, regen=REGEN_TEST_FIXTURES,
):
    """Tokenize an HTML-like file line by line and check against expected JSON."""
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.query_tokenizer.json'
    tokenized = []
    for _ln, line in query_lines(test_file):
        tokenized.append(list(query_tokenizer(line)))
    check_results(tokenized, expected_file, regen=regen)
def test_query_tokenizer_handles_rarer_unicode_codepoints(self):
    """Rare symbols such as the heart codepoint are dropped, not tokenized."""
    # NOTE: we are not catching the heart as a proper token, but this is
    # rare enough that we do not care
    text = '♡ Copying Art is an act of love. Love is not subject to law.'
    expected = [
        u'copying', u'art', u'is', u'an', u'act', u'of', u'love',
        u'love', u'is', u'not', u'subject', u'to', u'law',
    ]
    tokens = list(query_tokenizer(text))
    assert tokens == expected
def test_query_tokenizer_can_handle_long_text(self):
    """A long German unicode text tokenizes into the expected lowercased tokens."""
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with io.open(test_file, encoding='utf-8') as inp:
        tokens = list(query_tokenizer(inp.read()))

    expected = [
        u'pychess', u'ist', u'freie', u'software', u'sie', u'k\xf6nnen',
        u'es', u'unter', u'den', u'bedingungen', u'der', u'gnu',
        u'general', u'n', u'public', u'license', u'wie', u'von', u'der',
        u'free', u'software', u'foundation', u'ver\xf6ffentlicht',
        u'weitergeben', u'und', u'oder', u'n', u'modifizieren',
        u'entweder', u'gem\xe4\xdf', u'version', u'3', u'der', u'lizenz',
        u'oder', u'nach', u'ihrer', u'option', u'jeder', u'sp\xe4teren',
        u'n', u'version', u'n', u'n', u'die', u'ver\xf6ffentlichung',
        u'von', u'pychess', u'erfolgt', u'in', u'der', u'hoffnung',
        u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n', u'sein',
        u'wird', u'aber', u'ohne', u'irgendeine', u'garantie', u'sogar',
        u'ohne', u'die', u'implizite', u'garantie', u'der', u'marktreife',
        u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr', u'einen',
        u'bestimmten', u'zweck', u'details', u'finden', u'sie', u'in',
        u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit',
        u'pychess', u'n', u'erhalten', u'haben', u'falls', u'nicht',
        u'schreiben', u'sie', u'an', u'die', u'free', u'software',
        u'foundation', u'n', u'inc', u'51', u'franklin', u'st', u'fifth',
        u'floor', u'boston', u'ma', u'02110', u'usa',
    ]
    assert tokens == expected
def test_query_tokenizer_behavior_from_file(self, regen=False):
    """Tokenize a FreeRTOS GPL-2.0 rule file and check against its expected JSON."""
    test_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with io.open(test_file, encoding='utf-8') as inp:
        content = inp.read()
    tokens = list(query_tokenizer(content))
    check_results(tokens, test_file + '.json', regen=regen)
def tokens(self):
    """
    Return an iterable of token strings for this rule. Length, relevance and
    minimum_coverage may be recomputed as a side effect.
    """
    # Fetch the rule text once; the original called self.text() twice.
    text = self.text()
    stripped = text.strip()

    # FIXME: this is weird:
    # We tag this rule as being a bare URL if it starts with a scheme and is
    # on one line: this is used to determine a matching approach
    # FIXME: this does not lower the text first??
    if stripped.startswith(('http://', 'https://', 'ftp://')) and '\n' not in stripped[:1000].lower():
        self.minimum_coverage = 100

    # Count tokens while yielding them so self.length is set when the
    # generator is exhausted.
    length = 0
    for token in query_tokenizer(text):
        length += 1
        yield token

    self.length = length
    self.compute_relevance()
def test_query_tokenizer_can_parse_ill_formed_legacy_template_from_file(
    self, regen=False,
):
    """An ill-formed legacy template file still tokenizes to the expected JSON."""
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    expected_file = self.get_test_loc(
        'tokenize/ill_formed_template/expected.json')
    with io.open(test_file, encoding='utf-8') as inp:
        tokens = list(query_tokenizer(inp.read()))
    check_results(tokens, expected_file, regen=regen)
def test_query_tokenizer_can_split(self):
    """Tokens are split on whitespace/newlines and lowercased."""
    tokens = list(query_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_and_query_tokenizer_have_the_same_behavior(self):
    """rule_tokenizer and query_tokenizer agree on several sample texts."""
    cases = [
        ('MODULE_LICENSE("Dual BSD/GPL");', ['module', 'license', 'dual', 'bsd', 'gpl']),
        ('Dual BSD/GPL', ['dual', 'bsd', 'gpl']),
        ('license=Dual BSD/GPL', ['license', 'dual', 'bsd', 'gpl']),
        ('license_Dual+BSD-GPL', ['license', 'dual+bsd', 'gpl']),
    ]
    for text, expected in cases:
        assert list(rule_tokenizer(text)) == expected
        assert list(query_tokenizer(text)) == expected
def test_query_tokenizer_can_split_legacy_templates(self):
    """Legacy {{}} template markers are dropped but their inner text is kept."""
    tokens = list(query_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'temp', u'ghi']
def get_all_spdx_key_tokens(licenses):
    """
    Yield token strings collected from a `licenses` iterable of license
    objects' SPDX license keys.
    """
    # Essential SPDX tokens come first, then the tokens of every known key.
    for essential in get_essential_spdx_tokens():
        yield essential

    for spdx_key in get_all_spdx_keys(licenses):
        for tok in query_tokenizer(spdx_key):
            yield tok
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    """The tokenizers finish quickly on a previously pathological JS file."""
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with io.open(tf, encoding='utf-8') as inp:
        content = inp.read()

    # The original ran query_tokenizer twice, then the matched-text tokenizer.
    for tokenize in (query_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def test_matched_query_text_tokenizer_and_query_tokenizer_should_yield_the_same_texts(
    self,
):
    """Compare the token streams of both tokenizers on the same text."""
    text = u'''Redistribution+ ;and use in! + 2003 source and +binary forms, ()with or without modifi+cation, are permitted with İrəli .\t\n \r'''

    matched_tokens = []
    for is_tok, tok in matched_query_text_tokenizer(text):
        if is_tok:
            matched_tokens.append(tok)
    query_tokens = list(query_tokenizer(text))

    mqtt_expected = [
        'Redistribution+', 'and', 'use', 'in', '2003', 'source', 'and',
        'binary', 'forms', 'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with', 'İrəli',
    ]
    qt_expected = [
        'redistribution+', 'and', 'use', 'in', '2003', 'source', 'and',
        'binary', 'forms', 'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with',
        # this is NOT the same as above...
        # See https://github.com/nexB/scancode-toolkit/issues/1872
        'i', 'rəli',
    ]
    assert matched_tokens == mqtt_expected
    assert query_tokens == qt_expected
def test_query_tokenizer_handles_rarer_unicode_typographic_quotes(self):
    """Typographic quote characters act as token separators."""
    text = 'a “bar” is “open„ not “closed” ‘free‚ not ‘foo’ „Gänsefüßchen“'
    expected = [
        'a', 'bar', 'is', 'open', 'not', 'closed',
        'free', 'not', 'foo', 'gänsefüßchen',
    ]
    assert list(query_tokenizer(text)) == expected
def test_query_tokenizer(self):
    """Punctuation is dropped and tokens are lowercased on a BSD-style text."""
    text = u'''Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.'''
    expected = u'''redistribution and use in source and binary forms with or without modification are permitted provided that the following conditions are met redistributions of source code must retain the above copyright notice this list of conditions and the following disclaimer'''.split()
    tokens = list(query_tokenizer(text))
    assert tokens == expected
def test_query_tokenizer_can_parse_ill_formed_legacy_template_from_file(
    self, regen=False):
    """
    Tokenize an ill-formed legacy template file and compare to the expected
    JSON file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with io.open(test_file, encoding='utf-8') as text:
        result = list(query_tokenizer(text.read()))
    expected_file = self.get_test_loc(
        'tokenize/ill_formed_template/expected.json')
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as ex:
            json.dump(result, ex, indent=2, separators=(',', ': '))

    with io.open(expected_file, encoding='utf-8') as ex:
        expected = json.load(ex, object_pairs_hook=OrderedDict)

    assert expected == result
def test_query_tokenizer(self):
    """The sample BSD text yields exactly 39 lowercased tokens."""
    text = u'''Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.'''
    result = list(query_tokenizer(text))
    assert len(result) == 39

    expected = u'''redistribution and use in source and binary forms with or without modification are permitted provided that the following conditions are met redistributions of source code must retain the above copyright notice this list of conditions and the following disclaimer'''.split()
    assert result == expected
def test_query_tokenizer_lines_on_html_like_texts_2(self, regen=False):
    """
    Tokenize an HTML-like file line by line and compare to the expected JSON
    file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.tokenized_lines.json'
    lines = query_lines(test_file)
    tokens = list(list(query_tokenizer(line)) for _ln, line in lines)
    # we dumps/loads to normalize tuples/etc
    result = json.loads(json.dumps(tokens))
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)

    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)

    assert expected == result
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    """All three tokenizers finish quickly on a previously pathological JS file."""
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with codecs.open(tf, 'rb', encoding='utf-8') as inp:
        content = inp.read()

    for tokenize in (rule_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def test_query_tokenizer_merges_contiguous_gaps(self):
    """Adjacent {{}} template regions yield their inner tokens without extras."""
    tokens = list(query_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'temp', u'xzy', u'def']
def test_query_tokenizer_handles_blank_lines(self):
    """Whitespace-only text yields no tokens."""
    assert list(query_tokenizer(u' \n\n\t ')) == []
def test_query_tokenizer_behavior4(self):
    """'+' is kept inside a token while '_' and '-' split tokens."""
    tokens = list(query_tokenizer('license_Dual+BSD-GPL'))
    assert tokens == ['license', 'dual+bsd', 'gpl']
def test_query_tokenizer_behavior3(self):
    """'=' and '/' act as token separators."""
    tokens = list(query_tokenizer('license=Dual BSD/GPL'))
    assert tokens == ['license', 'dual', 'bsd', 'gpl']
def test_query_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(query_tokenizer('')) == []
def test_query_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode fixture must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with io.open(test_file, encoding='utf-8') as inp:
        list(query_tokenizer(inp.read()))
def test_query_tokenizer_handles_empty_lines(self):
    """Newline-only text yields no tokens."""
    assert list(query_tokenizer(u'\n\n')) == []
def test_query_tokenizer_handles_blank_lines2(self):
    """Spaces, tabs and a newline alone yield no tokens."""
    assert list(query_tokenizer(' \n\t ')) == []
def test_query_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(query_tokenizer('')) == []
def test_rule_and_query_tokenizer_have_the_same_behavior1(self):
    """Both tokenizers agree on a MODULE_LICENSE kernel macro line."""
    text = 'MODULE_LICENSE("Dual BSD/GPL");'
    expected = ['module_license', 'dual', 'bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected
def test_query_tokenizer_handles_empty_legacy_templates(self):
    """An empty {{}} template region is ignored entirely."""
    assert list(query_tokenizer(u'ab{{}}cd')) == [u'ab', u'cd']
def test_query_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(query_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_query_tokenizer_does_not_throw_exception_for_pystache_templates(
    self,
):
    """Pystache-style template markers must not break tokenization."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    # We only care that this tokenizes without raising and is non-empty.
    assert list(query_tokenizer(text))
def test_query_tokenizer_on_html_like_texts(self, regen=False):
    """Tokenize an HTML-like text file line by line against expected JSON."""
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.query_tokenizer.json'
    tokenized = []
    for _ln, line in query_lines(test_file):
        tokenized.append(list(query_tokenizer(line)))
    check_results(tokenized, expected_file, regen=regen)
def test_query_tokenizer_does_not_crash_with_non_well_formed_legacy_templatized_parts(
    self,
):
    """An unterminated {{ template marker is tolerated."""
    assert list(query_tokenizer(u'abcd{{ddd')) == [u'abcd', u'ddd']
def test_rule_and_query_tokenizer_have_the_same_behavior4(self):
    """Both tokenizers agree on a text mixing '_', '+' and '-'."""
    text = 'license_Dual+BSD-GPL'
    expected = ['license_dual+bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # Per-key message accumulators; keys with no messages are filtered out
    # before returning.
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)

    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)

    for key, lic in licenses.items():
        # Bind the appenders once per license for brevity below.
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append

        # Basic required-attribute checks.
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        if not lic.owner:
            warn('No owner')

        # Cross-reference checks against the full `licenses` mapping.
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later
            and lic.base_license
            and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')

        # URLS dedupe and consistency
        if no_dupe_urls:
            # Empty strings/None inside the URL lists are flagged.
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')

            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')

            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')

            # NOTE(review): this local collects URLs, not licenses — the
            # name `all_licenses` looks like a misnomer.
            all_licenses = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_licenses.append(url)

            # Any duplicate across all collected URLs triggers one warning.
            if not len(all_licenses) == len(set(all_licenses)):
                warn('Some duplicated URLs')

        # local text consistency
        # If the query and rule tokenizations differ, the text presumably
        # contains a {{}} templated region (rule tokenizer handles templates).
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe: index by the full token tuple
            by_text[license_qtokens].append(key + ': TEXT')

        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)

    # global SPDX consistency: the same SPDX key used by several licenses
    # is reported as an info under the special 'GLOBAL' key.
    multiple_spdx_keys_used = {
        k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append(
                'SPDX key: ' + k +
                ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))

    # global text dedupe: identical token tuples in several licenses are
    # reported as an error under 'GLOBAL'.
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append(
                'Duplicate texts in multiple licenses:' +
                ', '.join(sorted(msgs)))

    # Drop keys that accumulated no messages.
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}

    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))

    return errors, warnings, infos
def test_rule_and_query_tokenizer_have_the_same_behavior_from_file(self):
    """Both tokenizers produce identical tokens for a real rule file."""
    test_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(test_file, encoding='utf-8') as inp:
        content = inp.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_query_tokenizer_behavior1(self):
    """A kernel MODULE_LICENSE macro line tokenizes into lowercased words."""
    tokens = list(query_tokenizer('MODULE_LICENSE("Dual BSD/GPL");'))
    assert tokens == ['module', 'license', 'dual', 'bsd', 'gpl']
def test_rule_and_query_tokenizer_have_the_same_behavior3(self):
    """Both tokenizers agree on 'license=Dual BSD/GPL'."""
    text = 'license=Dual BSD/GPL'
    expected = ['license', 'dual', 'bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected