def test_query_tokenizer_behavior_from_file(self, regen=False):
    """
    Tokenize the FreeRTOS GPL-2.0 rule text and compare the tokens to the
    expected JSON file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc(
        'tokenize/freertos/gpl-2.0-freertos.RULE')
    with io.open(test_file, encoding='utf-8') as test:
        text = test.read()
    result = list(query_tokenizer(text))
    expected_file = test_file + '.json'
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)

    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)

    assert expected == list(query_tokenizer(text))
def test_query_tokenizer_lines_on_html_like_texts_2(
    self, regen=REGEN_TEST_FIXTURES,
):
    """Tokenize an HTML-like file line by line and check against expected JSON."""
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.query_tokenizer.json'
    tokenized = []
    for _ln, line in query_lines(test_file):
        tokenized.append(list(query_tokenizer(line)))
    check_results(tokenized, expected_file, regen=regen)
def test_query_tokenizer_handles_rarer_unicode_codepoints(self):
    """Rare symbols such as the heart codepoint are dropped, not tokenized."""
    # NOTE: we are not catching the heart as a proper token, but this is
    # rare enough that we do not care
    text = '♡ Copying Art is an act of love. Love is not subject to law.'
    expected = [
        u'copying', u'art', u'is', u'an', u'act', u'of', u'love',
        u'love', u'is', u'not', u'subject', u'to', u'law',
    ]
    tokens = list(query_tokenizer(text))
    assert tokens == expected
def test_query_tokenizer_can_handle_long_text(self):
    """A long German unicode text tokenizes into the expected lowercased tokens."""
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with io.open(test_file, encoding='utf-8') as inp:
        tokens = list(query_tokenizer(inp.read()))

    expected = [
        u'pychess', u'ist', u'freie', u'software', u'sie', u'k\xf6nnen',
        u'es', u'unter', u'den', u'bedingungen', u'der', u'gnu',
        u'general', u'n', u'public', u'license', u'wie', u'von', u'der',
        u'free', u'software', u'foundation', u'ver\xf6ffentlicht',
        u'weitergeben', u'und', u'oder', u'n', u'modifizieren',
        u'entweder', u'gem\xe4\xdf', u'version', u'3', u'der', u'lizenz',
        u'oder', u'nach', u'ihrer', u'option', u'jeder', u'sp\xe4teren',
        u'n', u'version', u'n', u'n', u'die', u'ver\xf6ffentlichung',
        u'von', u'pychess', u'erfolgt', u'in', u'der', u'hoffnung',
        u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n', u'sein',
        u'wird', u'aber', u'ohne', u'irgendeine', u'garantie', u'sogar',
        u'ohne', u'die', u'implizite', u'garantie', u'der', u'marktreife',
        u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr', u'einen',
        u'bestimmten', u'zweck', u'details', u'finden', u'sie', u'in',
        u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit',
        u'pychess', u'n', u'erhalten', u'haben', u'falls', u'nicht',
        u'schreiben', u'sie', u'an', u'die', u'free', u'software',
        u'foundation', u'n', u'inc', u'51', u'franklin', u'st', u'fifth',
        u'floor', u'boston', u'ma', u'02110', u'usa',
    ]
    assert tokens == expected
def test_query_tokenizer_behavior_from_file(self, regen=False):
    """Tokenize a FreeRTOS GPL-2.0 rule file and check against its expected JSON."""
    test_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with io.open(test_file, encoding='utf-8') as inp:
        content = inp.read()
    tokens = list(query_tokenizer(content))
    check_results(tokens, test_file + '.json', regen=regen)
def tokens(self):
    """
    Return an iterable of token strings for this rule. Length, relevance and
    minimum_coverage may be recomputed as a side effect.
    """
    # Fetch the rule text once; the original called self.text() twice.
    text = self.text()
    stripped = text.strip()

    # FIXME: this is weird:
    # We tag this rule as being a bare URL if it starts with a scheme and is
    # on one line: this is used to determine a matching approach
    # FIXME: this does not lower the text first??
    if stripped.startswith(('http://', 'https://', 'ftp://')) and '\n' not in stripped[:1000].lower():
        self.minimum_coverage = 100

    # Count tokens while yielding them so self.length is set when the
    # generator is exhausted.
    length = 0
    for token in query_tokenizer(text):
        length += 1
        yield token

    self.length = length
    self.compute_relevance()
def test_query_tokenizer_can_parse_ill_formed_legacy_template_from_file(
    self, regen=False,
):
    """An ill-formed legacy template file still tokenizes to the expected JSON."""
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    expected_file = self.get_test_loc(
        'tokenize/ill_formed_template/expected.json')
    with io.open(test_file, encoding='utf-8') as inp:
        tokens = list(query_tokenizer(inp.read()))
    check_results(tokens, expected_file, regen=regen)
def test_query_tokenizer_can_split(self):
    """Tokens are split on whitespace/newlines and lowercased."""
    tokens = list(query_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_and_query_tokenizer_have_the_same_behavior(self):
    """rule_tokenizer and query_tokenizer agree on several sample texts."""
    cases = [
        ('MODULE_LICENSE("Dual BSD/GPL");', ['module', 'license', 'dual', 'bsd', 'gpl']),
        ('Dual BSD/GPL', ['dual', 'bsd', 'gpl']),
        ('license=Dual BSD/GPL', ['license', 'dual', 'bsd', 'gpl']),
        ('license_Dual+BSD-GPL', ['license', 'dual+bsd', 'gpl']),
    ]
    for text, expected in cases:
        assert list(rule_tokenizer(text)) == expected
        assert list(query_tokenizer(text)) == expected
def test_query_tokenizer_can_split_legacy_templates(self):
    """Legacy {{}} template markers are dropped but their inner text is kept."""
    tokens = list(query_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'temp', u'ghi']
def get_all_spdx_key_tokens(licenses):
    """
    Yield token strings collected from a `licenses` iterable of license
    objects' SPDX license keys.
    """
    # Essential SPDX tokens come first, then the tokens of every known key.
    for essential in get_essential_spdx_tokens():
        yield essential

    for spdx_key in get_all_spdx_keys(licenses):
        for tok in query_tokenizer(spdx_key):
            yield tok
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    """The tokenizers finish quickly on a previously pathological JS file."""
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with io.open(tf, encoding='utf-8') as inp:
        content = inp.read()

    # The original ran query_tokenizer twice, then the matched-text tokenizer.
    for tokenize in (query_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def test_matched_query_text_tokenizer_and_query_tokenizer_should_yield_the_same_texts(
    self,
):
    """Compare the token streams of both tokenizers on the same text."""
    text = u'''Redistribution+ ;and use in! + 2003 source and +binary forms, ()with or without modifi+cation, are permitted with İrəli .\t\n \r'''

    matched_tokens = []
    for is_tok, tok in matched_query_text_tokenizer(text):
        if is_tok:
            matched_tokens.append(tok)
    query_tokens = list(query_tokenizer(text))

    mqtt_expected = [
        'Redistribution+', 'and', 'use', 'in', '2003', 'source', 'and',
        'binary', 'forms', 'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with', 'İrəli',
    ]
    qt_expected = [
        'redistribution+', 'and', 'use', 'in', '2003', 'source', 'and',
        'binary', 'forms', 'with', 'or', 'without', 'modifi+cation',
        'are', 'permitted', 'with',
        # this is NOT the same as above...
        # See https://github.com/nexB/scancode-toolkit/issues/1872
        'i', 'rəli',
    ]
    assert matched_tokens == mqtt_expected
    assert query_tokens == qt_expected
def test_query_tokenizer_handles_rarer_unicode_typographic_quotes(self):
    """Typographic quote characters act as token separators."""
    text = 'a “bar” is “open„ not “closed” ‘free‚ not ‘foo’ „Gänsefüßchen“'
    expected = [
        'a', 'bar', 'is', 'open', 'not', 'closed',
        'free', 'not', 'foo', 'gänsefüßchen',
    ]
    assert list(query_tokenizer(text)) == expected
def test_query_tokenizer(self):
    """Punctuation is dropped and tokens are lowercased on a BSD-style text."""
    text = u'''Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.'''
    expected = u'''redistribution and use in source and binary forms with or without modification are permitted provided that the following conditions are met redistributions of source code must retain the above copyright notice this list of conditions and the following disclaimer'''.split()
    tokens = list(query_tokenizer(text))
    assert tokens == expected
def test_query_tokenizer_can_parse_ill_formed_legacy_template_from_file(
    self, regen=False):
    """
    Tokenize an ill-formed legacy template file and compare to the expected
    JSON file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with io.open(test_file, encoding='utf-8') as text:
        result = list(query_tokenizer(text.read()))
    expected_file = self.get_test_loc(
        'tokenize/ill_formed_template/expected.json')
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as ex:
            json.dump(result, ex, indent=2, separators=(',', ': '))

    with io.open(expected_file, encoding='utf-8') as ex:
        expected = json.load(ex, object_pairs_hook=OrderedDict)

    assert expected == result
def test_query_tokenizer(self):
    """The sample BSD text yields exactly 39 lowercased tokens."""
    text = u'''Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.'''
    result = list(query_tokenizer(text))
    assert len(result) == 39

    expected = u'''redistribution and use in source and binary forms with or without modification are permitted provided that the following conditions are met redistributions of source code must retain the above copyright notice this list of conditions and the following disclaimer'''.split()
    assert result == expected
def test_query_tokenizer_lines_on_html_like_texts_2(self, regen=False):
    """
    Tokenize an HTML-like file line by line and compare to the expected JSON
    file, regenerating the expected file when `regen` is True.
    """
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.tokenized_lines.json'
    lines = query_lines(test_file)
    tokens = list(list(query_tokenizer(line)) for _ln, line in lines)
    # we dumps/loads to normalize tuples/etc
    result = json.loads(json.dumps(tokens))
    if regen:
        # FIX: `mode` was an undefined name here (NameError on regen);
        # the expected file must be opened for writing.
        with open(expected_file, 'w') as exc_test:
            json.dump(result, exc_test, indent=2)

    with io.open(expected_file, encoding='utf-8') as exc_test:
        expected = json.load(exc_test)

    assert expected == result
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    """All three tokenizers finish quickly on a previously pathological JS file."""
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with codecs.open(tf, 'rb', encoding='utf-8') as inp:
        content = inp.read()

    for tokenize in (rule_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def test_query_tokenizer_merges_contiguous_gaps(self):
    """Adjacent {{}} template regions yield their inner tokens without extras."""
    tokens = list(query_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'temp', u'xzy', u'def']
def test_query_tokenizer_handles_blank_lines(self):
    """Whitespace-only text yields no tokens."""
    assert list(query_tokenizer(u' \n\n\t ')) == []
def test_query_tokenizer_behavior4(self):
    """'+' is kept inside a token while '_' and '-' split tokens."""
    tokens = list(query_tokenizer('license_Dual+BSD-GPL'))
    assert tokens == ['license', 'dual+bsd', 'gpl']
def test_query_tokenizer_behavior3(self):
    """'=' and '/' act as token separators."""
    tokens = list(query_tokenizer('license=Dual BSD/GPL'))
    assert tokens == ['license', 'dual', 'bsd', 'gpl']
def test_query_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(query_tokenizer('')) == []
def test_query_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode fixture must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with io.open(test_file, encoding='utf-8') as inp:
        list(query_tokenizer(inp.read()))
def test_query_tokenizer_handles_empty_lines(self):
    """Newline-only text yields no tokens."""
    assert list(query_tokenizer(u'\n\n')) == []
def test_query_tokenizer_handles_blank_lines2(self):
    """Spaces, tabs and a newline alone yield no tokens."""
    assert list(query_tokenizer(' \n\t ')) == []
def test_query_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(query_tokenizer('')) == []
def test_rule_and_query_tokenizer_have_the_same_behavior1(self):
    """Both tokenizers agree on a MODULE_LICENSE kernel macro line."""
    text = 'MODULE_LICENSE("Dual BSD/GPL");'
    expected = ['module_license', 'dual', 'bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected
def test_query_tokenizer_handles_empty_legacy_templates(self):
    """An empty {{}} template region is ignored entirely."""
    assert list(query_tokenizer(u'ab{{}}cd')) == [u'ab', u'cd']
def test_query_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(query_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_query_tokenizer_does_not_throw_exception_for_pystache_templates(
    self,
):
    """Pystache-style template markers must not break tokenization."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    # We only care that this tokenizes without raising and is non-empty.
    assert list(query_tokenizer(text))
def test_query_tokenizer_on_html_like_texts(self, regen=False):
    """Tokenize an HTML-like text file line by line against expected JSON."""
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.query_tokenizer.json'
    tokenized = []
    for _ln, line in query_lines(test_file):
        tokenized.append(list(query_tokenizer(line)))
    check_results(tokenized, expected_file, regen=regen)
def test_query_tokenizer_does_not_crash_with_non_well_formed_legacy_templatized_parts(
    self,
):
    """An unterminated {{ template marker is tolerated."""
    assert list(query_tokenizer(u'abcd{{ddd')) == [u'abcd', u'ddd']
def test_rule_and_query_tokenizer_have_the_same_behavior4(self):
    """Both tokenizers agree on a text mixing '_', '+' and '-'."""
    text = 'license_Dual+BSD-GPL'
    expected = ['license_dual+bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # Per-key message accumulators; keys with no messages are filtered out
    # before returning.
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)

    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)

    for key, lic in licenses.items():
        # Bind the appenders once per license for brevity below.
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append

        # Basic required-attribute checks.
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        if not lic.owner:
            warn('No owner')

        # Cross-reference checks against the full `licenses` mapping.
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later
            and lic.base_license
            and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')

        # URLS dedupe and consistency
        if no_dupe_urls:
            # Empty strings/None inside the URL lists are flagged.
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')

            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')

            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')

            # NOTE(review): this local collects URLs, not licenses — the
            # name `all_licenses` looks like a misnomer.
            all_licenses = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_licenses.append(url)

            # Any duplicate across all collected URLs triggers one warning.
            if not len(all_licenses) == len(set(all_licenses)):
                warn('Some duplicated URLs')

        # local text consistency
        # If the query and rule tokenizations differ, the text presumably
        # contains a {{}} templated region (rule tokenizer handles templates).
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe: index by the full token tuple
            by_text[license_qtokens].append(key + ': TEXT')

        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)

    # global SPDX consistency: the same SPDX key used by several licenses
    # is reported as an info under the special 'GLOBAL' key.
    multiple_spdx_keys_used = {
        k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append(
                'SPDX key: ' + k +
                ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))

    # global text dedupe: identical token tuples in several licenses are
    # reported as an error under 'GLOBAL'.
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append(
                'Duplicate texts in multiple licenses:' +
                ', '.join(sorted(msgs)))

    # Drop keys that accumulated no messages.
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}

    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))

    return errors, warnings, infos
def test_rule_and_query_tokenizer_have_the_same_behavior_from_file(self):
    """Both tokenizers produce identical tokens for a real rule file."""
    test_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(test_file, encoding='utf-8') as inp:
        content = inp.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_query_tokenizer_behavior1(self):
    """A kernel MODULE_LICENSE macro line tokenizes into lowercased words."""
    tokens = list(query_tokenizer('MODULE_LICENSE("Dual BSD/GPL");'))
    assert tokens == ['module', 'license', 'dual', 'bsd', 'gpl']
def test_rule_and_query_tokenizer_have_the_same_behavior3(self):
    """Both tokenizers agree on 'license=Dual BSD/GPL'."""
    text = 'license=Dual BSD/GPL'
    expected = ['license', 'dual', 'bsd', 'gpl']
    assert list(rule_tokenizer(text)) == expected
    assert list(query_tokenizer(text)) == expected