def test_rule_tokenizer_can_handle_long_text(self):
    """A long unicode (German) license text tokenizes to the expected stream."""
    expected = [
        u'ist', u'freie', u'software', u'sie', u'k\xf6nnen', u'es', u'unter',
        u'den', u'bedingungen', u'der', u'gnu', u'general', u'n', u'public',
        u'license', u'wie', u'von', u'der', u'free', u'software',
        u'foundation', u'ver\xf6ffentlicht', u'weitergeben', u'und', u'oder',
        u'n', u'modifizieren', u'entweder', u'gem\xe4\xdf', u'version', u'3',
        u'der', u'lizenz', u'oder', u'nach', u'ihrer', u'option', u'jeder',
        u'sp\xe4teren', u'n', u'version', u'n', u'n', u'die',
        u'ver\xf6ffentlichung', u'von', u'erfolgt', u'in', u'der',
        u'hoffnung', u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n',
        u'sein', u'wird', u'aber', u'ohne', u'irgendeine', u'garantie',
        u'sogar', u'ohne', u'die', u'implizite', u'garantie', u'der',
        u'marktreife', u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr',
        u'einen', u'bestimmten', u'zweck', u'details', u'finden', u'sie',
        u'in', u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit', u'n',
        u'erhalten', u'haben', u'falls', u'nicht', u'schreiben', u'sie',
        u'an', u'die', u'free', u'software', u'foundation', u'n', u'inc',
        u'51', u'franklin', u'st', u'fifth', u'floor', u'boston', u'ma',
        u'02110', u'usa',
    ]
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        tokens = list(rule_tokenizer(stream.read()))
    assert tokens == expected
def test_rule_tokenizer_can_process_multiple_templatized_parts_strip_multiple_contig_templates_and_leading_and_trailing(self):
    """Contiguous, leading and trailing {{templates}} are all stripped."""
    sample = u'''{{nexb}}{{nexb}}ab{{nexb Company}}{{nexb}}cd {{second}} {{nexb}} {{nexb}} {{nexb}}ef {{nexb}} '''
    tokens = list(rule_tokenizer(sample))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_can_parse_ill_formed_template_from_file(self):
    """Tokenizing an ill-formed template file matches the expected JSON fixture."""
    import json
    test_file = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with codecs.open(test_file, 'rb', encoding='utf-8') as text:
        tokens = list(rule_tokenizer(text.read()))
    expected_file = self.get_test_loc('tokenize/ill_formed_template/expected.json')
    # flip regen to True to regenerate the expected JSON from the current result
    regen = False
    if regen:
        with codecs.open(expected_file, 'wb', encoding='utf-8') as out:
            json.dump(tokens, out, indent=2, separators=(',', ': '))
    with codecs.open(expected_file, encoding='utf-8') as inp:
        expected = json.load(inp, object_pairs_hook=OrderedDict)
    assert expected == tokens
def test_tokenizers_regex_do_not_choke_on_some_text(self):
    # somehow this text was making the regex choke.
    tf = self.get_test_loc('tokenize/parser.js')
    with codecs.open(tf, 'rb', encoding='utf-8') as text:
        content = text.read()
    # every tokenizer must finish well under the 5 seconds ceiling
    for tokenize in (rule_tokenizer, query_tokenizer, matched_query_text_tokenizer):
        started = time()
        list(tokenize(content))
        assert time() - started < 5
def tokens(self, lower=True):
    """
    Return an iterable of token strings for this rule. Length is recomputed
    as a side effect. Tokens inside double curly braces (eg. {{ignored}})
    are skipped and ignored.
    """
    length = 0
    # fetch the text once: the original called self.text() a second time
    # below for tokenization; assumes text() is pure — TODO confirm
    text = self.text()
    stripped = text.strip()
    # FIXME: this is weird:
    # We tag this rule as being a bare URL if it starts with a scheme and is
    # on one line: this is used to determine a matching approach
    if stripped.startswith(('http://', 'https://', 'ftp://')) and '\n' not in stripped[:1000]:
        self.minimum_coverage = 100
    for token in rule_tokenizer(text, lower=lower):
        length += 1
        yield token
    self.length = length
    self.compute_relevance()
def tokens(self, lower=True):
    """
    Return an iterable of tokens and keep track of gaps by position.
    Gaps and length are recomputed. Tokens inside gaps are tracked but
    not part of the returned stream.
    """
    gap_positions = set()
    # Note: we track the position ourselves instead of enumerating because
    # positions abstract away gaps: only real tokens advance the position.
    next_pos = 0
    token_count = 0
    for tok in rule_tokenizer(self.text(), lower=lower):
        if tok is None:
            # a gap is recorded at the position of the preceding token
            gap_positions.add(next_pos - 1)
            continue
        token_count += 1
        yield tok
        next_pos += 1
    self.length = token_count
    self.gaps = gap_positions
    self.gaps_count = len(gap_positions)
def test_rule_tokenizer_handles_empty_templates(self):
    """An empty {{}} template is dropped from the token stream."""
    tokens = list(rule_tokenizer(u'ab{{}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_handles_template_with_spaces(self):
    """A template whose body is padded with spaces is still stripped."""
    tokens = list(rule_tokenizer(u'ab{{ 10 }}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(rule_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_merges_contiguous_gaps(self):
    """Back-to-back templates collapse into a single gap."""
    tokens = list(rule_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(rule_tokenizer('')) == []
def test_rule_tokenizer_handles_empty_string(self):
    """An empty string yields no tokens."""
    assert list(rule_tokenizer('')) == []
def test_rule_tokenizer_handles_empty_templates(self):
    """An empty {{}} template is dropped from the token stream."""
    tokens = list(rule_tokenizer(u'ab{{}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_does_not_throw_exception_for_illegal_pystache_templates(self):
    """Ill-formed pystache-style template markup must not raise."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    tokens = list(rule_tokenizer(text))
    assert tokens
def test_rule_tokenizer_merges_contiguous_gaps(self):
    """Back-to-back templates collapse into a single gap."""
    tokens = list(rule_tokenizer(u'abc{{temp}}{{xzy}}def'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_does_not_return_leading_and_trailing_gaps(self):
    """Templates before the first and after the last token leave no trace."""
    tokens = list(rule_tokenizer(u'{{xzy}}{{xzy}}abc{{temp}}def{{xzy}}{{xzy}}'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_can_split_templates(self):
    """Templates mixed with plain text are skipped while tokens are split."""
    tokens = list(rule_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_can_split(self):
    """Tokens are split on whitespace and lowercased."""
    tokens = list(rule_tokenizer(u'abc def \n GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_handles_blank_lines(self):
    """Whitespace-only input yields no tokens."""
    assert list(rule_tokenizer(' \n\t ')) == []
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates_2(self):
    """Stray braces around a well-formed template leave only plain tokens."""
    tokens = list(rule_tokenizer(u'}}{{{{abcd}}ddd}}{{'))
    assert tokens == [u'ddd']
def test_rule_tokenizer_can_handle_long_text(self):
    """A long unicode (German) license text tokenizes to the expected stream."""
    expected = [
        u'ist', u'freie', u'software', u'sie', u'k\xf6nnen', u'es', u'unter',
        u'den', u'bedingungen', u'der', u'gnu', u'general', u'n', u'public',
        u'license', u'wie', u'von', u'der', u'free', u'software',
        u'foundation', u'ver\xf6ffentlicht', u'weitergeben', u'und', u'oder',
        u'n', u'modifizieren', u'entweder', u'gem\xe4\xdf', u'version', u'3',
        u'der', u'lizenz', u'oder', u'nach', u'ihrer', u'option', u'jeder',
        u'sp\xe4teren', u'n', u'version', u'n', u'n', u'die',
        u'ver\xf6ffentlichung', u'von', u'erfolgt', u'in', u'der',
        u'hoffnung', u'da\xdf', u'es', u'ihnen', u'von', u'nutzen', u'n',
        u'sein', u'wird', u'aber', u'ohne', u'irgendeine', u'garantie',
        u'sogar', u'ohne', u'die', u'implizite', u'garantie', u'der',
        u'marktreife', u'n', u'oder', u'der', u'verwendbarkeit', u'f\xfcr',
        u'einen', u'bestimmten', u'zweck', u'details', u'finden', u'sie',
        u'in', u'der', u'gnu', u'general', u'n', u'public', u'license', u'n',
        u'n', u'sie', u'sollten', u'ein', u'exemplar', u'der', u'gnu',
        u'general', u'public', u'license', u'zusammen', u'mit', u'n',
        u'erhalten', u'haben', u'falls', u'nicht', u'schreiben', u'sie',
        u'an', u'die', u'free', u'software', u'foundation', u'n', u'inc',
        u'51', u'franklin', u'st', u'fifth', u'floor', u'boston', u'ma',
        u'02110', u'usa',
    ]
    test_file = self.get_test_loc('tokenize/unicode/12180.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        tokens = list(rule_tokenizer(stream.read()))
    assert tokens == expected
def test_rule_tokenizer_handles_empty_lines(self):
    """Newline-only input yields no tokens."""
    assert list(rule_tokenizer(u'\n\n')) == []
def test_rule_and_query_tokenizer_have_the_same_behavior4(self):
    """Both tokenizers agree on mixed '+' and '-' separators."""
    text = 'license_Dual+BSD-GPL'
    expected = ['license_dual+bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def test_rule_tokenizer_handles_multi_word_templates(self):
    """A template containing several words is dropped as one gap."""
    tokens = list(rule_tokenizer(u'ab{{10 nexb Company}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_and_query_tokenizer_have_the_same_behavior2(self):
    """Both tokenizers agree on a full GPL-2.0 rule file."""
    rule_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(rule_file, encoding='utf-8') as stream:
        content = stream.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_rule_and_query_tokenizer_have_the_same_behavior3(self):
    """Both tokenizers agree on '=' and '/' separators."""
    text = 'license=Dual BSD/GPL'
    expected = ['license', 'dual', 'bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def test_rule_tokenizer_handles_template_with_spaces(self):
    """A template whose body is padded with spaces is still stripped."""
    tokens = list(rule_tokenizer(u'ab{{ 10 }}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_and_query_tokenizer_have_the_same_behavior_from_file(self):
    """Both tokenizers agree on a full GPL-2.0 rule file."""
    rule_file = self.get_test_loc('tokenize/freertos/gpl-2.0-freertos.RULE')
    with codecs.open(rule_file, encoding='utf-8') as stream:
        content = stream.read()
    assert list(rule_tokenizer(content)) == list(query_tokenizer(content))
def test_rule_tokenizer_can_process_multiple_templatized_parts(self):
    """Several templates in one text each become a skipped gap."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd {{second}}ef'))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_handles_blank_lines(self):
    """Whitespace-only input yields no tokens."""
    assert list(rule_tokenizer(' \n\t ')) == []
def test_rule_tokenizer_can_process_multiple_templatized_parts_with_default_gap_and_custom_gaps(self):
    """Default and sized (e.g. {{12 ...}}) templates are all skipped."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd{{12 second}}ef{{12 second}}gh'))
    assert tokens == [u'ab', u'cd', u'ef', u'gh']
def test_rule_tokenizer_can_split_templates(self):
    """Templates mixed with plain text are skipped while tokens are split."""
    tokens = list(rule_tokenizer(u'abc def \n {{temp}} GHI'))
    assert tokens == [u'abc', u'def', u'ghi']
def test_rule_tokenizer_handles_empty_lines(self):
    """Newline-only input yields no tokens."""
    assert list(rule_tokenizer(u'\n\n')) == []
def test_rule_tokenizer_does_not_return_leading_and_trailing_gaps(self):
    """Templates before the first and after the last token leave no trace."""
    tokens = list(rule_tokenizer(u'{{xzy}}{{xzy}}abc{{temp}}def{{xzy}}{{xzy}}'))
    assert tokens == [u'abc', u'def']
def test_rule_tokenizer_can_parse_simple_line(self):
    """A plain sentence with an embedded template tokenizes without the gap."""
    tokens = list(rule_tokenizer(u'Licensed by {{12 nexB}} to you '))
    assert tokens == [u'licensed', u'by', u'to', u'you']
def test_rule_tokenizer_handles_multi_word_templates(self):
    """A template containing several words is dropped as one gap."""
    tokens = list(rule_tokenizer(u'ab{{10 nexb Company}}cd'))
    assert tokens == [u'ab', u'cd']
def test_rule_tokenizer_does_not_throw_exception_for_illegal_pystache_templates(self):
    """Ill-formed pystache-style template markup must not raise."""
    text = u'''Permission to use, copy, modify, and {{ /or : the text exist without or }} distribute this software...'''
    tokens = list(rule_tokenizer(text))
    assert tokens
def test_rule_tokenizer_can_process_multiple_templatized_parts(self):
    """Several templates in one text each become a skipped gap."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd {{second}}ef'))
    assert tokens == [u'ab', u'cd', u'ef']
def test_rule_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode rule text must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        list(rule_tokenizer(stream.read()))
def test_rule_tokenizer_can_process_multiple_templatized_parts_with_default_gap_and_custom_gaps(self):
    """Default and sized (e.g. {{12 ...}}) templates are all skipped."""
    tokens = list(rule_tokenizer(u'ab{{nexb Company}}cd{{12 second}}ef{{12 second}}gh'))
    assert tokens == [u'ab', u'cd', u'ef', u'gh']
def test_rule_tokenizer_does_not_crash_on_unicode_rules_text_5(self):
    """Tokenizing this unicode rule text must not raise."""
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with codecs.open(test_file, encoding='utf-8') as stream:
        list(rule_tokenizer(stream.read()))
def test_rule_tokenizer_can_parse_simple_line(self):
    """A plain sentence with an embedded template tokenizes without the gap."""
    tokens = list(rule_tokenizer(u'Licensed by {{12 nexB}} to you '))
    assert tokens == [u'licensed', u'by', u'to', u'you']
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # per-license-key message accumulators
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)
    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)
    for key, lic in licenses.items():
        # bound-method shorthands to record a message against this key
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append
        # required metadata presence checks
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        if not lic.owner:
            warn('No owner')
        # cross-license references must resolve within the mapping
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later and lic.base_license and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')
        # URLS dedupe and consistency
        if no_dupe_urls:
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')
            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')
            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')
            # NOTE(review): this list holds URLs, not licenses; name is misleading
            all_licenses = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_licenses.append(url)
            if not len(all_licenses) == len(set(all_licenses)):
                warn('Some duplicated URLs')
        # local text consistency: the two tokenizers differ only over
        # templated {{...}} regions, so a mismatch flags a templated text
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe
            by_text[license_qtokens].append(key + ': TEXT')
        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)
    # global SPDX consistency: an SPDX key should map to a single license
    multiple_spdx_keys_used = {k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append('SPDX key: ' + k + ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))
    # global text dedupe: identical token streams across licenses are errors
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append('Duplicate texts in multiple licenses:' + ', '.join(sorted(msgs)))
    # drop keys that accumulated no messages
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}
    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))
    return errors, warnings, infos
def test_rule_tokenizer_does_not_crash_with_non_well_formed_templatized_parts(self):
    """An unterminated {{ template degrades to plain tokens."""
    tokens = list(rule_tokenizer(u'abcd{{ddd'))
    assert tokens == [u'abcd', u'ddd']
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates_2(self):
    """Stray braces around a well-formed template leave only plain tokens."""
    tokens = list(rule_tokenizer(u'}}{{{{abcd}}ddd}}{{'))
    assert tokens == [u'ddd']
def test_rule_tokenizer_does_not_crash_with_non_well_formed_templatized_parts(self):
    """An unterminated {{ template degrades to plain tokens."""
    tokens = list(rule_tokenizer(u'abcd{{ddd'))
    assert tokens == [u'abcd', u'ddd']
def test_rule_tokenizer_can_parse_ill_formed_template(self):
    """An ill-formed template file still yields the full token count."""
    path = self.get_test_loc('tokenize/ill_formed_template/text.txt')
    with codecs.open(path, 'rb', encoding='utf-8') as text:
        tokens = list(rule_tokenizer(text.read()))
    assert len(tokens) == 3875
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates(self):
    """A valid template followed by a stray }} keeps surrounding tokens."""
    tokens = list(rule_tokenizer(u'ab{{c}}d}}ef'))
    assert tokens == [u'ab', u'd', u'ef']
def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templates(self):
    """A valid template followed by a stray }} keeps surrounding tokens."""
    tokens = list(rule_tokenizer(u'ab{{c}}d}}ef'))
    assert tokens == [u'ab', u'd', u'ef']
def test_rule_and_query_tokenizer_have_the_same_behavior1(self):
    """Both tokenizers agree on C-like source text."""
    text = 'MODULE_LICENSE("Dual BSD/GPL");'
    expected = ['module_license', 'dual', 'bsd', 'gpl']
    assert expected == list(rule_tokenizer(text))
    assert expected == list(query_tokenizer(text))
def validate(licenses, verbose=False, no_dupe_urls=False):
    """
    Check that licenses are valid. `licenses` is a mapping of key -> License.
    Return dictionaries of infos, errors and warnings mapping a license key
    to validation issue messages. Print messages if verbose is True.
    """
    # per-license-key message accumulators
    infos = defaultdict(list)
    warnings = defaultdict(list)
    errors = defaultdict(list)
    # used for global dedupe of texts
    by_spdx_key = defaultdict(list)
    by_text = defaultdict(list)
    for key, lic in licenses.items():
        # bound-method shorthands to record a message against this key
        err = errors[key].append
        warn = warnings[key].append
        info = infos[key].append
        # names
        if not lic.short_name:
            warn('No short name')
        if not lic.name:
            warn('No name')
        if not lic.category:
            warn('No category')
        # consistency fix: the sibling validate() also checks for a missing owner
        if not lic.owner:
            warn('No owner')
        # cross-license references must resolve within the mapping
        if lic.next_version and lic.next_version not in licenses:
            err('License next version is unknown')
        if (lic.is_or_later and lic.base_license and lic.base_license not in licenses):
            err('Base license for an "or later" license is unknown')
        # URLS dedupe and consistency
        if no_dupe_urls:
            if lic.text_urls and not all(lic.text_urls):
                warn('Some empty license text_urls')
            if lic.other_urls and not all(lic.other_urls):
                warn('Some empty license other_urls')
            # redundant URLs used multiple times
            if lic.homepage_url:
                if lic.homepage_url in lic.text_urls:
                    warn('Homepage URL also in text_urls')
                if lic.homepage_url in lic.other_urls:
                    warn('Homepage URL also in other_urls')
                if lic.homepage_url == lic.faq_url:
                    warn('Homepage URL same as faq_url')
                if lic.homepage_url == lic.osi_url:
                    warn('Homepage URL same as osi_url')
            if lic.osi_url or lic.faq_url:
                if lic.osi_url == lic.faq_url:
                    warn('osi_url same as faq_url')
            # collect every URL of this license to detect duplicates
            all_urls = lic.text_urls + lic.other_urls
            for url in lic.osi_url, lic.faq_url, lic.homepage_url:
                if url:
                    all_urls.append(url)
            if not len(all_urls) == len(set(all_urls)):
                warn('Some duplicated URLs')
        # local text consistency: the two tokenizers differ only over
        # templated {{...}} regions, so a mismatch flags a templated text
        text = lic.text
        license_qtokens = tuple(query_tokenizer(text, lower=True))
        license_rtokens = tuple(rule_tokenizer(text, lower=True))
        if license_qtokens != license_rtokens:
            info('License text contains rule templated region with {{}}')
        if not license_qtokens:
            info('No license text')
        else:
            # for global dedupe
            by_text[license_qtokens].append(key + ': TEXT')
        # SPDX consistency
        if lic.spdx_license_key:
            by_spdx_key[lic.spdx_license_key].append(key)
    # global SPDX consistency: an SPDX key should map to a single license
    multiple_spdx_keys_used = {k: v for k, v in by_spdx_key.items() if len(v) > 1}
    if multiple_spdx_keys_used:
        for k, lkeys in multiple_spdx_keys_used.items():
            infos['GLOBAL'].append('SPDX key: ' + k + ' used in multiple licenses: ' + ', '.join(sorted(lkeys)))
    # global text dedupe: identical token streams across licenses are errors
    multiple_texts = {k: v for k, v in by_text.items() if len(v) > 1}
    if multiple_texts:
        for k, msgs in multiple_texts.items():
            errors['GLOBAL'].append('Duplicate texts in multiple licenses:' + ', '.join(sorted(msgs)))
    # drop keys that accumulated no messages
    errors = {k: v for k, v in errors.items() if v}
    warnings = {k: v for k, v in warnings.items() if v}
    infos = {k: v for k, v in infos.items() if v}
    if verbose:
        print('Licenses validation errors:')
        for key, msgs in sorted(errors.items()):
            print('ERRORS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation warnings:')
        for key, msgs in sorted(warnings.items()):
            print('WARNINGS for:', key, ':', '\n'.join(msgs))
        print('Licenses validation infos:')
        for key, msgs in sorted(infos.items()):
            print('INFOS for:', key, ':', '\n'.join(msgs))
    return errors, warnings, infos