def test_dump_rules(self):
    """Dumping loaded rules and reloading them is lossless."""
    test_dir = self.get_test_loc('models/rules', copy=True)
    # materialize before dumping so we are not reading the directory
    # while rewriting files in it
    loaded = list(models.load_rules(test_dir))
    for rule in loaded:
        rule.dump()
    # reload from disk and compare against the reference JSON
    reloaded = list(models.load_rules(test_dir))
    expected = self.get_test_loc('models/rules.expected.json')
    check_json(expected, as_sorted_mapping_seq(reloaded))
def test_load_rules(self):
    """load_rules yields Rule instances matching the expected JSON."""
    test_dir = self.get_test_loc('models/rules')
    rules = list(models.load_rules(test_dir))
    for rule in rules:
        assert isinstance(rule, models.Rule)
    expected = self.get_test_loc('models/rules.expected.json')
    check_json(expected, as_sorted_mapping_seq(rules))
def test_query_run_has_correct_offset(self):
    """Query runs report start/end token offsets relative to the whole query."""
    rule_dir = self.get_test_loc('query/runs/rules')
    idx = index.LicenseIndex(list(models.load_rules(rule_dir)))
    query_doc = self.get_test_loc('query/runs/query.txt')
    qry = Query(location=query_doc, idx=idx, line_threshold=4)
    result = [run.to_dict() for run in qry.query_runs]
    first_run = {'end': 0, 'start': 0, 'tokens': u'inc'}
    second_run = {
        'end': 123,
        'start': 1,
        'tokens': (
            u'this library is free software you can redistribute it and or modify '
            u'it under the terms of the gnu library general public license as '
            u'published by the free software foundation either version 2 of the '
            u'license or at your option any later version this library is '
            u'distributed in the hope that it will be useful but without any '
            u'warranty without even the implied warranty of merchantability or '
            u'fitness for a particular purpose see the gnu library general public '
            u'license for more details you should have received a copy of the gnu '
            u'library general public license along with this library see the file '
            u'copying lib if not write to the free software foundation inc 51 '
            u'franklin street fifth floor boston ma 02110 1301 usa'
        ),
    }
    assert [first_run, second_run] == result
def test_index_fails_on_duplicated_rules(self):
    """Indexing a rule set containing duplicated rules must raise."""
    rule_dir = self.get_test_loc('index/no_duplicated_rule')
    try:
        MiniLicenseIndex(models.load_rules(rule_dir))
    except AssertionError as e:
        # the index reports duplicates through an assertion
        assert u'Duplicate rules' in str(e)
    else:
        # self.fail() raises AssertionError itself, so it must live in the
        # else clause: placed inside the try it would be caught by the
        # except above and produce a misleading failure message.
        self.fail('Exception on dupes not raised')
def test_match_license_performance_profiling_on_index_with_single_license(self):
    # Timing harness rather than a correctness test: it measures how fast
    # approximate (sequence) matching runs against a single-rule index.
    from time import time
    from licensedcode import query
    # pre-index : we are profiling only the detection, not the indexing
    rule_dir = self.get_test_loc('perf/idx/rules')
    rules = models.load_rules(rule_dir)
    idx = index.LicenseIndex(rules)
    location = self.get_test_loc('perf/idx/query.txt')
    querys = open(location, 'rb').read()
    qry = query.build_query(query_string=querys, idx=idx)

    def mini_seq_match(idx):
        # list() forces the lazy matcher to run to completion
        list(idx.get_approximate_matches(qry, [], []))

    # qtokens_as_str = array('h', tokens).tostring()
    start = time()
    # repeat the matching to get a measurable duration
    for _ in range(100):
        mini_seq_match(idx)
    duration = time() - start
    values = ('ScanCode diff:', duration)
    print(*values)
    # deliberately raise so the timing shows up in the test runner output;
    # this "test" is meant to be run on demand, not as part of the suite
    raise Exception(values)
def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self):
    """A '+' in the text does not break tokenization: one run with the expected tokens."""
    rule_dir = self.get_test_loc('query/run_breaking/rules')
    idx = index.LicenseIndex(list(models.load_rules(rule_dir)))
    query_doc = self.get_test_loc('query/run_breaking/query.txt')
    q = Query(query_doc, idx=idx)
    expected_tokens = (
        'this library is free software you can redistribute it '
        'and or modify it under the terms of the gnu library '
        'general public license as published by the free software '
        'foundation either version 2 of the license or at your '
        'option any later version this library is distributed in '
        'the hope that it will be useful but without any warranty '
        'without even the implied warranty of merchantability or '
        'fitness for a particular purpose see the gnu library '
        'general public license for more details you should have '
        'received a copy of the gnu library general public '
        'license along with this library see the file copying lib '
        'if not write to the free software foundation 51 franklin '
        'street fifth floor boston ma 02110 1301 usa'
    )
    expected = [{'end': 121, 'start': 0, 'tokens': expected_tokens}]
    assert expected == [run.to_dict() for run in q.query_runs]
    q.tokens
    # check rules token are the same exact set as the set of the last query run
    txtid = idx.tokens_by_tid
    run_tokens = [txtid[t] for t in q.query_runs[-1].tokens]
    rule_tokens = [txtid[t] for t in idx.tids_by_rid[0]]
    assert rule_tokens == run_tokens
def test_query_run_has_correct_offset(self):
    """Each query run carries correct start/end offsets into the query token stream."""
    rule_dir = self.get_test_loc('query/runs/rules')
    rules = list(models.load_rules(rule_dir))
    idx = index.LicenseIndex(rules)
    query_doc = self.get_test_loc('query/runs/query.txt')
    q = Query(location=query_doc, idx=idx, line_threshold=4)
    long_run_tokens = (
        u'this library is free software you can redistribute it and or modify '
        u'it under the terms of the gnu library general public license as '
        u'published by the free software foundation either version 2 of the '
        u'license or at your option any later version this library is '
        u'distributed in the hope that it will be useful but without any '
        u'warranty without even the implied warranty of merchantability or '
        u'fitness for a particular purpose see the gnu library general public '
        u'license for more details you should have received a copy of the gnu '
        u'library general public license along with this library see the file '
        u'copying lib if not write to the free software foundation inc 51 '
        u'franklin street fifth floor boston ma 02110 1301 usa'
    )
    expected = [
        dict(end=0, start=0, tokens=u'inc'),
        dict(end=123, start=1, tokens=long_run_tokens),
    ]
    assert expected == [qr.to_dict() for qr in q.query_runs]
def test_index_fails_on_duplicated_rules(self):
    """Indexing a rule set containing duplicated rules must raise."""
    rule_dir = self.get_test_loc('index/no_duplicated_rule')
    try:
        index.LicenseIndex(models.load_rules(rule_dir))
    except AssertionError as e:
        # the index reports duplicates through an assertion
        assert u'Duplicate rules' in str(e)
    else:
        # self.fail() raises AssertionError itself, so it must live in the
        # else clause: placed inside the try it would be caught by the
        # except above and produce a misleading failure message.
        self.fail('Exception on dupes not raised')
def test_match_seq_are_correct_on_apache(self):
    """Sequence matching over an Apache-style notice yields a single seq match with the expected matched text."""
    rule_dir = self.get_test_loc('match_seq/rules')
    extra_legalese = {
        'redistributions', 'written', 'registered', 'derived',
        'damage', 'due', 'alternately', 'nor',
    }
    legalese = mini_legalese | extra_legalese
    idx = index.LicenseIndex(load_rules(rule_dir), _legalese=legalese)
    query_loc = self.get_test_loc('match_seq/query')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    seq_match = matches[0]
    assert seq_match.matcher == match_seq.MATCH_SEQ
    qtext, _itext = get_texts(seq_match)
    # NOTE(review): "[email protected]" below looks like a redaction artifact
    # of the original fixture text -- confirm against the test data file.
    expected = u'''
        The OpenSymphony Group. All rights reserved. Redistribution and use
        in source and binary forms, with or without modification, are
        permitted provided that the following conditions are met: 1.
        Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer. 2.
        Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
        3. The end-user documentation included with the redistribution, if
        any, must include the following acknowledgment: [4]. "[This]
        [product] [includes] [software] [developed] [by] [the] [OpenSymphony]
        [Group] ([http]://[www].[opensymphony].[com]/)." [5]. Alternately,
        this acknowledgment may appear in the software itself, if and
        wherever such third-party acknowledgments normally appear. The names
        "OpenSymphony" and "The OpenSymphony Group" must not be used to
        endorse or promote products derived from this software without prior
        written permission. For written permission, please contact
        [email protected] . Products derived from this software may not be
        called "OpenSymphony" or "[OsCore]", nor may "OpenSymphony" or
        "[OsCore]" appear in their name, without prior written permission of
        the OpenSymphony Group.
        THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
        WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
        IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS
        BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
        OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
        OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
        BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
        WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
        OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
        EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    '''
    # whitespace-insensitive comparison: only the token sequence matters
    assert qtext.split() == expected.split()
def test_load_rules(self):
    """load_rules skips obsolete licenses and yields Rule objects with expected licenses."""
    test_dir = self.get_test_loc('models/rules')
    # load_rules is treated as a lazy generator everywhere else in this
    # suite (always wrapped in list()): materialize it here too, otherwise
    # len() and the two iterations below would fail on a raw generator.
    rules = list(models.load_rules(test_dir))
    # one license is obsolete and not loaded
    assert 3 == len(rules)
    assert all(isinstance(r, models.Rule) for r in rules)
    # test a sample of a licenses field
    expected = [[u'lzma-sdk-original'], [u'gpl-2.0'], [u'oclc-2.0']]
    assert sorted(expected) == sorted(r.licenses for r in rules)
def test_match_license_performance_profiling_on_limited_index(self):
    """Profile license matching against a small pre-built index."""
    # pre-index : we are profiling only the detection, not the indexing
    rules = models.load_rules(self.get_test_loc('detect/rule_template/rules'))
    idx = index.LicenseIndex(rules)
    query_locations = [self.get_test_loc('detect/rule_template/query.txt')]
    self.profile_match(
        idx,
        query_locations,
        'license_match_limited_index_profile_log.txt',
    )
def test_template_detection_works_for_sun_bcl(self):
    """A templated Sun BCL rule is detected exactly once in the query document."""
    # setup: build an index from the template rule directory
    # (local renamed from `index` to avoid shadowing the index module)
    rules = models.load_rules(self.get_test_loc('detect/rule_template/rules'))
    license_index = detect.get_license_index(rules)
    # test
    qdoc = self.get_test_loc('detect/rule_template/query.txt')
    matches = license_index.match(qdoc)
    assert len(matches) == 1
def get_rules(source, replacement):
    """
    Yield tuples of (rule, new text) for non-false positive existing Rules
    whose text contains ``source``, with ``source`` replaced by
    ``replacement`` in the new text.
    """
    for rule in models.load_rules():
        # false-positive rules are never rewritten
        if rule.is_false_positive:
            continue
        original_text = rule.text()
        if source not in original_text:
            continue
        yield rule, original_text.replace(source, replacement)
def test_match_can_match_with_index_built_from_rule_directory_with_sun_bcls(self):
    """An index built from the Sun BCL rule directory matches the query via sequence matching."""
    rule_dir = self.get_test_loc('detect/rule_template/rules')
    idx = index.LicenseIndex(load_rules(rule_dir))
    # at line 151 the query has an extra "Software" word inserted to avoid hash matching
    query_loc = self.get_test_loc('detect/rule_template/query.txt')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    found = matches[0]
    expected_qspan = Span(0, 957) | Span(959, 1756)
    assert found.qspan == expected_qspan
    assert found.matcher == match_seq.MATCH_SEQ
def test_filter_matches_handles_interlaced_matches_with_overlap_and_same_license(self):
    # Matching a query that produces interlaced, overlapping candidate
    # matches for the same license must yield the expected matches after
    # filtering.
    rule_dir = self.get_test_loc('match_filter/rules')
    idx = index.LicenseIndex(load_rules(rule_dir))
    # map rule identifiers to Rule objects for readable expectations below
    rules = {r.identifier: r for r in idx.rules_by_rid}
    query_loc = self.get_test_loc('match_filter/query')
    matches = idx.match(location=query_loc)
    expected = [
        # filtered:
        # NOTE(review): the "filtered:" label above reads as if the 3-seq
        # entry were expected to be removed by filtering, yet it is a live
        # element of the expected list -- confirm the intended expectation.
        LicenseMatch(matcher='3-seq', rule=rules['rule1.RULE'],
                     qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)),
        LicenseMatch(matcher='2-aho', rule=rules['rule2.RULE'],
                     qspan=Span(24, 86), ispan=Span(0, 62)),
    ]
    assert expected == matches
def test_rules_types_has_only_boolean_values(self):
    """Every is_license_* flag on every rule must be a plain boolean."""
    rules = list(models.load_rules(rules_data_dir))
    # typo fixed: was rule_consitency_errors
    consistency_errors = []
    for rule in rules:
        flags = [
            rule.is_license_text,
            rule.is_license_notice,
            rule.is_license_tag,
            rule.is_license_reference,
        ]
        # isinstance is the idiomatic type check; since bool cannot be
        # subclassed this is equivalent to the type() != bool comparison
        # it replaces
        if not all(isinstance(flag, bool) for flag in flags):
            consistency_errors.append((rule.data_file, rule.text_file))
    assert consistency_errors == []
def test_rules_have_only_one_rule_type(self):
    """No rule may set more than one is_license_* flag."""
    consistency_errors = []
    for rule in list(models.load_rules(rules_data_dir)):
        flags = [
            rule.is_license_text,
            rule.is_license_notice,
            rule.is_license_tag,
            rule.is_license_reference,
        ]
        # booleans sum as 0/1, so a sum above 1 means multiple flags set
        if sum(flags) > 1:
            consistency_errors.append(rule.data_file)
    assert consistency_errors == []
def test_match_freertos(self):
    """Aho exact matching finds the FreeRTOS GPL-2.0 rule exactly once."""
    rule_dir = self.get_test_loc('mach_aho/rtos_exact/')
    idx = index.LicenseIndex(models.load_rules(rule_dir))
    query_loc = self.get_test_loc('mach_aho/rtos_exact/gpl-2.0-freertos.RULE')
    qry = query.build_query(location=query_loc, idx=idx)
    whole_run = qry.whole_query_run()
    matches = match_aho.exact_match(idx, whole_run, idx.rules_automaton)
    assert len(matches) == 1
    assert matches[0].matcher == match_aho.MATCH_AHO_EXACT
def test_match_hash_returns_correct_offset(self):
    """A hash match fully covers the rule and reports the correct spans."""
    rule_dir = self.get_test_loc('hash/rules')
    rules = list(models.load_rules(rule_dir))
    idx = index.LicenseIndex(rules)
    query_doc = self.get_test_loc('hash/query.txt')
    matches = idx.match(query_doc)
    assert 1 == len(matches)
    m = matches[0]
    assert match_hash.MATCH_HASH == m.matcher
    assert 100 == m.coverage()
    assert rules[0] == m.rule
    # a hash match covers the full token range on both sides
    assert Span(0, 119) == m.qspan
    assert Span(0, 119) == m.ispan
def test_Rule__validate_with_invalid_language(self):
    """Rule.validate reports unknown languages and inconsistent is_license_* flags."""
    rule_dir = self.get_test_loc('models/rule_validate_lang')
    # collect every validation message across all rules, in rule order
    messages = [
        message
        for rule in sorted(models.load_rules(rule_dir))
        for message in rule.validate()
    ]
    expected = [
        'Unknown language: foobar',
        'Invalid rule is_license_* flags. Only one allowed.',
        'At least one is_license_* flag is needed.',
        'Invalid rule is_license_* flags. Only one allowed.',
        'At least one is_license_* flag is needed.',
    ]
    assert messages == expected
def test_match_hash_returns_correct_offset(self):
    """A hash match fully covers the rule and reports the correct spans."""
    rule_dir = self.get_test_loc('hash/rules')
    rules = list(models.load_rules(rule_dir))
    idx = index.LicenseIndex(rules)
    matches = idx.match(self.get_test_loc('hash/query.txt'))
    assert len(matches) == 1
    hash_match = matches[0]
    assert hash_match.matcher == match_hash.MATCH_HASH
    assert hash_match.coverage() == 100
    assert hash_match.rule == rules[0]
    # full-coverage spans on both the query and the index side
    full_span = Span(0, 121)
    assert hash_match.qspan == full_span
    assert hash_match.ispan == full_span
def test_match_hash_can_match_exactly(self):
    """Matching a rule file against itself produces a 100%-coverage hash match."""
    rule_dir = self.get_test_loc('hash/rules')
    rules = list(models.load_rules(rule_dir))
    idx = index.LicenseIndex(rules)
    # query with one of the indexed rule texts itself
    query_doc = self.get_test_loc('hash/rules/lgpl-2.0-plus_23.RULE')
    matches = idx.match(query_doc)
    assert 1 == len(matches)
    m = matches[0]
    assert 100 == m.coverage()
    assert match_hash.MATCH_HASH == m.matcher
    assert rules[0] == m.rule
    assert Span(0, 119) == m.qspan
    assert Span(0, 119) == m.ispan
def cli(path=(), update=True):
    """
    Update licenses and rules with ignorable copyrights, holders, authors
    URLs and emails.
    """
    # note: `update` is part of the CLI interface and intentionally unused here
    all_licensish = list(cache.get_licenses_db().values()) + list(models.load_rules())
    if path:
        # restrict to licenses/rules whose files match the given path suffix
        all_licensish = [
            licensish
            for licensish in all_licensish
            if licensish.text_file.endswith(path)
            or licensish.data_file.endswith(path)
        ]
    refresh_ignorables(all_licensish)
def test_match_hash_can_match_exactly(self):
    """Matching a rule file against itself produces a 100%-coverage hash match."""
    rule_dir = self.get_test_loc('hash/rules')
    rules = list(models.load_rules(rule_dir))
    idx = index.LicenseIndex(rules)
    # query with one of the indexed rule texts itself
    matches = idx.match(self.get_test_loc('hash/rules/lgpl-2.0-plus_23.RULE'))
    assert len(matches) == 1
    hash_match = matches[0]
    assert hash_match.coverage() == 100
    assert hash_match.matcher == match_hash.MATCH_HASH
    assert hash_match.rule == rules[0]
    full_span = Span(0, 121)
    assert hash_match.qspan == full_span
    assert hash_match.ispan == full_span
def test_match_freertos(self):
    """Aho exact matching finds the FreeRTOS GPL-2.0 rule exactly once."""
    idx = index.LicenseIndex(
        models.load_rules(self.get_test_loc('mach_aho/rtos_exact/')))
    query_loc = self.get_test_loc(
        'mach_aho/rtos_exact/gpl-2.0-freertos.RULE')
    qry = query.build_query(location=query_loc, idx=idx)
    matches = match_aho.exact_match(
        idx, qry.whole_query_run(), idx.rules_automaton)
    assert len(matches) == 1
    found = matches[0]
    assert found.matcher == match_aho.MATCH_AHO_EXACT
def test_query_and_index_tokens_are_identical_for_same_text(self):
    """Tokenizing the same text via the index and via a query yields identical token streams."""
    from licensedcode.models import load_rules
    rule_dir = self.get_test_loc('query/rtos_exact/')
    idx = index.LicenseIndex(load_rules(rule_dir))
    query_loc = self.get_test_loc('query/rtos_exact/gpl-2.0-freertos.RULE')
    tokens_by_tid = idx.tokens_by_tid
    # token strings as seen by the index for the first (only) rule
    index_text_tokens = [tokens_by_tid[tid] for tid in idx.tids_by_rid[0]]
    # token strings as seen by a query over the very same text
    qry = Query(location=query_loc, idx=idx, line_threshold=4)
    whole_run = qry.whole_query_run()
    query_text_tokens = [tokens_by_tid[tid] for tid in whole_run.tokens]
    assert index_text_tokens == query_text_tokens
    assert u' '.join(index_text_tokens) == u' '.join(query_text_tokens)
def test_match_seq_are_correct_on_apache(self):
    """Sequence matching on an Apache-style text returns one seq match with the expected matched tokens."""
    rule_dir = self.get_test_loc('match_seq/rules')
    idx = index.LicenseIndex(load_rules(rule_dir))
    query_loc = self.get_test_loc('match_seq/query')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    seq_match = matches[0]
    assert seq_match.matcher == match_seq.MATCH_SEQ
    qtext, _itext = get_texts(seq_match, location=query_loc, idx=idx)
    expected = u'''
        Redistribution and use in source and binary forms with or without
        modification are permitted provided that the following conditions are
        met <1> Redistributions of source code must retain the above
        copyright notice this of conditions and the following disclaimer <2>
        Redistributions in binary form must reproduce the above copyright
        notice this of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        <3> The end user documentation included with the redistribution if
        any must include the following acknowledgment <4> <This> <product>
        <includes> <software> <developed> <by> <the> <OpenSymphony> <Group>
        <http> <www> <opensymphony> <com> <5> Alternately this acknowledgment
        may appear in the software itself if and wherever such third party
        acknowledgments normally appear The names OpenSymphony and The
        OpenSymphony Group must not be used to endorse or promote products
        derived from this software without prior written permission For
        written permission please contact license opensymphony com Products
        derived from this software may not be called OpenSymphony or OsCore
        nor may OpenSymphony or OsCore appear in their name without prior
        written permission of the OpenSymphony Group THIS SOFTWARE IS
        PROVIDED AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT
        NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
        FOR PARTICULAR PURPOSE ARE DISCLAIMED IN NO EVENT SHALL THE APACHE
        SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT
        INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR
        SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER
        CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT
        LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY
        WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE
        POSSIBILITY OF SUCH DAMAGE
    '''
    # whitespace-insensitive comparison: only the token sequence matters
    assert expected.split() == qtext.split()
def test_match_seq_are_correct_on_apache(self):
    """Sequence matching on an Apache-style text returns one seq match with the expected matched tokens."""
    rule_dir = self.get_test_loc('match_seq/rules')
    idx = index.LicenseIndex(load_rules(rule_dir))
    query_loc = self.get_test_loc('match_seq/query')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    seq_match = matches[0]
    assert seq_match.matcher == match_seq.MATCH_SEQ
    qtext, _itext = get_texts(seq_match, location=query_loc, idx=idx)
    expected = u'''
        The OpenSymphony Group All rights reserved Redistribution and use in
        source and binary forms with or without modification are permitted
        provided that the following conditions are met 1 Redistributions of
        source code must retain the above copyright notice this list of
        conditions and the following disclaimer 2 Redistributions in binary
        form must reproduce the above copyright notice this list of
        conditions and the following disclaimer in the documentation and or
        other materials provided with the distribution 3 The end user
        documentation included with the redistribution if any must include
        the following acknowledgment <4> <This> <product> <includes>
        <software> <developed> <by> <the> <OpenSymphony> <Group> <http> <www>
        <opensymphony> <com> <5> Alternately this acknowledgment may appear
        in the software itself if and wherever such third party
        acknowledgments normally appear The names OpenSymphony and The
        OpenSymphony Group must not be used to endorse or promote products
        derived from this software without prior written permission For
        written permission please contact license opensymphony com Products
        derived from this software may not be called OpenSymphony or [OsCore]
        nor may OpenSymphony or [OsCore] appear in their name without prior
        written permission of the OpenSymphony Group THIS SOFTWARE IS
        PROVIDED AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES INCLUDING BUT
        NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
        FOR A PARTICULAR PURPOSE ARE DISCLAIMED IN NO EVENT SHALL THE APACHE
        SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT
        INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR
        SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER
        CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT
        LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY
        WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE
        POSSIBILITY OF SUCH DAMAGE
    '''
    # whitespace-insensitive comparison: only the token sequence matters
    assert expected.split() == qtext.split()
def test_template_rule_is_loaded_correctly(self):
    """Loading the template rule directory yields exactly one rule."""
    test_dir = self.get_test_loc('models/rule_template')
    loaded = list(models.load_rules(test_dir))
    assert len(loaded) == 1
def test_template_rule_is_loaded_correctly(self):
    """Loading the template rule directory yields one rule flagged as a template."""
    test_dir = self.get_test_loc('models/rule_template')
    # load_rules is treated as a lazy generator everywhere else in this
    # suite (always wrapped in list()): materialize it here too, otherwise
    # len() and indexing below would fail on a raw generator.
    rules = list(models.load_rules(test_dir))
    assert 1 == len(rules)
    rule = rules[0]
    assert rule.template
def test_load_rules_loads_file_content_at_path_and_not_path_as_string(self):
    """Rule text must come from file contents, never from the file path itself."""
    rule_dir = self.get_test_loc('models/similar_names')
    # last four tokens of each rule text: if the path were loaded as text,
    # a tail like 'rules proprietary 10 rule' would appear here
    tails = []
    for rule in models.load_rules(rule_dir):
        tokens = list(rule.tokens())
        tails.append(' '.join(tokens[-4:]))
    assert 'rules proprietary 10 rule' not in tails
def test_Rule__validate_with_false_positive_rule(self):
    """A false-positive rule yields no validation errors."""
    rule_dir = self.get_test_loc('models/rule_validate')
    first_rule = next(iter(models.load_rules(rule_dir)))
    assert [] == list(first_rule.validate())
def test_template_rule_is_loaded_correctly(self):
    """Loading the template rule directory yields exactly one rule."""
    loaded = list(models.load_rules(self.get_test_loc('models/rule_template')))
    assert 1 == len(loaded)
def build_rule_validation_tests(rules, cls):
    """
    Dynamically build one self-detection test method per rule text in
    ``rules`` and attach each method to the ``cls`` test class.
    """
    for rule in rules:
        # negative rules get no self-detection test
        if rule.negative:
            continue
        safe_name = text.python_safe_name(rule.identifier)
        test_name = 'test_validate_self_detection_of_rule_for_' + safe_name
        test_method = make_license_test_function(
            rule.licenses,
            rule.text_file,
            rule.data_file,
            test_name,
            # always True here since negative rules were skipped above
            detect_negative=not rule.negative,
            trace_text=True,
        )
        setattr(cls, test_name, test_method)


class TestValidateLicenseRuleSelfDetection(unittest.TestCase):
    # Test functions are attached to this class at import time
    pass


build_rule_validation_tests(models.load_rules(), TestValidateLicenseRuleSelfDetection)
class TestValidateLicenseTextDetection(unittest.TestCase):
    # Test functions are attached to this class at import time
    pass


build_license_validation_tests(cache.get_licenses_db(), TestValidateLicenseTextDetection)


def build_rule_validation_tests(rules, cls):
    """
    Dynamically build one self-detection test method per rule text in
    ``rules`` and attach each method to the ``cls`` test class.
    """
    for rule in rules:
        # negative rules get no self-detection test
        if rule.negative:
            continue
        safe_name = text.python_safe_name(rule.identifier)
        test_name = 'test_validate_self_detection_of_rule_for_' + safe_name
        test_method = make_license_test_function(
            rule.licenses,
            rule.text_file,
            rule.data_file,
            test_name,
            # always True here since negative rules were skipped above
            detect_negative=not rule.negative,
            trace_text=True,
        )
        setattr(cls, test_name, test_method)


class TestValidateLicenseRuleSelfDetection(unittest.TestCase):
    # Test functions are attached to this class at import time
    pass


build_rule_validation_tests(models.load_rules(), TestValidateLicenseRuleSelfDetection)
def cli(licenses, rules, category, license_key, with_text):
    """
    Write Licenses/Rules from scancode into a CSV file with all details.
    Output can be optionally filtered by category/license-key.
    """
    licenses_output = []
    rules_output = []
    licenses_data = load_licenses()
    if licenses:
        # build one row per license, enriched with derived columns
        for lic in licenses_data.values():
            license_data = lic.to_dict()
            if with_text:
                # keep only a short preview of the text in the CSV
                license_data["text"] = lic.text[:200]
            license_data["is_unknown"] = lic.is_unknown
            # NOTE(review): named "words_count" but computed as len(lic.text),
            # i.e. a character count -- confirm intent
            license_data["words_count"] = len(lic.text)
            license_data["reference_url"] = SCANCODE_LICENSEDB_URL.format(lic.key)
            licenses_output.append(license_data)
        if category:
            licenses_output = filter_by_attribute(
                data=licenses_output,
                attribute="category",
                required_key=category,
            )
        if license_key:
            licenses_output = filter_by_attribute(
                data=licenses_output,
                attribute="key",
                required_key=license_key,
            )
        licenses_output = flatten_output(data=licenses_output)
        write_data_to_csv(
            data=licenses_output,
            output_csv=licenses,
            fieldnames=LICENSES_FIELDNAMES,
        )
    if rules:
        # build one row per rule, enriched with derived columns
        rules_data = list(load_rules())
        for rule in rules_data:
            rule_data = rule.to_dict()
            rule_data["identifier"] = rule.identifier
            rule_data["referenced_filenames"] = rule.referenced_filenames
            if with_text:
                rule_data["text"] = rule.text()[:200]
            rule_data["has_unknown"] = rule.has_unknown
            # NOTE(review): same as above -- len() of the text, a character count
            rule_data["words_count"] = len(rule.text())
            try:
                rule_data["category"] = licenses_data[rule_data["license_expression"]].category
            except KeyError:
                # the license_expression is presumably not always a plain
                # license key (e.g. a compound expression); category is
                # then left unset
                pass
            rules_output.append(rule_data)
        if category:
            rules_output = filter_by_attribute(
                data=rules_output,
                attribute="category",
                required_key=category,
            )
        if license_key:
            rules_output = filter_by_attribute(
                data=rules_output,
                attribute="license_expression",
                required_key=license_key,
            )
        rules_output = flatten_output(rules_output)
        write_data_to_csv(
            data=rules_output,
            output_csv=rules,
            fieldnames=RULES_FIELDNAMES,
        )