def get_licenses(self, scancode_licenses=None, **kwargs):
    """
    Fetch the external licenses, store each of them under
    `self.original_dir`, copy that directory to `self.update_dir` and return
    the licenses loaded back from `self.update_dir` with `load_licenses()`
    (deprecated licenses included).

    `scancode_licenses` and any extra keyword arguments are passed through
    unchanged to `self.fetch_licenses()`.
    """
    print('Fetching and storing external licenses in:', self.original_dir)

    fetched = self.fetch_licenses(scancode_licenses=scancode_licenses, **kwargs)
    stored = []
    for license_object, license_text in fetched:
        try:
            with io.open(license_object.text_file, 'w', encoding='utf-8') as text_out:
                text_out.write(license_text)
            models.update_ignorables(license_object, verbose=False)
            license_object.dump()
            stored.append(license_object)
        except BaseException:
            # Show which license failed (trace mode only), then let the
            # error propagate untouched.
            if TRACE:
                print()
                print(repr(license_object))
            raise

    print(f'Stored {len(stored):d} external licenses in: {self.original_dir!r}.')

    print(f'Modified (or not modified) external licenses will be in: {self.update_dir!r}.')
    fileutils.copytree(self.original_dir, self.update_dir)

    print(f'New external licenses will be in: {self.new_dir!r}.')

    return load_licenses(self.update_dir, with_deprecated=True)
def _clean(licenses): for lic in licenses.values(): updated = False if lic.standard_notice: updated = True lic.standard_notice = clean_text(lic.standard_notice) if lic.notes: updated = True lic.notes = clean_text(lic.notes) if updated: models.update_ignorables(lic, verbose=False) lic.dump()
def cli(source, replacement):
    """
    Derive new license detection rules from existing ones: any rule whose
    text contains the SOURCE string yields a candidate rule where SOURCE is
    replaced by the REPLACEMENT string.

    A candidate whose text already exists as a rule is skipped.
    """
    for original_rule, replaced_text in get_rules(source, replacement):
        if rule_exists(replaced_text):
            continue

        base_name = (
            'license-intro'
            if original_rule.is_license_intro
            else original_rule.license_expression
        )
        base_loc = find_rule_base_loc(base_name)

        rule_kwargs = original_rule.to_dict()
        rule_kwargs.update(
            stored_text=replaced_text,
            has_stored_relevance=original_rule.has_stored_relevance,
            has_stored_minimum_coverage=original_rule.has_stored_minimum_coverage,
        )
        new_rule = models.Rule(**rule_kwargs)

        # force recomputing relevance to remove junk stored relevance for long rules
        new_rule.set_relevance()

        new_rule.data_file = base_loc + '.yml'
        new_rule.text_file = base_loc + '.RULE'

        print('Adding new rule:')
        for location in (new_rule.data_file, new_rule.text_file):
            print(' file://' + location)

        # The rule is dumped once before and once after the ignorables are
        # updated.
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        new_rule.dump()
def refresh_ignorables(licensishes):
    """
    Update the ignorables of every object in `licensishes`, visited in
    sorted order, and dump the object returned by
    `models.update_ignorables()`.

    A running counter is printed on a single line as progress output.
    """
    for position, licensish in enumerate(sorted(licensishes)):
        print(position, end=' ')
        models.update_ignorables(licensish, verbose=True).dump()
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    Each block becomes a rule that is validated with
    `models.validate_rules()`. A rule is skipped, with a message, when its
    text already exists as a rule or when its token sequence is already
    known; otherwise it is saved as a new `.yml`/`.RULE` file pair.

    As an example a file would contain one or more blocks such as this:

    \b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    rules_data = load_data(licenses_file)
    rule_by_tokens = all_rule_by_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    for rdata in rules_data:
        # The "has_stored_*" flags record whether the block carried an
        # explicit (truthy) value for the matching field.
        relevance = rdata.data.get("relevance")
        rdata.data["has_stored_relevance"] = bool(relevance)

        license_expression = rdata.data.get("license_expression")
        if license_expression:
            rdata.data["license_expression"] = license_expression.lower().strip()

        minimum_coverage = rdata.data.get("minimum_coverage")
        rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:

        if rule.is_false_positive:
            base_name = "false-positive"
        elif rule.is_license_intro:
            base_name = "license-intro"
        else:
            base_name = rule.license_expression

        text = rule.text()

        existing_rule = rule_exists(text)

        # Short, single-spaced excerpt used in the "skipping" messages.
        # Braces are still blanked out so the printed output is unchanged.
        skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")

        # The message parts are interpolated exactly once. The previous
        # f-string + ``.format(**locals())`` combination re-parsed the
        # already interpolated text and would fail on any brace in it.
        skip_head = f"Skipping rule for: {base_name!r}, dupe of: "
        skip_tail = f" with text: {skinny_text!r}..."

        if existing_rule:
            print(f"{skip_head}{existing_rule}{skip_tail}")
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd["stored_text"] = rule.stored_text
        rd["has_stored_relevance"] = rule.has_stored_relevance
        rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + ".yml"
        rulerec.text_file = base_loc + ".RULE"

        # Second duplicate check, this time on the token sequence.
        rule_tokens = tuple(rulerec.tokens())

        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(f"{skip_head}{existing_rule}{skip_tail}")
            continue

        print(f"Adding new rule: {base_name}")
        print(" file://" + rulerec.data_file)
        print(" file://" + rulerec.text_file)
        # The rule is dumped once before and once after the ignorables are
        # updated.
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        # Remember the tokens so a later identical block in the same file is
        # reported as a dupe.
        rule_by_tokens[rule_tokens] = base_name
def cli(licenses_file):
    """
    Build rules out of a text file made of delimited blocks, each holding
    some metadata followed by a rule text.

    Blocks whose text already exists as a rule, or whose token sequence is
    already known, are skipped with a message. Here is an example of such
    a block:

    \b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    parsed_blocks = load_data(licenses_file)
    known_tokens = all_rule_tokens()
    licenses_db = cache.get_licenses_db()

    candidates = []
    for block in parsed_blocks:
        block_data = block.data
        block_data['has_stored_relevance'] = bool(block_data.get('relevance'))
        block_data['has_stored_minimum_coverage'] = bool(block_data.get('minimum_coverage'))

        candidate = models.BasicRule(**block_data)
        candidate.stored_text = block.text
        candidates.append(candidate)

    models.validate_rules(candidates, licenses_db, with_text=True)

    print()
    for candidate in candidates:
        already_there = rule_exists(candidate.text())
        if already_there:
            print(
                'Skipping existing rule:', already_there,
                'with text:\n', candidate.text()[:50].strip(), '...',
            )
            continue

        if candidate.is_false_positive:
            base_name = 'false-positive'
        elif candidate.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = candidate.license_expression

        base_loc = find_rule_base_loc(base_name)

        rule_kwargs = candidate.to_dict()
        rule_kwargs.update(
            stored_text=candidate.stored_text,
            has_stored_relevance=candidate.has_stored_relevance,
            has_stored_minimum_coverage=candidate.has_stored_minimum_coverage,
        )
        new_rule = models.Rule(**rule_kwargs)

        # force recomputing relevance to remove junk stored relevance for long rules
        new_rule.compute_relevance(_threshold=18.0)

        new_rule.data_file = base_loc + '.yml'
        new_rule.text_file = base_loc + '.RULE'

        token_key = tuple(new_rule.tokens())
        if token_key in known_tokens:
            print('Skipping already added rule with text for:', base_name)
            continue

        print('Adding new rule:')
        for location in (new_rule.data_file, new_rule.text_file):
            print(' file://' + location)

        known_tokens.add(token_key)
        # The rule is dumped once before and once after the ignorables are
        # updated.
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        new_rule.dump()
def cli(licenses_file):
    """
    Build rules out of a structured text file.

    Each block of the file holds metadata lines followed by a rule text.
    The license expression is read from the first metadata line unless the
    block is flagged `is_negative`. Example block:

        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    blocks = load_data(licenses_file)
    seen_tokens = set()

    licenses_db = cache.get_licenses_db()
    licensing = Licensing(licenses_db.values())

    print()
    for data_lines, text_lines in blocks:
        raw_data = '\n'.join(data_lines)
        raw_text = '\n'.join(text_lines)

        already_there = rule_exists(raw_text)
        if already_there:
            print(
                'Skipping existing rule:', already_there,
                'with text:\n', raw_text[:50].strip(), '...',
            )
            continue

        # validate YAML syntax
        metadata = saneyaml.load(raw_data)
        if metadata.get('is_negative'):
            license_expression = 'not-a-license'
        else:
            # the expression is whatever follows ": " on the first data line
            license_expression = data_lines[0].partition(': ')[2].strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', raw_text)
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as data_out:
            data_out.write(raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as text_out:
            text_out.write(raw_text)

        new_rule = models.Rule(data_file=data_file, text_file=text_file)
        token_key = tuple(new_rule.tokens())

        if token_key in seen_tokens:
            # cleanup: the same tokens were already added during this run,
            # so drop the two files that were just written
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', license_expression)
            continue

        seen_tokens.add(token_key)
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=True)
        print('Rule added:', new_rule.identifier)
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    All the blocks are first checked with `validate_license_rules()`; an
    Exception is raised if any error is reported. Each block is then written
    as a `.yml`/`.RULE` file pair, unless its text already exists as a rule
    (non-negative rules only) or its token sequence is already known.

    As an example a file would contain one or more blocks such as this:

    \b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    rules_data = load_data(licenses_file)
    rules_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    # Validate everything up front: nothing is written if any block is invalid.
    errors = validate_license_rules(rules_data, licensing)
    if errors:
        print('Invalid rules: exiting....')
        for error in errors:
            print(error)
        print()

        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')
        existing = rule_exists(rule.text)
        # Negative rules are not skipped here even when their text exists.
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing, 'with text:\n', rule.text[:50].strip(), '...')
            continue

        # The base name drives the file names of the new rule.
        if is_negative:
            base_name = 'not-a-license'
        else:
            license_expression = rule.data.get('license_expression')
            # use the string form of the parsed (and validated) expression
            license_expression = str(
                licensing.parse(license_expression, validate=True, simple=True))
            base_name = license_expression
            # NOTE(review): the prefix is applied to non-negative rules only
            # in this layout — confirm against the intended nesting.
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)

        # Write the raw metadata and the text first: the Rule below is built
        # from these two files.
        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            # cleanup: same tokens as a known rule, remove the files just written
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
        else:
            rules_tokens.add(rule_tokens)
            rulerec.dump()
            # NOTE(review): there is no dump() after this call; presumably
            # update_ignorables() persists its changes itself — confirm.
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
def synchronize_licenses(scancode_licenses, external_source,
                         use_spdx_key=False, match_text=False,
                         match_approx=False, commitish=None):
    """
    Update the `scancode_licenses` ScanCodeLicenses licenses and texts in-place
    (e.g. in their current storage directory) from an `external_source`
    ExternalLicensesSource.

    Arguments:
    - `use_spdx_key`: if true, licenses on both sides are matched on their
      SPDX key rather than on their key.
    - `match_text`: if true, an external license whose key is unknown to
      ScanCode is matched on its text with `get_key_through_text_match()`.
    - `match_approx`: passed to `get_key_through_text_match()`.
    - `commitish`: passed to `external_source.get_licenses()`.

    Return a list of the license objects that were added to the external
    source.

    Also update the external_source licenses this way:
    - original licenses from the external source are left unmodified in the
      original sub dir
    - new licenses found in scancode that are not in the external source are
      created in the new sub dir
    - external licenses that are modified from Scancode are created in the
      updated sub dir
    - external licenses that are not found in Scancode are created in the
      deleted sub dir

    The process is this:
        fetch external remote licenses
        build external license objects in memory, save in the original sub
        directory
        load scancode licenses

        for each scancode license:
            find a possible exact key match with an external license
            if there is a match, update and save the external license object
            in an "updated" directory
            if there is no match, save the external license object in a "new"
            directory

        for each external license:
            find a possible exact key or spdx key match with a scancode license
            if there is a match, update and save the scancode license object
            if there is no match, create and save a new scancode license object

        then later:
            find a possible license text match with a license
    """
    if TRACE:
        print('synchronize_licenses using SPDX keys:', use_spdx_key)

    # mappings of key -> License
    scancodes_by_key = scancode_licenses.by_key
    externals_by_key = external_source.get_licenses(scancode_licenses, commitish=commitish)

    if use_spdx_key:
        # re-key both sides on the SPDX key
        scancodes_by_key = scancode_licenses.by_spdx_key
        externals_by_key = get_licenses_by_spdx_key(externals_by_key.values())

    # NOTE(review): this mapping is not read again in this function.
    externals_by_spdx_key = get_licenses_by_spdx_key(externals_by_key.values())

    # track changes with sets of license keys
    same = set()
    added_to_scancode = set()
    added_to_external = set()
    updated_in_scancode = set()
    updated_in_external = set()

    # ScanCode licenses with no external counterpart, keyed by ScanCode key
    unmatched_scancode_by_key = {}

    # FIXME: track deprecated
    # removed = set()

    # 1. iterate scancode licenses and compare with other
    for matching_key, scancode_license in scancodes_by_key.items():

        # progress dots, only when not tracing
        if not TRACE:print('.', end='')

        # does this scancode license exists in others based on the matching key?
        external_license = externals_by_key.get(matching_key)
        if not external_license:
            if TRACE_DEEP:
                print('ScanCode license not in External:', matching_key)
            # always recorded under the ScanCode key, even in SPDX mode
            unmatched_scancode_by_key[scancode_license.key] = scancode_license
            continue

        # the matching key exists on both sides: merge/update both licenses
        scancode_updated, external_updated = merge_licenses(
            scancode_license, external_license,
            external_source.updatable_attributes,
            from_spdx=use_spdx_key)

        if not scancode_updated and not external_updated:
            if TRACE_DEEP:
                print('License attributes are identical:', matching_key)
            same.add(matching_key)

        # each update entry unpacks as (attribute name, old value, new value)
        if scancode_updated:
            if TRACE:
                print('ScanCode license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
            for attrib, oldv, newv in scancode_updated:
                if TRACE:
                    print(' %(attrib)s: %(oldv)r -> %(newv)r' % locals())
            updated_in_scancode.add(matching_key)

        if external_updated:
            if TRACE:
                print('External license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
            for attrib, oldv, newv in external_updated:
                if TRACE:
                    print(' %(attrib)s: %(oldv)r -> %(newv)r' % locals())
            updated_in_external.add(matching_key)

    # The string below is disabled code kept as a no-op literal.
    """
    if not external_license:
        matched_key = get_key_through_text_match(
            matching_key, scancode_license.text,
            scancode_licenses,
            match_approx=True)
        if matched_key:
            print('\nScanCode license not in External:', matching_key, 'but matched to:', matched_key)
            external_license
        else:
            print('\nScanCode license not in External:', matching_key, ' and added to external')
            external_license = scancode_license.relocate(external_source.new_dir)
            added_to_external.add(matching_key)
            externals_by_key[matching_key] = external_license
        continue
    """

    # 2. iterate other licenses and compare with ScanCode
    if TRACE:
        print()
    for matching_key, external_license in externals_by_key.items():
        # does this key exists in scancode?
        scancode_license = scancodes_by_key.get(matching_key)
        if scancode_license:
            # we already dealt with this in the first loop
            continue

        if not TRACE:
            print('.', end='')

        if match_text:
            # the key is unknown to ScanCode: try to find the same license
            # under another key through its text
            matched_key = get_key_through_text_match(
                matching_key, external_license.text,
                scancode_licenses,
                match_approx=match_approx)
            if TRACE:
                print('External license with different key:', matching_key, 'and text matched to ScanCode key:', matched_key)

            if matched_key:
                print('External license with different key:', matching_key, 'and text matched to ScanCode key:', matched_key)
                # the text-matched ScanCode license is no longer unmatched
                if matched_key in unmatched_scancode_by_key:
                    del unmatched_scancode_by_key[matched_key]

                scancode_license = scancodes_by_key.get(matched_key)
                if TRACE:
                    print('scancode_license:', matching_key, scancode_license)

                scancode_updated, external_updated = merge_licenses(
                    scancode_license=scancode_license,
                    external_license=external_license,
                    updatable_attributes=external_source.updatable_attributes,
                    from_spdx=use_spdx_key,
                )

                if not scancode_updated and not external_updated:
                    if TRACE_DEEP:
                        print('License attributes are identical:', matching_key)
                    same.add(matching_key)

                # NOTE(review): changes are recorded under the external
                # `matching_key`, not under `matched_key` — confirm that the
                # final write loop can resolve that key in `scancodes_by_key`.
                if scancode_updated:
                    if TRACE:
                        print('ScanCode license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
                    for attrib, oldv, newv in scancode_updated:
                        if TRACE:
                            print(' %(attrib)s: %(oldv)r -> %(newv)r' % locals())
                    updated_in_scancode.add(matching_key)

                if external_updated:
                    if TRACE:
                        print('External license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
                    for attrib, oldv, newv in external_updated:
                        if TRACE:
                            print(' %(attrib)s: %(oldv)r -> %(newv)r' % locals())
                    updated_in_external.add(matching_key)

        else:
            # NOTE(review): in this layout the branch runs only when
            # `match_text` is false — confirm which `if` this `else` pairs with.
            # Create a new ScanCode license
            scancode_license = external_license.relocate(licensedcode.models.licenses_data_dir, matching_key)
            added_to_scancode.add(matching_key)
            scancodes_by_key[matching_key] = scancode_license
            if TRACE:
                print('External license key not in ScanCode:', matching_key, 'created in ScanCode.', 'SPDX:', use_spdx_key)

    # 3. For scancode licenses that were not matched to anything in external add them in external
    if TRACE:
        print()
        print('Processing unmatched_scancode_by_key.')
    for lkey, scancode_license in unmatched_scancode_by_key.items():
        # hard-coded key that is never added to the external source
        if lkey in set(['here-proprietary']):
            continue
        # deprecated ScanCode licenses are not added either
        if scancode_license.is_deprecated:
            continue
        external_license = scancode_license.relocate(external_source.new_dir)
        added_to_external.add(lkey)
        externals_by_key[lkey] = external_license
        if TRACE:
            print('ScanCode license key not in External:', lkey, 'created in External.')

    # finally write changes in place for updates and news
    for k in updated_in_scancode | added_to_scancode:
        lic = scancodes_by_key[k]
        models.update_ignorables(lic, verbose=False)
        lic.dump()

    # external licenses are dumped without refreshing their ignorables
    for k in updated_in_external | added_to_external:
        lic = externals_by_key[k]
        # models.update_ignorables(lic, verbose=False)
        lic.dump()

    # TODO: at last: print report of incorrect OTHER licenses to submit
    # updates eg. make API calls to DejaCode to create or update
    # licenses and submit review request e.g. submit requests to SPDX
    # for addition
    for key in sorted(added_to_external):
        lic = externals_by_key[key]
        if not lic.owner:
            print('New external license without owner:', key)

    # summary counts
    print()
    print('#####################################################')
    print('Same licenses: ', len(same))
    print('Add to ScanCode: ', len(added_to_scancode))
    print('Updated in ScanCode: ', len(updated_in_scancode))
    print('Added to External:: ', len(added_to_external))
    print('Updated in External: ', len(updated_in_external))
    print('#####################################################')

    return [externals_by_key[k] for k in added_to_external]