Exemplo n.º 1
0
    def get_licenses(self, scancode_licenses=None, **kwargs):
        """
        Fetch external licenses through `self.fetch_licenses`, write each
        license text to its `text_file`, refresh its ignorables and dump it.
        Then copy `self.original_dir` to `self.update_dir`.

        Return the licenses loaded back from `self.update_dir` by
        `load_licenses`, deprecated ones included (documented by the original
        author as a mapping of key -> ScanCode License objects).
        """
        print('Fetching and storing external licenses in:', self.original_dir)

        stored_count = 0
        fetched = self.fetch_licenses(scancode_licenses=scancode_licenses, **kwargs)
        for license_obj, license_text in fetched:
            try:
                with io.open(license_obj.text_file, 'w', encoding='utf-8') as text_out:
                    text_out.write(license_text)
                models.update_ignorables(license_obj, verbose=False)
                license_obj.dump()
                stored_count += 1
            except:
                # When tracing, show which license failed before propagating
                # the error unchanged.
                if TRACE:
                    print()
                    print(repr(license_obj))
                raise

        print('Stored %d external licenses in: %r.' % (stored_count, self.original_dir))

        print('Modified (or not modified) external licenses will be in: %r.' % (self.update_dir,))
        fileutils.copytree(self.original_dir, self.update_dir)

        print('New external licenses will be in: %r.' % (self.new_dir,))

        return load_licenses(self.update_dir, with_deprecated=True)
Exemplo n.º 2
0
        def _clean(licenses):
            """
            Run `clean_text` on the non-empty `standard_notice` and `notes` of
            every license in the `licenses` mapping. Licenses that had at least
            one of these set get their ignorables refreshed and are dumped.
            """
            for entry in licenses.values():
                needs_dump = False
                for attr_name in ('standard_notice', 'notes'):
                    current = getattr(entry, attr_name)
                    if not current:
                        continue
                    setattr(entry, attr_name, clean_text(current))
                    needs_dump = True

                if needs_dump:
                    models.update_ignorables(entry, verbose=False)
                    entry.dump()
Exemplo n.º 3
0
def cli(source, replacement):
    """
    Create new license detection rules from existing rules by replacing a SOURCE
    string by a REPLACEMENT string in any rule text that contains this SOURCE string.
    """
    # NOTE(review): the docstring above is presumably the command help text, so
    # it is kept verbatim.
    for rule, new_text in get_rules(source, replacement):
        # Nothing to add when a rule with this exact text already exists.
        if rule_exists(new_text):
            continue

        base_name = 'license-intro' if rule.is_license_intro else rule.license_expression
        base_loc = find_rule_base_loc(base_name)

        # Start from the source rule attributes and override the text-related
        # ones for the new rule.
        attributes = rule.to_dict()
        attributes.update(
            stored_text=new_text,
            has_stored_relevance=rule.has_stored_relevance,
            has_stored_minimum_coverage=rule.has_stored_minimum_coverage,
        )
        new_rule = models.Rule(**attributes)

        # force recomputing relevance to remove junk stored relevance for long rules
        new_rule.set_relevance()

        new_rule.data_file = base_loc + '.yml'
        new_rule.text_file = base_loc + '.RULE'

        print('Adding new rule:')
        print('  file://' + new_rule.data_file)
        print('  file://' + new_rule.text_file)

        # Dump once so the files exist, refresh the ignorables, then dump
        # again to save the rule after the refresh.
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        new_rule.dump()
Exemplo n.º 4
0
def refresh_ignorables(licensishes):
    """
    Refresh the ignorables of each of the `licensishes` in sorted order and
    dump the object returned by `models.update_ignorables`, printing a running
    index as progress.
    """
    for position, licensish in enumerate(sorted(licensishes)):
        print(position, end=' ')
        models.update_ignorables(licensish, verbose=True).dump()
Exemplo n.º 5
0
def cli(licenses_file):
    """
        Create rules from a text file with delimited blocks of metadata and texts.

        As an example a file would contains one of more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """
    # NOTE(review): the docstring above is presumably the command help text, so
    # it is kept verbatim.

    def skip_message(name, dupe, preview):
        # Build the "duplicate skipped" message in a single formatting pass.
        # The previous code ran an already-interpolated f-string through
        # str.format(**locals()), which would choke on any brace in `name`.
        return (f"Skipping rule for: {name!r}, "
                f"dupe of: {dupe} "
                f"with text: {preview!r}...")

    rules_data = load_data(licenses_file)
    rule_by_tokens = all_rule_by_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    # First pass: normalize the loaded metadata and build lightweight rules
    # that can be validated as a whole before anything is written.
    for rdata in rules_data:
        relevance = rdata.data.get("relevance")
        rdata.data["has_stored_relevance"] = bool(relevance)

        license_expression = rdata.data.get("license_expression")
        if license_expression:
            rdata.data["license_expression"] = license_expression.lower().strip()

        minimum_coverage = rdata.data.get("minimum_coverage")
        rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    # Second pass: create and save each rule that is not a duplicate.
    for rule in skinny_rules:

        if rule.is_false_positive:
            base_name = "false-positive"
        elif rule.is_license_intro:
            base_name = "license-intro"
        else:
            base_name = rule.license_expression

        text = rule.text()

        # Short single-line preview of the rule text. Braces are still blanked
        # so that the printed preview stays the same as before.
        skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")

        # Duplicate check 1: same text as an existing rule.
        existing_rule = rule_exists(text)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd["stored_text"] = rule.stored_text
        rd["has_stored_relevance"] = rule.has_stored_relevance
        rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + ".yml"
        rulerec.text_file = base_loc + ".RULE"

        rule_tokens = tuple(rulerec.tokens())

        # Duplicate check 2: same token sequence as a known rule, including
        # rules added earlier in this run.
        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        print(f"Adding new rule: {base_name}")
        print("  file://" + rulerec.data_file)
        print("  file://" + rulerec.text_file)

        # Dump once so the files exist, refresh the ignorables, then dump
        # again to save the rule after the refresh.
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        rule_by_tokens[rule_tokens] = base_name
Exemplo n.º 6
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is presumably the command help text, so
    # it is kept verbatim.
    parsed_blocks = load_data(licenses_file)
    known_tokens = all_rule_tokens()
    licenses_by_key = cache.get_licenses_db()

    # Build lightweight rules first so they can all be validated together
    # before anything is written.
    candidates = []
    for block in parsed_blocks:
        attributes = block.data
        attributes['has_stored_relevance'] = bool(attributes.get('relevance'))
        attributes['has_stored_minimum_coverage'] = bool(attributes.get('minimum_coverage'))

        candidate = models.BasicRule(**attributes)
        candidate.stored_text = block.text
        candidates.append(candidate)

    models.validate_rules(candidates, licenses_by_key, with_text=True)

    print()
    for candidate in candidates:
        duplicate = rule_exists(candidate.text())
        if duplicate:
            print('Skipping existing rule:', duplicate, 'with text:\n',
                  candidate.text()[:50].strip(), '...')
            continue

        # The base name determines the location of the new rule files.
        if candidate.is_false_positive:
            base_name = 'false-positive'
        else:
            base_name = (
                'license-intro' if candidate.is_license_intro
                else candidate.license_expression
            )

        base_loc = find_rule_base_loc(base_name)

        attributes = candidate.to_dict()
        attributes.update(
            stored_text=candidate.stored_text,
            has_stored_relevance=candidate.has_stored_relevance,
            has_stored_minimum_coverage=candidate.has_stored_minimum_coverage,
        )
        new_rule = models.Rule(**attributes)

        # force recomputing relevance to remove junk stored relevance for long rules
        new_rule.compute_relevance(_threshold=18.0)

        new_rule.data_file = base_loc + '.yml'
        new_rule.text_file = base_loc + '.RULE'

        tokens = tuple(new_rule.tokens())

        # Same token sequence as a known rule (or one added earlier in this
        # run): report and move on.
        if tokens in known_tokens:
            print('Skipping already added rule with text for:', base_name)
            continue

        print('Adding new rule:')
        print('  file://' + new_rule.data_file)
        print('  file://' + new_rule.text_file)
        known_tokens.add(tokens)

        # Dump once so the files exist, refresh the ignorables, then dump
        # again to save the rule after the refresh.
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        new_rule.dump()
Exemplo n.º 7
0
def cli(licenses_file):
    """
    Create rules from a structured text file

    For instance:
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is presumably the command help text, so
    # it is kept verbatim.
    rule_data = load_data(licenses_file)
    seen_tokens = set()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    for data_lines, text_lines in rule_data:
        yaml_text = '\n'.join(data_lines)
        rule_text = '\n'.join(text_lines)

        found = rule_exists(rule_text)
        if found:
            print('Skipping existing rule:', found, 'with text:\n', rule_text[:50].strip(), '...')
            continue

        # Loading the metadata also validates its YAML syntax.
        if saneyaml.load(yaml_text).get('is_negative'):
            license_expression = 'not-a-license'
        else:
            # The expression is taken from the first metadata line, after the
            # first ': ' separator.
            license_expression = data_lines[0].partition(': ')[2].strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', rule_text)
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)
        data_file = base_loc + '.yml'
        text_file = base_loc + '.RULE'

        # The Rule below is built from these two files, so write them first.
        for path, content in ((data_file, yaml_text), (text_file, rule_text)):
            with io.open(path, 'w', encoding='utf-8') as out:
                out.write(content)

        rule = models.Rule(data_file=data_file, text_file=text_file)
        tokens = tuple(rule.tokens())

        if tokens in seen_tokens:
            # Same tokens as a rule added earlier in this run: remove the
            # files that were just written.
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', license_expression)
            continue

        seen_tokens.add(tokens)
        rule.dump()
        models.update_ignorables(rule, verbose=True)
        print('Rule added:', rule.identifier)
Exemplo n.º 8
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # NOTE(review): the docstring above is presumably the command help text, so
    # it is kept verbatim.
    rules_data = load_data(licenses_file)
    known_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    # Refuse to create anything when any rule fails validation.
    problems = validate_license_rules(rules_data, licensing)
    if problems:
        print('Invalid rules: exiting....')
        for problem in problems:
            print(problem, end='\n\n')

        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        metadata = rule.data
        is_negative = metadata.get('is_negative')
        is_false_positive = metadata.get('is_false_positive')

        # Negative rules are created even when their text already exists.
        duplicate = rule_exists(rule.text)
        if duplicate and not is_negative:
            print('Skipping existing non-negative rule:', duplicate,
                  'with text:\n', rule.text[:50].strip(), '...')
            continue

        if is_negative:
            base_name = 'not-a-license'
        else:
            parsed_expression = licensing.parse(
                metadata.get('license_expression'), validate=True, simple=True)
            base_name = str(parsed_expression)
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)
        data_file = base_loc + '.yml'
        text_file = base_loc + '.RULE'

        # The Rule below is built from these two files, so write them first.
        for path, content in ((data_file, rule.raw_data), (text_file, rule.text)):
            with io.open(path, 'w', encoding='utf-8') as out:
                out.write(content)

        new_rule = models.Rule(data_file=data_file, text_file=text_file)
        tokens = tuple(new_rule.tokens())

        if tokens in known_tokens:
            # Same tokens as a known rule: remove the files just written.
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
            continue

        known_tokens.add(tokens)
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        print(
            'Rule added:',
            'file://' + new_rule.data_file,
            '\n',
            'file://' + new_rule.text_file,
        )
Exemplo n.º 9
0
def synchronize_licenses(scancode_licenses, external_source, use_spdx_key=False,
                         match_text=False, match_approx=False, commitish=None):
    """
    Update the `scancode_licenses` ScanCodeLicenses licenses and texts in-place
    (e.g. in their current storage directory) from an `external_source`
    ExternalLicensesSource.

    Return a list of the external License objects that were added to the
    external source.

    Arguments as used by this function:
    - `scancode_licenses`: provides the `by_key` and `by_spdx_key` mappings.
    - `external_source`: provides `get_licenses()`, `updatable_attributes`
      and `new_dir`.
    - `use_spdx_key`: match licenses on their SPDX key rather than their key.
    - `match_text`: for external licenses whose key is unknown to ScanCode,
      try to find a ScanCode license through a text match instead of creating
      a new ScanCode license.
    - `match_approx`: passed to the text matching.
    - `commitish`: passed to `external_source.get_licenses()`.

    Also update the external_source licenses this way:
    - original licenses from the external source are left unmodified in the original sub dir
    - new licenses found in scancode that are not in the external source are created in the new sub dir
    - external licenses that are modified from Scancode are created in the updated sub dir
    - external licenses that are not found in Scancode are created in the deleted sub dir

    The process is this:
    fetch external remote licenses
    build external license objects in memory, save in the original sub directory
    load scancode licenses

    for each scancode license:
        find a possible exact key match with an external license
        if there is a match, update and save the external license object in an "updated" directory
        if there is no match, save the external license object in a "new" directory

    for each external license:
        find a possible exact key or spdx key match with a scancode license
        if there is a match, update and save the scancode license object
        if there is no match, create and save a new scancode license object

    then later:
        find a possible license text match with a license

    """
    if TRACE: print('synchronize_licenses using SPDX keys:', use_spdx_key)

    # mappings of key -> License
    scancodes_by_key = scancode_licenses.by_key
    externals_by_key = external_source.get_licenses(scancode_licenses, commitish=commitish)

    if use_spdx_key:
        scancodes_by_key = scancode_licenses.by_spdx_key
        externals_by_key = get_licenses_by_spdx_key(externals_by_key.values())

    # NOTE(review): this mapping is not used below; the call is kept in case
    # get_licenses_by_spdx_key() validates its input -- confirm before removing.
    externals_by_spdx_key = get_licenses_by_spdx_key(externals_by_key.values())

    # track changes with sets of license keys
    same = set()
    added_to_scancode = set()
    added_to_external = set()
    updated_in_scancode = set()
    updated_in_external = set()

    # ScanCode licenses with no external counterpart, keyed by license key
    unmatched_scancode_by_key = {}

    # FIXME: track deprecated
    # removed = set()

    # 1. iterate scancode licenses and compare with other
    for matching_key, scancode_license in scancodes_by_key.items():

        if not TRACE:print('.', end='')

        # does this scancode license exists in others based on the matching key?
        external_license = externals_by_key.get(matching_key)
        if not external_license:
            if TRACE_DEEP: print('ScanCode license not in External:', matching_key)
            unmatched_scancode_by_key[scancode_license.key] = scancode_license
            continue

        # the matching key exists on both sides: merge/update both licenses
        scancode_updated, external_updated = merge_licenses(
            scancode_license, external_license,
            external_source.updatable_attributes,
            from_spdx=use_spdx_key)

        if not scancode_updated and not external_updated:
            if TRACE_DEEP: print('License attributes are identical:', matching_key)
            same.add(matching_key)

        if scancode_updated:
            if TRACE: print('ScanCode license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
            for attrib, oldv, newv in scancode_updated:
                if TRACE: print('  %(attrib)s: %(oldv)r -> %(newv)r' % locals())
            updated_in_scancode.add(matching_key)

        if external_updated:
            if TRACE: print('External license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
            for attrib, oldv, newv in external_updated:
                if TRACE: print('  %(attrib)s: %(oldv)r -> %(newv)r' % locals())
            updated_in_external.add(matching_key)

    # NOTE: the string below is disabled code kept by the original author as a
    # no-op expression statement; it is never executed.
    """
        if not external_license:
            matched_key = get_key_through_text_match(
                matching_key, scancode_license.text,
                scancode_licenses,
                match_approx=True)
            if matched_key:
                print('\nScanCode license not in External:', matching_key, 'but matched to:', matched_key)
                external_license
            else:
                print('\nScanCode license not in External:', matching_key, ' and added to external')
                external_license = scancode_license.relocate(external_source.new_dir)
                added_to_external.add(matching_key)
                externals_by_key[matching_key] = external_license
                continue
"""
    # 2. iterate other licenses and compare with ScanCode
    if TRACE: print()
    for matching_key, external_license in externals_by_key.items():
        # does this key exists in scancode?
        scancode_license = scancodes_by_key.get(matching_key)
        if scancode_license:
            # we already dealt with this in the first loop
            continue

        if not TRACE: print('.', end='')

        if match_text:

            matched_key = get_key_through_text_match(
                matching_key, external_license.text,
                scancode_licenses,
                match_approx=match_approx)
            if TRACE:
                print('External license with different key:', matching_key, 'and text matched to ScanCode key:', matched_key)

            if matched_key:
                print('External license with different key:', matching_key, 'and text matched to ScanCode key:', matched_key)
                if matched_key in unmatched_scancode_by_key:
                    del unmatched_scancode_by_key[matched_key]

                scancode_license = scancodes_by_key.get(matched_key)
                if TRACE:
                    print('scancode_license:', matching_key, scancode_license)

                scancode_updated, external_updated = merge_licenses(
                    scancode_license=scancode_license,
                    external_license=external_license,
                    updatable_attributes=external_source.updatable_attributes,
                    from_spdx=use_spdx_key,
                )

                if not scancode_updated and not external_updated:
                    if TRACE_DEEP: print('License attributes are identical:', matching_key)
                    same.add(matching_key)

                if scancode_updated:
                    if TRACE: print('ScanCode license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
                    for attrib, oldv, newv in scancode_updated:
                        if TRACE: print('  %(attrib)s: %(oldv)r -> %(newv)r' % locals())
                    # Track the update under `matched_key`, the key this
                    # ScanCode license was fetched with above. `matching_key`
                    # is by construction absent from `scancodes_by_key` here
                    # and would raise a KeyError in the final write loop.
                    updated_in_scancode.add(matched_key)

                if external_updated:
                    if TRACE: print('External license updated: SPDX:', use_spdx_key, matching_key, end='. Attributes: ')
                    for attrib, oldv, newv in external_updated:
                        if TRACE: print('  %(attrib)s: %(oldv)r -> %(newv)r' % locals())
                    updated_in_external.add(matching_key)

        else:
            # Create a new ScanCode license
            scancode_license = external_license.relocate(licensedcode.models.licenses_data_dir, matching_key)
            added_to_scancode.add(matching_key)
            scancodes_by_key[matching_key] = scancode_license
            if TRACE: print('External license key not in ScanCode:', matching_key, 'created in ScanCode.', 'SPDX:', use_spdx_key)

    # 3. For scancode licenses that were not matched to anything in external add them in external
    if TRACE:
        print()
        print('Processing unmatched_scancode_by_key.')
    for lkey, scancode_license in unmatched_scancode_by_key.items():
        # this key is explicitly excluded from being added to the external source
        if lkey in set(['here-proprietary']):
            continue
        if scancode_license.is_deprecated:
            continue
        external_license = scancode_license.relocate(external_source.new_dir)
        added_to_external.add(lkey)
        externals_by_key[lkey] = external_license
        if TRACE: print('ScanCode license key not in External:', lkey, 'created in External.')

    # finally write changes in place for updates and news
    for k in updated_in_scancode | added_to_scancode:
        lic = scancodes_by_key[k]
        models.update_ignorables(lic, verbose=False)
        lic.dump()

    for k in updated_in_external | added_to_external:
        lic = externals_by_key[k]
        # models.update_ignorables(lic, verbose=False)
        lic.dump()

    # TODO: at last: print report of incorrect OTHER licenses to submit
    # updates eg. make API calls to DejaCode to create or update
    # licenses and submit review request e.g. submit requests to SPDX
    # for addition
    for key in sorted(added_to_external):
        lic = externals_by_key[key]
        if not lic.owner:
            print('New external license without owner:', key)

    print()
    print('#####################################################')
    print('Same licenses:       ', len(same))
    print('Add to ScanCode:     ', len(added_to_scancode))
    print('Updated in ScanCode: ', len(updated_in_scancode))
    print('Added to External::  ', len(added_to_external))
    print('Updated in External: ', len(updated_in_external))
    print('#####################################################')

    return [externals_by_key[k] for k in added_to_external]