Exemplo n.º 1
0
def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses data directories.

    Use ``licenses_db`` when provided; otherwise load licenses from
    ``licenses_data_dir`` (defaulting to the standard data directory).
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import get_license_tokens
    from licensedcode.models import licenses_data_dir as default_lics_dir
    from licensedcode.models import rules_data_dir as default_rules_dir
    from licensedcode.models import load_licenses
    from licensedcode.legalese import common_license_words

    if not licenses_data_dir:
        licenses_data_dir = default_lics_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    if not licenses_db:
        licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

    # collect the token sets the index needs for SPDX and license matching
    spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
    license_tokens = set(get_license_tokens())

    return LicenseIndex(
        get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir),
        _legalese=common_license_words,
        _spdx_tokens=spdx_tokens,
        _license_tokens=license_tokens,
    )
Exemplo n.º 2
0
def all_rule_tokens():
    """
    Return the set of token tuples, one for every existing and added rule.
    Used to avoid duplicates.
    """
    # a set comprehension de-duplicates identical token sequences directly
    return {tuple(rule.tokens()) for rule in models.get_rules()}
Exemplo n.º 3
0
 def test_index_rules_with_key_phrases_and_without_are_duplicates(self):
     """Indexing rules that differ only by key phrases must raise DuplicateRuleError."""
     rules_dir = self.get_test_loc('index/duplicate-key-phrases/rules')
     lics_dir = self.get_test_loc('index/duplicate-key-phrases/licenses')
     loaded_rules = models.get_rules(licenses_data_dir=lics_dir, rules_data_dir=rules_dir)
     try:
         idx = index.LicenseIndex(loaded_rules)
         # dump the indexed rule texts to help debug a failure to detect dupes
         for rid, token_ids in enumerate(idx.tids_by_rid):
             rule_text = " ".join(idx.tokens_by_tid[tid] for tid in token_ids)
             print(idx.rules_by_rid[rid].rid, repr(rule_text))
         raise Exception("Exception not raised for duplicated rules")
     except index.DuplicateRuleError as e:
         assert str(e).startswith('Duplicate rules')
Exemplo n.º 4
0
 def test_spdx_match_contains_spdx_prefix(self):
     """A matched SPDX identifier line must keep its SPDX prefix in the query text."""
     from licensedcode import index
     from licensedcode import tracing
     rule_dir = self.get_test_loc('spdx/rules-overlap/rules')
     lics_dir = self.get_test_loc('spdx/rules-overlap/licenses')
     idx = index.LicenseIndex(models.get_rules(lics_dir, rule_dir))
     querys = 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
     matches = idx.match(query_string=querys)
     assert len(matches) == 1
     qtext, itext = tracing.get_texts(matches[0])
     # the query-side text preserves the original prefix and casing
     assert qtext == 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
     # the index-side text is normalized to lowercase tokens
     assert itext == 'spdx license identifier bsd 3 clause no nuclear warranty'
Exemplo n.º 5
0
def all_rule_by_tokens():
    """
    Return a mapping of {tuple of tokens: rule identifier} with one item for
    each existing and added rule. Used to avoid duplicates.

    Raise an Exception pointing to the rule data and text files when
    tokenizing a rule fails.
    """
    rule_tokens = {}
    for rule in models.get_rules():
        try:
            rule_tokens[tuple(rule.tokens())] = rule.identifier
        except Exception as e:
            # include clickable file:// URLs to ease debugging the faulty rule
            df = f"  file://{rule.data_file}"
            tf = f"  file://{rule.text_file}"
            # fixed message: was "Failed to to ... rule::" (typo and doubled colon)
            raise Exception(
                f"Failed to get tokens from rule: {rule.identifier}\n"
                f"{df}\n{tf}") from e
    return rule_tokens
Exemplo n.º 6
0
def get_or_build_index_from_cache(force_clear=False):
    """
    Return a LicenseIndex loaded from cache. If the index is stale or does
    not exist, build a new index and cache it. Clear or purge the
    LicenseMatch cache as needed.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    try:
        # block until we hold the lock, or die at timeout
        with yg.lockfile.FileLock(index_lock_file, timeout=LICENSE_INDEX_LOCK_TIMEOUT):
            current_checksum = None
            if exists(tree_checksum_file) and exists(index_cache_file):
                # a cached index exists: compare its saved tree checksum
                # with the current one to decide if it is still fresh
                with open(tree_checksum_file, 'rb') as saved:
                    saved_checksum = saved.read()
                current_checksum = tree_checksum()
                if current_checksum == saved_checksum:
                    # cache is in sync with the latest code and data: use it
                    with open(index_cache_file, 'rb') as cached:
                        # loads() on a pre-read buffer is faster than load()
                        return LicenseIndex.loads(cached.read())

            # cache is stale or missing: regenerate the index and re-cache it
            idx = LicenseIndex(get_rules())
            with open(index_cache_file, 'wb') as cached:
                cached.write(idx.dumps())

            # persist the checksum matching the freshly built index
            with open(tree_checksum_file, 'wb') as saved:
                saved.write(current_checksum or tree_checksum())

            return idx

    except yg.lockfile.FileLockTimeout:
        # TODO: unable to lock in a nicer way
        raise
def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses data directories.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as default_lics_dir
    from licensedcode.models import rules_data_dir as default_rules_dir

    if not licenses_data_dir:
        licenses_data_dir = default_lics_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    # use the provided licenses db, or load one from the data directory
    licenses = licenses_db or build_licenses_db(
        licenses_data_dir=licenses_data_dir)

    rules = get_rules(
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir)

    spdx_tokens = set(get_all_spdx_key_tokens(licenses))
    return LicenseIndex(rules, _spdx_tokens=spdx_tokens)
Exemplo n.º 8
0
def get_or_build_index_from_cache():
    """
    Return a LicenseIndex loaded from cache, or build a new index and cache it.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    try:
        # hold the global lock file, waiting up to three minutes for it
        with yg.lockfile.FileLock(index_lock_file, timeout=60 * 3):
            if exists(tree_checksum_file) and exists(index_cache_file):
                # a cached index exists: is it current for the code and data?
                with open(tree_checksum_file, 'rb') as saved:
                    saved_checksum = saved.read()
                current_checksum = tree_checksum()
                if current_checksum == saved_checksum:
                    # cache is fresh: load and return it
                    with open(index_cache_file, 'rb') as cached:
                        return LicenseIndex.loads(cached.read())

            # cache is stale or missing: regenerate the index
            idx = LicenseIndex(get_rules())
            with open(index_cache_file, 'wb') as cached:
                cached.write(idx.dumps())

            # and save the matching checksum
            with open(tree_checksum_file, 'wb') as saved:
                saved.write(tree_checksum())

            return idx

    except yg.lockfile.FileLockTimeout:
        # handle unable to lock
        raise
Exemplo n.º 9
0
def get_cached_index(cache_dir=scancode_cache_dir,
                     check_consistency=SCANCODE_DEV_MODE,
                     # used for testing only
                     timeout=LICENSE_INDEX_LOCK_TIMEOUT,
                     tree_base_dir=scancode_src_dir,
                     licenses_data_dir=None, rules_data_dir=None,):
    """
    Return a LicenseIndex: either load a cached index, or build and cache
    the index.
    - If the cache does not exist, build a new index and cache it.
    - If ``check_consistency`` is True, validate the cache and rebuild it
      when inconsistent or stale.
    - If ``check_consistency`` is False, the cache is NOT validated: cache
      files that exist but ARE stale are NOT rebuilt.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as default_lics_dir
    from licensedcode.models import rules_data_dir as default_rules_dir

    if not licenses_data_dir:
        licenses_data_dir = default_lics_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir)

    has_cache = exists(cache_file)
    has_tree_checksum = exists(checksum_file)

    # fast path: trust the cache outright when no consistency check is wanted
    if has_cache and has_tree_checksum and not check_consistency:
        return load_index(cache_file)

    # no cache, or a validity check requested: lock, then check and
    # build or rebuild as needed
    try:
        # wait until timeout to acquire the lock, or die
        with yg.lockfile.FileLock(lock_file, timeout=timeout):
            current_checksum = None
            if has_cache and has_tree_checksum:
                # decide staleness by comparing the saved tree checksum
                # with a freshly computed one
                with open(checksum_file, 'rb') as saved:
                    saved_checksum = saved.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == saved_checksum:
                    # cache is consistent with the latest code and data
                    return load_index(cache_file)

            # cache is stale or missing: rebuild the index and cache it
            rules = get_rules(
                licenses_data_dir=licenses_data_dir,
                rules_data_dir=rules_data_dir)

            license_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
            idx = LicenseIndex(
                rules, _spdx_tokens=set(get_all_spdx_key_tokens(license_db)))

            with open(cache_file, 'wb') as cached:
                cached.write(idx.dumps())

            # persist the checksum that matches the new cache
            with open(checksum_file, 'wb') as saved:
                saved.write(current_checksum
                            or tree_checksum(tree_base_dir=tree_base_dir))

            return idx

    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
Exemplo n.º 10
0
    classes for basic and extended respectively.
    """
    # TODO: add test to detect the standard notice??

    cls = class_basic
    for i, rule in enumerate(rules):
        # only push 20 rules in the basic set
        if i > 20:
            cls = class_extended
        if rule.text_file and os.path.exists(rule.text_file):
            test_name = ('test_validate_detect_' +
                         text.python_safe_name(rule.identifier))
            test_method = make_validation_test(rule=rule, test_name=test_name)
            setattr(cls, test_name, test_method)


class TestValidateLicenseBasic(unittest.TestCase):
    """Container for per-rule validation tests generated at import time."""
    # Test functions are attached to this class at import time
    # (see the module-level build_validation_tests call below).
    # marked "scanslow" so they can be deselected in quick test runs
    pytestmark = pytest.mark.scanslow


class TestValidateLicenseExtended(unittest.TestCase):
    """Container for extended per-rule validation tests generated at import time."""
    # Test functions are attached to this class at import time
    # (see the module-level build_validation_tests call below).
    # marked "scanvalidate" so they only run in full validation runs
    pytestmark = pytest.mark.scanvalidate


# Attach one generated validation test per rule to the classes above,
# iterating rules in a stable, identifier-sorted order.
build_validation_tests(
    sorted(models.get_rules(), key=lambda r: r.identifier),
    TestValidateLicenseBasic,
    TestValidateLicenseExtended,
)
Exemplo n.º 11
0
 def test_index_does_not_fail_on_rules_with_similar_normalized_names(self):
     """Building an index must not raise when rule names normalize similarly."""
     rule_dir = self.get_test_loc('index/similar_names/rules')
     lics_dir = self.get_test_loc('index/similar_names/licenses')
     # indexing these rules is itself the assertion: it must not raise
     index.LicenseIndex(
         models.get_rules(licenses_data_dir=lics_dir, rules_data_dir=rule_dir))
Exemplo n.º 12
0
def get_or_build_index_through_cache(
    check_consistency=DEV_MODE,
    return_index=True,
    # used for testing only
    _tree_base_dir=src_dir,
    _tree_checksum_file=tree_checksum_file,
    _index_lock_file=index_lock_file,
    _index_cache_file=index_cache_file,
    _licenses_data_dir=None,
    _rules_data_dir=None,
    _timeout=LICENSE_INDEX_LOCK_TIMEOUT,
):
    """
    Check and build or rebuild the LicenseIndex cache, building a new index
    and caching it when no cache exists. Return the LicenseIndex when
    ``return_index`` is True.

    When ``check_consistency`` is True, validate the cache against the
    current code/data tree checksum and rebuild it when inconsistent or
    stale. When False, the cache is NOT validated: existing but stale cache
    files are NOT rebuilt.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir
    from licensedcode.models import rules_data_dir
    if not _licenses_data_dir:
        _licenses_data_dir = licenses_data_dir
    if not _rules_data_dir:
        _rules_data_dir = rules_data_dir

    cache_present = exists(_index_cache_file)
    checksum_present = exists(_tree_checksum_file)

    # fast path: trust an existing cache when no consistency check is wanted
    if cache_present and checksum_present and not check_consistency:
        return return_index and _load_index(_index_cache_file)

    # no cache, or a validity check requested: lock, then check and
    # build or rebuild as needed
    try:
        # wait up to the timeout to acquire the lock, or die
        with yg.lockfile.FileLock(_index_lock_file, timeout=_timeout):
            current_checksum = None
            if cache_present and checksum_present:
                # decide staleness by comparing the saved tree checksum
                # against a freshly computed one
                with open(_tree_checksum_file, 'rb') as saved:
                    saved_checksum = saved.read()
                current_checksum = tree_checksum(tree_base_dir=_tree_base_dir)
                if current_checksum == saved_checksum:
                    # cache is consistent with the latest code and data
                    return return_index and _load_index(_index_cache_file)

            # cache is stale or missing: rebuild the index, then cache it
            idx = LicenseIndex(get_rules(
                licenses_data_dir=_licenses_data_dir,
                rules_data_dir=_rules_data_dir))
            with open(_index_cache_file, 'wb') as cached:
                cached.write(idx.dumps())

            # persist the checksum that matches this new cache
            with open(_tree_checksum_file, 'wb') as saved:
                saved.write(current_checksum
                            or tree_checksum(tree_base_dir=_tree_base_dir))

            return return_index and idx

    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
Exemplo n.º 13
0
def get_or_build_index_through_cache(
        check_consistency=DEV_MODE,
        return_index=True,
        # used for testing only
        _tree_base_dir=src_dir,
        _tree_checksum_file=tree_checksum_file,
        _index_lock_file=index_lock_file,
        _index_cache_file=index_cache_file,
        _licenses_data_dir=None,
        _rules_data_dir=None,
        _timeout=LICENSE_INDEX_LOCK_TIMEOUT,
        ):
    """
    Check and (re)build the LicenseIndex cache, creating a new index and
    caching it when none exists. Return the LicenseIndex when
    ``return_index`` is True.

    With ``check_consistency`` True, the cache is validated against the
    current code/data tree checksum and rebuilt when inconsistent or stale.
    With ``check_consistency`` False, the cache is NOT validated and a
    stale cache is NOT rebuilt.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir
    from licensedcode.models import rules_data_dir
    if not _licenses_data_dir:
        _licenses_data_dir = licenses_data_dir
    if not _rules_data_dir:
        _rules_data_dir = rules_data_dir

    cached_index_present = exists(_index_cache_file)
    tree_checksum_present = exists(_tree_checksum_file)

    # no consistency check wanted: trust any existing cache outright
    if cached_index_present and tree_checksum_present and not check_consistency:
        return return_index and _load_index(_index_cache_file)

    # otherwise lock, then verify, and build or rebuild as needed
    try:
        # wait up to the timeout for the lock, or die
        with yg.lockfile.FileLock(_index_lock_file, timeout=_timeout):
            current_checksum = None
            if cached_index_present and tree_checksum_present:
                # compare the stored tree checksum with a fresh one to
                # determine whether the cached index is still valid
                with open(_tree_checksum_file, 'rb') as stored:
                    stored_checksum = stored.read()
                current_checksum = tree_checksum(tree_base_dir=_tree_base_dir)
                if current_checksum == stored_checksum:
                    # the cached index matches the latest code and data
                    return return_index and _load_index(_index_cache_file)

            # the cache is stale or missing: rebuild and re-cache the index
            rebuilt = LicenseIndex(get_rules(
                licenses_data_dir=_licenses_data_dir,
                rules_data_dir=_rules_data_dir))
            with open(_index_cache_file, 'wb') as cache_out:
                cache_out.write(rebuilt.dumps())

            # store the checksum that corresponds to this new cache
            with open(_tree_checksum_file, 'wb') as stored:
                stored.write(
                    current_checksum
                    or tree_checksum(tree_base_dir=_tree_base_dir))

            return return_index and rebuilt

    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
Exemplo n.º 14
0
def get_cached_index(
    licensedcode_cache_dir=licensedcode_cache_dir,
    scancode_cache_dir=scancode_cache_dir,
    check_consistency=SCANCODE_DEV_MODE,
    # used for testing only
    timeout=LICENSE_INDEX_LOCK_TIMEOUT,
    tree_base_dir=scancode_src_dir,
    licenses_data_dir=None,
    rules_data_dir=None,
):
    """
    Return a LicenseIndex: either load a cached index, or build and cache
    the index.
    - If the cache does not exist, build a new index and cache it.
    - If ``check_consistency`` is True, validate the cache against the
      current code/data tree and rebuild it when inconsistent or stale.
    - If ``check_consistency`` is False, the cache is NOT validated: cache
      files that exist but ARE stale are NOT rebuilt.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as default_lics_dir
    from licensedcode.models import rules_data_dir as default_rules_dir

    from scancode import lockfile

    if not licenses_data_dir:
        licenses_data_dir = default_lics_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    lock_file, checksum_file, cache_file = get_license_cache_paths(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
    )

    has_cache = has_cache_index_file(cache_file)
    if has_cache and not check_consistency:
        # fast path: trust the cache when no consistency check is requested
        try:
            return load_index(cache_file)
        except Exception as e:
            # work around some rare Windows quirks: report, then fall
            # through to the full check-and-rebuild path below
            import traceback
            print(
                'Inconsistent License index cache: checking and rebuilding index.'
            )
            print(str(e))
            print(traceback.format_exc())

    has_tree_checksum = os.path.exists(checksum_file)

    # no cache, or a validity check requested: lock, then check and
    # build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with lockfile.FileLock(lock_file).locked(timeout=timeout):
            current_checksum = None
            if has_cache and has_tree_checksum:
                # decide staleness by comparing the saved tree checksum
                # (stored as text) with a freshly computed one
                with open(checksum_file) as saved:
                    saved_checksum = saved.read()

                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == saved_checksum:
                    # cache is consistent with the latest code and data
                    return load_index(cache_file)

            # cache is stale or missing: rebuild the index and cache it

            # FIXME: caching a pickle of this would be 10x times faster
            licenses = get_licenses_db(licenses_data_dir=licenses_data_dir)

            rules = get_rules(
                licenses_data_dir=licenses_data_dir,
                rules_data_dir=rules_data_dir)

            idx = LicenseIndex(
                rules,
                _spdx_tokens=set(get_all_spdx_key_tokens(licenses)),
            )

            with open(cache_file, 'wb') as cached:
                idx.dump(cached)

            # save the new tree checksum (text mode, matching the read above)
            with open(checksum_file, 'w') as saved:
                saved.write(tree_checksum(tree_base_dir=tree_base_dir))

            return idx

    except lockfile.LockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise