def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses directories.

    Use ``licenses_db`` when provided, otherwise load the licenses from
    ``licenses_data_dir``. Both data directories default to the standard
    licensedcode data locations.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.legalese import common_license_words
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import get_license_tokens
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir as default_licenses_dir
    from licensedcode.models import load_licenses
    from licensedcode.models import rules_data_dir as default_rules_dir

    if not licenses_data_dir:
        licenses_data_dir = default_licenses_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir
    if not licenses_db:
        licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

    # collect rules first, then the token sets the index needs
    rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
    spdx_key_tokens = set(get_all_spdx_key_tokens(licenses_db))
    known_license_tokens = set(get_license_tokens())

    return LicenseIndex(
        rules,
        _legalese=common_license_words,
        _spdx_tokens=spdx_key_tokens,
        _license_tokens=known_license_tokens,
    )
def all_rule_tokens():
    """
    Return a set of tuples of tokens, one for each existing and added rule.
    Used to avoid duplicates.
    """
    return {tuple(rule.tokens()) for rule in models.get_rules()}
def test_index_rules_with_key_phrases_and_without_are_duplicates(self):
    """Rules that differ only by key phrases must be rejected as duplicates."""
    rules_loc = self.get_test_loc('index/duplicate-key-phrases/rules')
    licenses_loc = self.get_test_loc('index/duplicate-key-phrases/licenses')
    duplicated_rules = models.get_rules(
        licenses_data_dir=licenses_loc, rules_data_dir=rules_loc)
    try:
        idx = index.LicenseIndex(duplicated_rules)
        # debug aid: if indexing did not fail, dump every indexed rule
        # with its token text before failing the test
        for rid, tids in enumerate(idx.tids_by_rid):
            token_text = " ".join(idx.tokens_by_tid[t] for t in tids)
            print(idx.rules_by_rid[rid].rid, repr(token_text))
        raise Exception("Exception not raised for duplicated rules")
    except index.DuplicateRuleError as e:
        assert str(e).startswith('Duplicate rules')
def test_spdx_match_contains_spdx_prefix(self):
    """An SPDX-license-identifier match must include the SPDX prefix text."""
    from licensedcode import index
    from licensedcode import tracing
    rules_loc = self.get_test_loc('spdx/rules-overlap/rules')
    licenses_loc = self.get_test_loc('spdx/rules-overlap/licenses')
    idx = index.LicenseIndex(models.get_rules(licenses_loc, rules_loc))
    querys = 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
    matches = idx.match(query_string=querys)
    assert len(matches) == 1
    qtext, itext = tracing.get_texts(matches[0])
    # the matched query text keeps the original SPDX prefix verbatim
    assert qtext == 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
    # the matched index text is the normalized, lowercased token stream
    assert itext == 'spdx license identifier bsd 3 clause no nuclear warranty'
def all_rule_by_tokens():
    """
    Return a mapping of {tuple of tokens: rule identifier}, with one item
    for each existing and added rule. Used to avoid duplicates.

    Raise an Exception (chained to the original error) pointing to the
    rule data and text files when a rule cannot be tokenized.
    """
    rule_tokens = {}
    for rule in models.get_rules():
        try:
            rule_tokens[tuple(rule.tokens())] = rule.identifier
        except Exception as e:
            # include clickable file:// URLs to ease debugging the bad rule
            df = f" file://{rule.data_file}"
            tf = f" file://{rule.text_file}"
            raise Exception(
                # fixed typos in the original message: "to to" and "rule::"
                f"Failed to get tokens from rule: {rule.identifier}\n"
                f"{df}\n{tf}") from e
    return rule_tokens
def get_or_build_index_from_cache(force_clear=False):
    """
    Return a LicenseIndex loaded from cache. If the index is stale or
    does not exist, build a new index and caches it. Clear or purge the
    LicenseMatch cache as needed.

    NOTE(review): ``force_clear`` is accepted but never read in this body,
    and no LicenseMatch-cache clearing happens here despite the docstring —
    confirm intent with the caller before relying on either.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    try:
        # acquire lock and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(index_lock_file, timeout=LICENSE_INDEX_LOCK_TIMEOUT):
            current_checksum = None
            # if we have a saved cached index
            if exists(tree_checksum_file) and exists(index_cache_file):
                # load saved tree_checksum and compare with current tree_checksum
                with open(tree_checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum()
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data:
                    # we load index from cache
                    with open(index_cache_file, 'rb') as ifc:
                        # Note: loads() is much (twice++???) faster than load()
                        idx = LicenseIndex.loads(ifc.read())
                    return idx
            # Here, the cache is not consistent with the latest code and data:
            # It is either stale or non-existing: we need to cleanup/regen
            # regen the index
            idx = LicenseIndex(get_rules())
            with open(index_cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # save the new checksums tree; reuse the checksum computed above
            # when available to avoid recomputing it
            with open(tree_checksum_file, 'wb') as ctcs:
                ctcs.write(current_checksum or tree_checksum())
            return idx
    except yg.lockfile.FileLockTimeout:
        # TODO: unable to lock in a nicer way
        raise
def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses directories.

    Use ``licenses_db`` when provided; otherwise build the licenses db
    from ``licenses_data_dir``. Directories default to the standard
    licensedcode data locations.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir as default_licenses_dir
    from licensedcode.models import rules_data_dir as default_rules_dir

    if not licenses_data_dir:
        licenses_data_dir = default_licenses_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    licenses = licenses_db or build_licenses_db(
        licenses_data_dir=licenses_data_dir)
    spdx_key_tokens = set(get_all_spdx_key_tokens(licenses))
    loaded_rules = get_rules(
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    return LicenseIndex(loaded_rules, _spdx_tokens=spdx_key_tokens)
def get_or_build_index_from_cache():
    """
    Return a LicenseIndex loaded from cache or build a new index and caches it.

    The cache is considered valid when the saved tree checksum matches the
    current one; otherwise the index is rebuilt and both the index and the
    checksum files are rewritten. All work happens under a global file lock.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    try:
        # acquire global lock file and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(index_lock_file, timeout=60 * 3):
            # if we have a saved cached index
            if exists(tree_checksum_file) and exists(index_cache_file):
                # load saved tree_checksum and compare with current tree_checksum
                with open(tree_checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum()
                # if this cached index is current for the code and data
                if current_checksum == existing_checksum:
                    # load index from cache
                    with open(index_cache_file, 'rb') as ifc:
                        idx = LicenseIndex.loads(ifc.read())
                    return idx
            # here the cache is stale or non-existing: we need to regen the index
            idx = LicenseIndex(get_rules())
            with open(index_cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # and save the checksums
            with open(tree_checksum_file, 'wb') as ctcs:
                ctcs.write(tree_checksum())
            return idx
    except yg.lockfile.FileLockTimeout:
        # handle unable to lock
        raise
def get_cached_index(cache_dir=scancode_cache_dir,
                     check_consistency=SCANCODE_DEV_MODE,
                     # used for testing only
                     timeout=LICENSE_INDEX_LOCK_TIMEOUT,
                     tree_base_dir=scancode_src_dir,
                     licenses_data_dir=None, rules_data_dir=None,):
    """
    Return a LicenseIndex: either load a cached index or build and cache the
    index.
    - If the cache does not exist, a new index is built and cached.
    - If `check_consistency` is True, the cache is checked for consistency
      and rebuilt if inconsistent or stale.
    - If `check_consistency` is False, the cache is NOT checked for consistency
      If the cache files exist but ARE stale, the cache WILL NOT be rebuilt
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as ldd
    from licensedcode.models import rules_data_dir as rdd
    licenses_data_dir = licenses_data_dir or ldd
    rules_data_dir = rules_data_dir or rdd
    lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir)
    has_cache = exists(cache_file)
    has_tree_checksum = exists(checksum_file)
    # bypass check if no consistency check is needed
    if has_cache and has_tree_checksum and not check_consistency:
        return load_index(cache_file)
    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(lock_file, timeout=timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                with open(checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return load_index(cache_file)
            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it
            rules = get_rules(
                licenses_data_dir=licenses_data_dir,
                rules_data_dir=rules_data_dir)
            license_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
            spdx_tokens = set(get_all_spdx_key_tokens(license_db))
            idx = LicenseIndex(rules, _spdx_tokens=spdx_tokens)
            with open(cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # save the new checksums tree; reuse the checksum computed above
            # when available to avoid recomputing it
            with open(checksum_file, 'wb') as ctcs:
                ctcs.write(current_checksum
                           or tree_checksum(tree_base_dir=tree_base_dir))
            return idx
    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
classes for basic and extended respectively. """ # TODO: add test to detect the standard notice?? cls = class_basic for i, rule in enumerate(rules): # only push 20 rules in the basic set if i > 20: cls = class_extended if rule.text_file and os.path.exists(rule.text_file): test_name = ('test_validate_detect_' + text.python_safe_name(rule.identifier)) test_method = make_validation_test(rule=rule, test_name=test_name) setattr(cls, test_name, test_method) class TestValidateLicenseBasic(unittest.TestCase): # Test functions are attached to this class at import time pytestmark = pytest.mark.scanslow class TestValidateLicenseExtended(unittest.TestCase): # Test functions are attached to this class at import time pytestmark = pytest.mark.scanvalidate _rules = sorted(models.get_rules(), key=lambda r: r.identifier) build_validation_tests(_rules, TestValidateLicenseBasic, TestValidateLicenseExtended) del _rules
def test_index_does_not_fail_on_rules_with_similar_normalized_names(self):
    """Building an index from rules with similar normalized names must succeed."""
    rules_loc = self.get_test_loc('index/similar_names/rules')
    licenses_loc = self.get_test_loc('index/similar_names/licenses')
    loaded_rules = models.get_rules(
        licenses_data_dir=licenses_loc, rules_data_dir=rules_loc)
    index.LicenseIndex(loaded_rules)
def get_or_build_index_through_cache(
        check_consistency=DEV_MODE,
        return_index=True,
        # used for testing only
        _tree_base_dir=src_dir,
        _tree_checksum_file=tree_checksum_file,
        _index_lock_file=index_lock_file,
        _index_cache_file=index_cache_file,
        _licenses_data_dir=None,
        _rules_data_dir=None,
        _timeout=LICENSE_INDEX_LOCK_TIMEOUT,
):
    """
    Check and build or rebuild the LicenseIndex cache.
    If the cache does not exist, a new index is built an cached.
    Return the LicenseIndex if return_index is True.

    If `check_consistency` is True, the cache is checked for consistency
    and rebuilt if inconsistent or stale.

    If `check_consistency` is False, the cache is NOT checked for consistency
    If the cache files exist but stale, the cache WILL NOT be rebuilt
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir
    from licensedcode.models import rules_data_dir
    _licenses_data_dir = _licenses_data_dir or licenses_data_dir
    _rules_data_dir = _rules_data_dir or rules_data_dir
    has_cache = exists(_index_cache_file)
    has_tree_checksum = exists(_tree_checksum_file)
    # bypass check if no consistency check is needed
    if has_cache and has_tree_checksum and not check_consistency:
        # `return_index and x` yields False when return_index is falsy
        return return_index and _load_index(_index_cache_file)
    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(_index_lock_file, timeout=_timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                with open(_tree_checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=_tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return return_index and _load_index(_index_cache_file)
            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it
            rules = get_rules(licenses_data_dir=_licenses_data_dir,
                              rules_data_dir=_rules_data_dir)
            idx = LicenseIndex(rules)
            with open(_index_cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # save the new checksums tree; reuse the checksum computed above
            # when available to avoid recomputing it
            with open(_tree_checksum_file, 'wb') as ctcs:
                ctcs.write(current_checksum
                           or tree_checksum(tree_base_dir=_tree_base_dir))
            return return_index and idx
    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
def get_or_build_index_through_cache(
        check_consistency=DEV_MODE,
        return_index=True,
        # used for testing only
        _tree_base_dir=src_dir,
        _tree_checksum_file=tree_checksum_file,
        _index_lock_file=index_lock_file,
        _index_cache_file=index_cache_file,
        _licenses_data_dir=None,
        _rules_data_dir=None,
        _timeout=LICENSE_INDEX_LOCK_TIMEOUT,
):
    """
    Check and build or rebuild the LicenseIndex cache.
    If the cache does not exist, a new index is built an cached.
    Return the LicenseIndex if return_index is True.

    If `check_consistency` is True, the cache is checked for consistency
    and rebuilt if inconsistent or stale.

    If `check_consistency` is False, the cache is NOT checked for consistency
    If the cache files exist but stale, the cache WILL NOT be rebuilt

    NOTE(review): this file contains another definition of the same name
    with an identical body; the later definition shadows the earlier one
    at import time — confirm which one is intended to survive.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import licenses_data_dir
    from licensedcode.models import rules_data_dir
    _licenses_data_dir = _licenses_data_dir or licenses_data_dir
    _rules_data_dir = _rules_data_dir or rules_data_dir
    has_cache = exists(_index_cache_file)
    has_tree_checksum = exists(_tree_checksum_file)
    # bypass check if no consistency check is needed
    if has_cache and has_tree_checksum and not check_consistency:
        # `return_index and x` yields False when return_index is falsy
        return return_index and _load_index(_index_cache_file)
    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(_index_lock_file, timeout=_timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                with open(_tree_checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=_tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return return_index and _load_index(_index_cache_file)
            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it
            rules = get_rules(
                licenses_data_dir=_licenses_data_dir,
                rules_data_dir=_rules_data_dir)
            idx = LicenseIndex(rules)
            with open(_index_cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # save the new checksums tree; reuse the checksum computed above
            # when available to avoid recomputing it
            with open(_tree_checksum_file, 'wb') as ctcs:
                ctcs.write(current_checksum
                           or tree_checksum(tree_base_dir=_tree_base_dir))
            return return_index and idx
    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
def get_cached_index(
    licensedcode_cache_dir=licensedcode_cache_dir,
    scancode_cache_dir=scancode_cache_dir,
    check_consistency=SCANCODE_DEV_MODE,
    # used for testing only
    timeout=LICENSE_INDEX_LOCK_TIMEOUT,
    tree_base_dir=scancode_src_dir,
    licenses_data_dir=None,
    rules_data_dir=None,
):
    """
    Return a LicenseIndex: either load a cached index or build and cache the
    index.
    - If the cache does not exist, a new index is built and cached.
    - If `check_consistency` is True, the cache is checked for consistency
      and rebuilt if inconsistent or stale.
    - If `check_consistency` is False, the cache is NOT checked for consistency
      If the cache files exist but ARE stale, the cache WILL NOT be rebuilt
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as ldd
    from licensedcode.models import rules_data_dir as rdd
    from scancode import lockfile

    licenses_data_dir = licenses_data_dir or ldd
    rules_data_dir = rules_data_dir or rdd

    lock_file, checksum_file, cache_file = get_license_cache_paths(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
    )

    has_cache = has_cache_index_file(cache_file)
    # bypass check if no consistency check is needed
    if has_cache and not check_consistency:
        try:
            return load_index(cache_file)
        except Exception as e:
            # work around some rare Windows quirks: fall through to a full
            # locked check-and-rebuild instead of failing outright
            import traceback
            print(
                'Inconsistent License index cache: checking and rebuilding index.'
            )
            print(str(e))
            print(traceback.format_exc())

    has_tree_checksum = os.path.exists(checksum_file)
    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with lockfile.FileLock(lock_file).locked(timeout=timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                with open(checksum_file) as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return load_index(cache_file)
            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it
            # FIXME: caching a pickle of this would be 10x times faster
            license_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
            rules = get_rules(licenses_data_dir=licenses_data_dir,
                              rules_data_dir=rules_data_dir)
            spdx_tokens = set(get_all_spdx_key_tokens(license_db))
            idx = LicenseIndex(rules, _spdx_tokens=spdx_tokens)
            with open(cache_file, 'wb') as ifc:
                idx.dump(ifc)
            # save the new tree checksum
            current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
            with open(checksum_file, 'w') as ctcs:
                ctcs.write(current_checksum)
            return idx
    except lockfile.LockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise