def test_query_from_binary_lkms_3(self):
    # Build a Query from an LKM kernel module binary and verify the first
    # query run carries the expected license mention.
    location = self.get_test_loc('query/wlan_xauth.ko')
    idx = cache.get_index()
    qry = Query(location, idx=idx)
    assert len(qry.query_runs) < 900
    first_run = qry.query_runs[0]
    run_text = u' '.join(idx.tokens_by_tid[t] for t in first_run.matchable_tokens())
    assert 'license dual bsd gpl' in run_text
def test_match_license_performance_profiling_on_full_index_match_hash(self):
    # pre-index: we profile only the detection, not the indexing
    idx = cache.get_index()
    stats_file = 'license_match_chunk_full_index_profile_log.txt'
    test_locations = [self.get_test_loc('perf/cc-by-nc-sa-3.0.SPDX')]
    self.profile_match(idx, test_locations, stats_file)
def test_score_is_not_100_for_exact_match_with_extra_words(self):
    idx = cache.get_index()
    test_loc = self.get_test_loc('detect/score/test.txt')
    matches = idx.match(location=test_loc)
    assert len(matches) == 1
    # extra words around an otherwise exact match push the score just below 100
    assert 99 < matches[0].score() < 100
def get_licenses(location, min_score=0, include_text=False, diag=False,
                 license_url_template=DEJACODE_LICENSE_URL):
    """
    Yield mappings of license data detected in the file at `location`.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches are returned; with any other value, matches
    that score below it are not returned.

    If `include_text` is True, the matched text is included in the returned
    data. If `diag` is True, additional match details are returned under the
    matched_rule key of the returned mapping.
    """
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db

    idx = get_index()
    licenses = get_licenses_db()

    for match in idx.match(location=location, min_score=min_score):
        if include_text:
            matched_text = match.matched_text(whole_lines=False)
        # one result mapping is yielded per license key of the matched rule
        for license_key in match.rule.licenses:
            lic = licenses.get(license_key)
            result = OrderedDict()
            result['key'] = lic.key
            result['score'] = match.score()
            result['short_name'] = lic.short_name
            result['category'] = lic.category
            result['owner'] = lic.owner
            result['homepage_url'] = lic.homepage_url
            result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
            result['reference_url'] = license_url_template.format(lic.key)
            spdx_key = lic.spdx_license_key
            result['spdx_license_key'] = spdx_key
            if spdx_key:
                # the SPDX URL does not use any "+" suffix of the key
                spdx_key = lic.spdx_license_key.rstrip('+')
                spdx_url = SPDX_LICENSE_URL.format(spdx_key)
            else:
                spdx_url = ''
            result['spdx_url'] = spdx_url
            result['start_line'] = match.start_line
            result['end_line'] = match.end_line
            matched_rule = result['matched_rule'] = OrderedDict()
            matched_rule['identifier'] = match.rule.identifier
            matched_rule['license_choice'] = match.rule.license_choice
            matched_rule['licenses'] = match.rule.licenses
            # FIXME: for sanity these should always be included???
            if diag:
                matched_rule['matcher'] = match.matcher
                matched_rule['rule_length'] = match.rule.length
                matched_rule['matched_length'] = match.ilen()
                matched_rule['match_coverage'] = match.coverage()
                matched_rule['rule_relevance'] = match.rule.relevance
            # FIXME: for sanity this should always be included?????
            if include_text:
                result['matched_text'] = matched_text
            yield result
def test_match_does_not_detect_spurrious_short_apache_rule(self):
    # a mere mention of "Apache log4j" in an HTML page title must not be
    # detected as an Apache license
    idx = cache.get_index()
    querys = u'''
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        <title>Apache log4j 1.2 - Continuous Integration</title>
    '''
    matches = idx.match(query_string=querys)
    assert [] == matches
def test_query_from_binary_lkms_2(self):
    location = self.get_test_loc('query/eeepc_acpi.ko')
    idx = cache.get_index()
    qry = Query(location, idx=idx)
    assert len(qry.query_runs) < 500
    # at least one of runs 5-9 must carry a GPL license mention
    candidate_runs = qry.query_runs[5:10]
    assert any(
        'license gpl' in u' '.join(
            idx.tokens_by_tid[t] for t in qr.matchable_tokens())
        for qr in candidate_runs
    )
def closure_test_function(*args, **kwargs):
    idx = cache.get_index()
    matches = idx.match(
        location=test_file,
        min_score=min_score,
        # if negative, do not detect negative rules when testing negative rules
        detect_negative=detect_negative)
    if not matches:
        matches = []

    # TODO: we should expect matches properly, not with a grab bag of flat license keys
    # flattened list of all detected license keys across all matches.
    detected_licenses = functional.flatten(
        map(unicode, match.rule.licenses) for match in matches)

    try:
        if not detect_negative:
            # we skipped negative detection for a negative rule
            # we just want to ensure that the rule was matched proper
            assert matches and not expected_licenses and not detected_licenses
        else:
            assert expected_licenses == detected_licenses
    except:
        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        match_failure_trace = []
        if trace_text:
            for match in matches:
                qtext, itext = get_texts(match, location=test_file, idx=idx)
                rule_text_file = match.rule.text_file
                rule_data_file = match.rule.data_file
                match_failure_trace.extend([
                    '', '', '======= MATCH ====', match,
                    '======= Matched Query Text for:',
                    'file://{test_file}'.format(**locals())
                ])
                if test_data_file:
                    match_failure_trace.append(
                        'file://{test_data_file}'.format(**locals()))
                match_failure_trace.append(qtext.splitlines())
                match_failure_trace.extend([
                    '',
                    '======= Matched Rule Text for:'
                    'file://{rule_text_file}'.format(**locals()),
                    'file://{rule_data_file}'.format(**locals()),
                    itext.splitlines(),
                ])
        # this assert will always fail and provide a detailed failure trace
        assert expected_licenses == detected_licenses + [
            test_name, 'test file: file://' + test_file
        ] + match_failure_trace
def find_osi_map(license: "OSI License Object"):
    """
    Return the Scanengine license expression mapped to the OSI License
    `license` mapping, or None when no match is found.
    """
    idx = get_index()
    osi_key = license['id']
    # search for a Scanengine License matching the OSI key
    candidates = list(idx.match(query_string=osi_key))
    if candidates:
        return candidates[0].rule.license_expression
    return None
def test_match_in_binary_lkms_1(self):
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/ath_pci.ko')
    matches = idx.match(location=qloc)
    assert len(matches) == 1
    match = matches[0]
    assert match.rule.licenses == ['bsd-new', 'gpl-2.0']
    qtext, itext = get_texts(match, location=qloc, idx=idx)
    # query-side and index-side matched texts are the same notice
    assert qtext == 'license Dual BSD GPL'
    assert itext == 'license Dual BSD GPL'
def test_match_in_binary_lkms_1(self):
    index = cache.get_index()
    location = self.get_test_loc('positions/ath_pci.ko')
    detections = index.match(location=location)
    assert 1 == len(detections)
    detection = detections[0]
    assert ['bsd-new', 'gpl-2.0'] == detection.rule.licenses
    qtext, itext = get_texts(detection, location=location, idx=index)
    # the same short dual-license notice on both sides of the match
    assert 'license Dual BSD GPL' == qtext
    assert 'license Dual BSD GPL' == itext
def test_all_spdx_tokens_exists_in_dictionary(self):
    idx = cache.get_index()
    dic = idx.dictionary
    licenses = cache.get_licenses_db()
    tokens = set(models.get_all_spdx_key_tokens(licenses))
    keys = set(idx.dictionary)
    try:
        assert tokens.issubset(keys)
    except:
        # on failure, re-raise as a KeyError naming the first missing token
        for token in tokens:
            dic[token]
def test_Query_tokens_by_line_behaves_the_same_on_python_2_and_python_3(
    self):
    location = self.get_test_loc('query/query_lines/yahoo-eula.txt')
    idx = cache.get_index()
    qry = Query(location, idx=idx)
    token_lines = list(qry.tokens_by_line())
    # inject the actual token string for sanity
    tbt = idx.tokens_by_tid
    results = [
        [[tid, tid and tbt[tid] or tid] for tid in line]
        for line in token_lines
    ]
    expected = self.get_test_loc('query/query_lines/yahoo-eula.txt.json')
    check_result_equals_expected_json(results, expected, regen=False)
def rule_exists(text):
    """
    Return the identifier of the rule that matches `text` as a single exact
    (hash) match, or False otherwise.

    Note: the original code fell through and returned None implicitly for a
    single non-hash match, which contradicted the documented "False
    otherwise" contract; this now returns False explicitly in that case
    (backward-compatible: both are falsy).
    """
    idx = cache.get_index()
    matches = idx.match(query_string=text)
    # exactly one match is required for the text to be an existing rule
    if len(matches) != 1:
        return False
    match = matches[0]
    # only a hash match is a whole-text exact match
    if match.matcher == match_hash.MATCH_HASH:
        return match.rule.identifier
    return False
def test_match_in_binary_lkms_2(self):
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/eeepc_acpi.ko')
    matches = idx.match(location=qloc)
    assert len(matches) == 1
    match = matches[0]
    assert match.rule.licenses == ['gpl-1.0-plus']
    assert match.ispan == Span(0, 1)
    qtext, itext = get_texts(match, location=qloc, idx=idx)
    # case differs between the query text and the rule text
    assert qtext == 'license GPL'
    assert itext == 'License GPL'
def test_query_from_binary_lkms_2(self):
    location = self.get_test_loc('query/eeepc_acpi.ko')
    idx = cache.get_index()
    qry = Query(location, idx=idx)
    assert len(qry.query_runs) < 500
    # at least one of the first ten runs must mention a GPL license
    assert any(
        'license gpl' in ' '.join(
            idx.tokens_by_tid[t] for t in qr.matchable_tokens())
        for qr in qry.query_runs[:10]
    )
def test_match_in_binary_lkms_2(self):
    index = cache.get_index()
    location = self.get_test_loc('positions/eeepc_acpi.ko')
    detections = index.match(location=location)
    assert 1 == len(detections)
    detection = detections[0]
    assert ['gpl'] == detection.rule.licenses
    assert Span(0, 1) == detection.ispan
    qtext, itext = get_texts(detection, location=location, idx=index)
    # the query text and the rule text differ only in case
    assert 'license GPL' == qtext
    assert 'License GPL' == itext
def test_query_run_for_text_with_long_lines(self):
    from typecode.contenttype import get_type

    location1 = self.get_test_loc('query/long_lines.txt')
    location2 = self.get_test_loc('query/not_long_lines.txt')
    # sanity-check that the fixtures are classified as expected
    assert get_type(location1).is_text_with_long_lines
    assert not get_type(location2).is_text_with_long_lines
    idx = cache.get_index()
    # long-lines texts are broken in fewer, larger query runs
    assert len(Query(location1, idx=idx).query_runs) == 3
    assert len(Query(location2, idx=idx).query_runs) == 14
def check_special_rule_cannot_be_detected(rule):
    idx = cache.get_index()
    results = idx.match(location=rule.text_file)
    if results:
        data_file = rule.data_file or rule.text_file.replace('.LICENSE', '.yml')
        # On failure, compare against extra details such as clickable
        # text_file paths to ease debugging
        results = (results, f'file://{data_file}', f'file://{rule.text_file}')
    # this assert will always fail and provide a more detailed failure trace
    assert results == []
def get_licenses(location, min_score=0, include_text=False,
                 license_text_diagnostics=False,
                 license_url_template=DEJACODE_LICENSE_URL,
                 deadline=sys.maxsize, **kwargs):
    """
    Return a mapping of license detection data for the file at `location`
    with two keys:
    - 'licenses': a list of mappings of license information.
    - 'license_expressions': a list of license expression strings.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means all license matches are returned; otherwise, matches scoring below
    `min_score` are not returned.

    If `include_text` is True, matched text is included in the returned
    `licenses` data.
    """
    from licensedcode import cache

    idx = cache.get_index()
    detected_licenses = []
    detected_expressions = []
    matches = idx.match(
        location=location, min_score=min_score, deadline=deadline, **kwargs)

    for match in matches:
        matched_text = None
        # TODO: handle whole lines with the case of very long lines
        if include_text:
            if license_text_diagnostics:
                matched_text = match.matched_text(whole_lines=False)
            else:
                # plain text: disable highlighting markers entirely
                highlight_not_matched = highlight_matched = u'%s'
                matched_text = match.matched_text(
                    highlight_matched=highlight_matched,
                    highlight_not_matched=highlight_not_matched,
                    whole_lines=True)

        detected_expressions.append(match.rule.license_expression)
        detected_licenses.extend(
            _licenses_data_from_match(match, matched_text, license_url_template)
        )

    return OrderedDict([
        ('licenses', detected_licenses),
        ('license_expressions', detected_expressions),
    ])
def get_license_matches(location=None, query_string=None, min_score=0):
    """
    Return detected license matches in the file at `location` or in the
    `query_string` string.

    `min_score` is a minimum score threshold for a license match, from 0 to
    100 percent: 100 is a high confidence match and 0 a low confidence match.
    A `min_score` of 0 means all matches are returned.

    The minimum length for an approximate match is four tokens.
    Spurious matches are always filtered.
    """
    return get_index().match(
        location=location, query_string=query_string, min_score=min_score)
def test_match_has_correct_positions_basic(self):
    idx = cache.get_index()
    # three identical notices, one per line
    querys = u'''Licensed under the GNU General Public License (GPL).
                 Licensed under the GNU General Public License (GPL).
                 Licensed under the GNU General Public License (GPL).'''
    matches = idx.match(query_string=querys)
    rule = [r for r in idx.rules_by_rid if r.identifier == 'gpl_69.RULE'][0]
    # each 8-token notice matches the whole rule ispan on successive lines
    m1 = LicenseMatch(rule=rule, qspan=Span(0, 7), ispan=Span(0, 7), start_line=1, end_line=1)
    m2 = LicenseMatch(rule=rule, qspan=Span(8, 15), ispan=Span(0, 7), start_line=2, end_line=2)
    m3 = LicenseMatch(rule=rule, qspan=Span(16, 23), ispan=Span(0, 7), start_line=3, end_line=3)
    assert [m1, m2, m3] == matches
def test_match_has_correct_positions_basic(self):
    idx = cache.get_index()
    querys = u'''Licensed under the GNU General Public License (GPL).
                 Licensed under the GNU General Public License (GPL).
                 Licensed under the GNU General Public License (GPL).'''
    matches = idx.match(query_string=querys)
    gpl_rule = [r for r in idx.rules_by_rid if r.identifier == 'gpl_69.RULE'][0]
    expected = [
        LicenseMatch(rule=gpl_rule, qspan=Span(0, 7), ispan=Span(0, 7),
                     start_line=1, end_line=1),
        LicenseMatch(rule=gpl_rule, qspan=Span(8, 15), ispan=Span(0, 7),
                     start_line=2, end_line=2),
        LicenseMatch(rule=gpl_rule, qspan=Span(16, 23), ispan=Span(0, 7),
                     start_line=3, end_line=3),
    ]
    assert expected == matches
def test_match_in_binary_lkms_3(self):
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/wlan_xauth.ko')
    matches = idx.match(location=qloc)
    assert len(matches) == 1
    match = matches[0]
    assert match.rule.licenses == ['bsd-new', 'gpl-2.0']
    # full coverage of the rule but a low score in the binary context
    assert match.coverage() == 100
    assert match.score() == 20
    qtext, itext = get_texts(match, location=qloc, idx=idx)
    assert qtext == 'license Dual BSD GPL'
    assert itext == 'license Dual BSD GPL'
    assert match.ispan == Span(0, 3)
def test_method(self):
    """
    Check the Query spdx_lines against the expected JSON file, regenerating
    the expected file when `regen` is True.
    """
    idx = cache.get_index()
    qry = Query(location=test_loc, idx=idx)
    results = [list(l) for l in qry.spdx_lines]
    if regen:
        # json.dump writes str on Python 3: the file must be opened in text
        # mode ('wb' raises "a bytes-like object is required")
        with open(expected_loc, 'w') as ef:
            json.dump(results, ef, indent=2)
        expected = results
    else:
        with open(expected_loc, 'rb') as ef:
            expected = json.load(ef, object_pairs_hook=OrderedDict)
    assert expected == results
def test_method(self):
    # compare the Query spdx_lines against the expected JSON, optionally
    # regenerating the expected file first
    idx = cache.get_index()
    qry = Query(location=test_loc, idx=idx)
    results = [list(l) for l in qry.spdx_lines]
    if regen:
        with open(expected_loc, 'w') as ef:
            json.dump(results, ef, indent=2)
        expected = results
    else:
        with open(expected_loc) as ef:
            expected = json.load(ef)
    assert results == expected
def test_match_in_binary_lkms_3(self):
    index = cache.get_index()
    location = self.get_test_loc('positions/wlan_xauth.ko')
    detections = index.match(location=location)
    assert 1 == len(detections)
    detection = detections[0]
    assert ['bsd-new', 'gpl-2.0'] == detection.rule.licenses
    # the rule is fully covered, yet the binary context keeps the score low
    assert 100 == detection.coverage()
    assert 20 == detection.score()
    qtext, itext = get_texts(detection, location=location, idx=index)
    assert 'license Dual BSD GPL' == qtext
    assert 'license Dual BSD GPL' == itext
    assert Span(0, 3) == detection.ispan
def check_license(location=None, query_string=None, expected=(), test_data_dir=None):
    # direct query-string check: flatten detected license keys and compare
    if query_string:
        idx = cache.get_index()
        matches = idx.match(location=location, query_string=query_string)
        results = functional.flatten(
            map(unicode, match.rule.licenses) for match in matches)
        assert expected == results
        return

    # file check: build and run a full license test function
    test_name = python_safe_name('test_' + location.replace(test_data_dir, ''))
    tester = make_license_test_function(
        expected_licenses=expected,
        test_file=location,
        test_data_file=None,
        test_name=test_name,
        trace_text=True)
    tester()
def check_position(self, test_path, expected, with_span=True):
    """
    Check license detection in file or folder against expected result.
    Expected is a list of (license, lines span, qspan span) tuples.
    """
    test_location = self.get_test_loc(test_path)
    # FULL INDEX!!
    idx = cache.get_index()
    results = [
        (detected, match.lines(), with_span and match.qspan or None)
        for match in idx.match(test_location)
        for detected in match.rule.license_keys()
    ]
    assert expected == results
def test_method(self):
    """
    Check the Query spdx_lines against the expected JSON file, regenerating
    the expected file when `regen` is True.
    """
    idx = cache.get_index()
    qry = Query(location=test_loc, idx=idx)
    results = [list(l) for l in qry.spdx_lines]
    if regen:
        with open(expected_loc, 'w') as ef:
            json.dump(results, ef, indent=2)
        expected = results
    else:
        # json.load() no longer accepts an `encoding` argument on Python 3
        # (ignored before 3.9, removed in 3.9, where it raises TypeError);
        # JSON is UTF-8 by default, so read the file as text.
        with open(expected_loc) as ef:
            expected = json.load(ef)
    assert expected == results
def test_match_does_not_change_query_unknown_positions(self):
    from licensedcode.match import LicenseMatch
    from licensedcode.spans import Span

    location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
    idx = cache.get_index()

    # build a query first; this has the side effect to populate the unknowns
    qry1 = Query(location, idx=idx)
    txt = ' '.join(
        f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens))
    assert txt == (
        '0-this 1-repository 2-uses 3-2 4-different 5-licenses '
        '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
        '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
        '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
        '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
        '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
        '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
        '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
        '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
    )
    list(qry1.tokens_by_line())
    assert qry1.unknowns_by_pos == {}

    # run matching
    matches = idx.match(location=location)
    match = matches[0]
    rule = [
        r for r in idx.rules_by_rid
        if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
    ][0]
    expected = LicenseMatch(
        matcher='2-aho',
        rule=rule,
        qspan=Span(0, 48),
        ispan=Span(0, 48),
    )
    assert match == expected

    # check that query unknown by pos is the same and empty
    qry2 = match.query
    # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
    # after querying done during matching
    assert qry2.unknowns_by_pos == {}
def check_rule_or_license_can_be_self_detected_exactly(rule):
    idx = cache.get_index()
    matches = idx.match(
        location=rule.text_file,
        _skip_hash_match=True,
        deadline=10,
    )
    expected = [rule.identifier, '100']
    results = flatten(
        (m.rule.identifier, str(int(m.coverage()))) for m in matches)

    try:
        assert results == expected
    except:
        from licensedcode.tracing import get_texts

        data_file = rule.data_file or rule.text_file.replace('.LICENSE', '.yml')
        text_file = rule.text_file
        # On failure, compare against extra details such as clickable
        # text_file paths to ease debugging
        failure_trace = ['======= TEST ====']
        failure_trace.extend(results)
        failure_trace.extend([
            '',
            f'file://{data_file}',
            f'file://{text_file}',
            '======================',
        ])
        for i, match in enumerate(matches):
            qtext, itext = get_texts(match)
            m_text_file = match.rule.text_file
            if match.rule.is_from_license:
                m_data_file = m_text_file.replace('LICENSE', '.yml')
            else:
                m_data_file = match.rule.data_file
            failure_trace.extend([
                '',
                f'======= MATCH {i} ====',
                repr(match),
                f'file://{m_data_file}',
                f'file://{m_text_file}',
                '======= Matched Query Text:', '', qtext, ''
                '======= Matched Rule Text:', '', itext
            ])
        # this assert will always fail and provide a detailed failure trace
        assert '\n'.join(failure_trace) == '\n'.join(expected)
def process_codebase(self, codebase, **kwargs):
    """
    Update detected clues to remove redundant clues already found in
    another detected clue for all the resources of codebase.
    """
    if TRACE:
        logger_debug('RedundantFilter:process_codebase')

    from licensedcode.cache import get_index

    rules_by_id = {r.identifier: r for r in get_index().rules_by_rid}

    for resource in codebase.walk():
        updated = filter_ignorable_resource_clues(resource, rules_by_id)
        if updated:
            updated.save(codebase)
def test_match_texts_with_short_lgpl_and_gpl_notices(self):
    idx = cache.get_index()
    test_loc = self.get_test_loc('detect/short_l_and_gpls')
    matches = idx.match(location=test_loc)
    assert len(matches) == 6
    results = [m.matched_text(whole_lines=False) for m in matches]
    expected = [
        'This software is distributed under the following licenses:\n[Driver]: GNU General Public License (GPL)',
        'GNU Lesser General Public License (LGPL)',
        'This software is distributed under the following licenses:\n[Driver]: GNU General Public License (GPL)',
        'GNU Lesser General Public (LGPL)',
        'GNU Lesser General Public (LGPL)',
        'GNU Lesser General Public (LGPL)',
    ]
    assert expected == results
def test_match_texts_with_short_lgpl_and_gpl_notices(self):
    index = cache.get_index()
    location = self.get_test_loc('detect/short_l_and_gpls')
    detections = index.match(location=location)
    assert 6 == len(detections)
    matched_texts = [m.matched_text(whole_lines=False) for m in detections]
    expected = [
        'GNU General Public License (GPL',
        'GNU Lesser General Public License (LGPL',
        'GNU General Public License (GPL',
        'GNU Lesser General Public (LGPL',
        'GNU Lesser General Public (LGPL',
        'GNU Lesser General Public (LGPL',
    ]
    assert expected == matched_texts
def get_match(text):
    """
    Return a tuple of: (top matched license key or None, (True if this an
    exact match, False if the match is ok, None if the match is weak, the
    matched score).
    """
    idx = get_index()
    matches = list(idx.match(query_string=text, min_score=80))
    if not matches:
        return None, None, 0

    match = matches[0]
    query = match.query
    query_len = len(query.whole_query_run().tokens)
    rule = match.rule
    key = rule.licenses[0]

    # exact: one full-text hash match of a single-license license rule
    if (len(matches) == 1
            and rule.is_license
            and len(rule.licenses) == 1
            and match.matcher == '1-hash'
            and match.score() == 100
            and match.qlen == query_len):
        return key, True, 100

    # ok: single license with very high coverage and score
    if (len(rule.licenses) == 1
            and match.coverage() > 95
            and match.score() > 95):
        return key, False, match.score()

    # weak: single license with high coverage and score
    if (len(rule.licenses) == 1
            and match.coverage() > 90
            and match.score() > 90):
        return key, None, match.score()

    if match.score() > 85:
        # junk match
        return key, -1, match.score()

    return None, None, None
def closure_test_function(*args, **kwargs):
    idx = cache.get_index()
    matches = idx.match(location=test_file, min_score=0)
    if not matches:
        matches = []
    detected_expressions = [
        match.rule.license_expression for match in matches
    ]

    # use detection as expected and dump test back
    if regen:
        if not expected_failure:
            license_test.license_expressions = detected_expressions
            license_test.dump()
        return

    try:
        assert expected_expressions == detected_expressions
    except:
        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        failure_trace = detected_expressions[:]
        failure_trace.extend([test_name, 'test file: file://' + test_file])
        for match in matches:
            qtext, itext = get_texts(match, location=test_file, idx=idx)
            rule_text_file = match.rule.text_file
            rule_data_file = match.rule.data_file
            failure_trace.extend([
                '', '', '======= MATCH ====', match,
                '======= Matched Query Text for:',
                'file://{test_file}'.format(**locals())
            ])
            if test_data_file:
                failure_trace.append(
                    'file://{test_data_file}'.format(**locals()))
            failure_trace.append(qtext.splitlines())
            failure_trace.extend([
                '',
                '======= Matched Rule Text for:'
                'file://{rule_text_file}'.format(**locals()),
                'file://{rule_data_file}'.format(**locals()),
                itext.splitlines(),
            ])
        # this assert will always fail and provide a detailed failure trace
        assert expected_expressions == failure_trace
def _print_rule_stats():
    """
    Print rules statistics: the most common and smallest rule lengths and
    high-token lengths.
    """
    from licensedcode.cache import get_index
    idx = get_index()
    rules = idx.rules_by_rid

    sizes = Counter(r.length for r in rules)
    print('Top 15 lengths: ', sizes.most_common(15))
    # Counter has no iteritems() on Python 3: use items() instead
    print('15 smallest lengths: ',
          sorted(sizes.items(), key=itemgetter(0))[:15])

    high_sizes = Counter(r.high_length for r in rules)
    print('Top 15 high lengths: ', high_sizes.most_common(15))
    print('15 smallest high lengths: ',
          sorted(high_sizes.items(), key=itemgetter(0))[:15])
def _print_rule_stats():
    """
    Print rules statistics: most common and smallest rule lengths and
    high-token lengths.
    """
    from licensedcode.cache import get_index
    idx = get_index()
    rules = idx.rules_by_rid

    sizes = Counter(r.length for r in rules)
    print('Top 15 lengths: ', sizes.most_common(15))
    # dict.iteritems() was removed in Python 3: use items() instead
    print('15 smallest lengths: ',
          sorted(sizes.items(), key=itemgetter(0))[:15])

    high_sizes = Counter(r.high_length for r in rules)
    print('Top 15 high lengths: ', high_sizes.most_common(15))
    print('15 smallest high lengths: ',
          sorted(high_sizes.items(), key=itemgetter(0))[:15])
def get_match(text):
    """
    Return a tuple of (license key, True if exact match, match score, match text)
    e.g.:
    - top matched license key or None,
    - True if this an exact match, False if the match is ok, None if the match
      is weak,
    - match score or 0 or None
    - matched text or None
    """
    from licensedcode.cache import get_index

    idx = get_index()
    matches = list(idx.match(query_string=text, min_score=80))
    if not matches:
        return None, None, 0, None

    match = matches[0]
    matched_text = match.matched_text(whole_lines=False)
    query = match.query
    query_len = len(query.whole_query_run().tokens)
    rule = match.rule
    rule_licenses = rule.license_keys()
    key = rule_licenses[0]

    # exact: one full-text hash match of a single-license license rule
    if (len(matches) == 1
            and rule.is_license
            and len(rule_licenses) == 1
            and match.matcher == '1-hash'
            and match.score() == 100
            and match.qlen == query_len):
        return key, True, 100, matched_text

    # ok: single license with very high coverage and score
    if (len(rule_licenses) == 1
            and match.coverage() > 95
            and match.score() > 95):
        return key, False, match.score(), matched_text

    # weak: single license with high coverage and score
    if (len(rule_licenses) == 1
            and match.coverage() > 90
            and match.score() > 90):
        return key, None, match.score(), matched_text

    if match.score() > 85:
        # junk match
        return key, -1, match.score(), matched_text

    return None, None, None, None
def test_match_works_for_apache_rule(self):
    idx = cache.get_index()
    querys = u'''I am not a license.

The Apache Software License, Version 2.0
http://www.apache.org/licenses/LICENSE-2.0.txt
'''
    matches = idx.match(query_string=querys)
    assert len(matches) == 1
    match = matches[0]
    assert match.rule.identifier == 'apache-2.0_8.RULE'
    assert match.matcher == match_aho.MATCH_AHO_EXACT
    qtext, _itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == u'The Apache Software License Version 2 0 http www apache org licenses LICENSE 2 0 txt'
    # the match covers only the notice and URL lines
    assert match.lines() == (3, 4)
def test_match_works_for_apache_rule(self):
    index = cache.get_index()
    querys = u'''I am not a license.

The Apache Software License, Version 2.0
http://www.apache.org/licenses/LICENSE-2.0.txt
'''
    detections = index.match(query_string=querys)
    assert 1 == len(detections)
    detection = detections[0]
    assert 'apache-2.0_8.RULE' == detection.rule.identifier
    assert match_aho.MATCH_AHO_EXACT == detection.matcher
    qtext, _itext = get_texts(detection, query_string=querys, idx=index)
    assert u'The Apache Software License Version 2 0 http www apache org licenses LICENSE 2 0 txt' == qtext
    # only the notice and URL lines are part of the match
    assert (3, 4) == detection.lines()
def closure_test_function(*args, **kwargs):
    idx = cache.get_index()
    matches = idx.match(
        location=test_file,
        min_score=min_score,
        # if negative, do not detect negative rules when testing negative rules
        detect_negative=detect_negative) or []

    # TODO: we should expect matches properly, not with a grab bag of flat license keys
    # flattened list of all detected license keys across all matches.
    detected_licenses = functional.flatten(
        map(unicode, match.rule.licenses) for match in matches)
    try:
        if not detect_negative:
            # we skipped negative detection for a negative rule
            # we just want to ensure that the rule was matched proper
            assert matches and not expected_licenses and not detected_licenses
        else:
            assert expected_licenses == detected_licenses
    except:
        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        match_failure_trace = []
        if trace_text:
            for match in matches:
                qtext, itext = get_texts(match, location=test_file, idx=idx)
                rule_text_file = match.rule.text_file
                rule_data_file = match.rule.data_file
                match_failure_trace.extend([
                    '', '', '======= MATCH ====', match,
                    '======= Matched Query Text for:',
                    'file://{test_file}'.format(**locals())
                ])
                if test_data_file:
                    match_failure_trace.append(
                        'file://{test_data_file}'.format(**locals()))
                match_failure_trace.append(qtext.splitlines())
                match_failure_trace.extend([
                    '',
                    '======= Matched Rule Text for:'
                    'file://{rule_text_file}'.format(**locals()),
                    'file://{rule_data_file}'.format(**locals()),
                    itext.splitlines(),
                ])
        # this assert will always fail and provide a detailed failure trace
        assert expected_licenses == detected_licenses + [
            test_name, 'test file: file://' + test_file
        ] + match_failure_trace
def get_license_matches_from_query_string(query_string, start_line=1):
    """
    Return a sequence of LicenseMatch objects from license detection of the
    `query_string` starting at ``start_line`` number. This is useful when
    matching a text fragment alone when it is part of a larger text.
    """
    if not query_string:
        return []
    from licensedcode import cache

    idx = cache.get_index()
    qry = query.build_query(
        query_string=query_string,
        idx=idx,
        start_line=start_line,
    )
    return idx.match_query(qry=qry)
def check_position(self, test_path, expected, with_span=True, print_results=False):
    """
    Check license detection in file or folder against expected result.
    Expected is a list of (license, lines span, qspan span) tuples.
    """
    test_location = self.get_test_loc(test_path)
    # FULL INDEX!!
    idx = cache.get_index()
    matches = idx.match(test_location)
    results = []
    for match in matches:
        for detected in match.rule.licenses:
            if print_results:
                # dump each match and its texts for debugging
                print()
                print(match)
                print_matched_texts(match, location=test_location, idx=idx)
            results.append(
                (detected, match.lines(), with_span and match.qspan or None))
    assert expected == results
def test_Query_with_spdx_basic(self):
    idx = cache.get_index()
    querys = '''
 * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
 * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0

Always

From uboot: the first two lines are patch-like:
 * SPDX-License-Identifier: GPL-2.0+ BSD-2-Clause
    '''
    qry = Query(query_string=querys, idx=idx)
    # each detected SPDX line comes with its start and end token positions
    expected = [
        ('SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)', 0, 15),
        ('SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0', 16, 34),
        ('SPDX-License-Identifier: GPL-2.0+ BSD-2-Clause', 45, 53),
    ]
    assert qry.spdx_lines == expected
def test_query_run_tokens_matchable(self):
    idx = cache.get_index()
    # NOTE: this is not a token present in any rules or licenses
    unknown_token = u'baridationally'
    assert unknown_token not in idx.dictionary

    query_s = u' '.join(u'''
        3 unable to create proc entry license gpl description driver author eric
        depends 2 6 24 19 generic smp mod module acpi baridationally register
        driver proc acpi disabled acpi install notify acpi baridationally get
        status cache caches create proc entry baridationally generate proc
        event acpi evaluate object acpi remove notify remove proc entry acpi
        baridationally driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2
        3 ubuntu 4 2 3 current stack pointer current stack pointer this module
        end usr src modules acpi include linux include asm include asm generic
        include acpi acpi c posix types 32 h types h types h h h h h
    '''.split())
    result = Query(query_string=query_s, idx=idx)
    assert 1 == len(result.query_runs)
    qr = result.query_runs[0]

    # the unknown token is dropped from the query run tokens
    expected_qr0 = u' '.join(u'''
        3 unable to create proc entry license gpl description driver author eric
        depends 2 6 24 19 generic smp mod module acpi register driver proc acpi
        disabled acpi install notify acpi get status cache caches create proc
        entry generate proc event acpi evaluate object acpi remove notify
        remove proc entry acpi driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc
        gnu 4 2 3 ubuntu 4 2 3 current stack pointer current stack pointer
        this module end usr src modules acpi include linux include asm include
        asm generic include acpi acpi c posix types 32 h types h types h h h h h
    '''.split())
    assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
    assert expected_qr0 == u' '.join(
        idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens)
        if p in qr.matchables)

    # only gpl and gnu are is in high matchables
    expected = u'license gpl gnu gnu'
    assert expected == u' '.join(
        idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens)
        if p in qr.high_matchables)
def test_query_run_tokens(self):
    """
    A single query run holds exactly the known query tokens: tokens absent
    from the index dictionary are dropped from the run.
    """
    query_s = u' '.join(u'''
        3 unable to create proc entry license gpl description driver author eric
        depends 2 6 24 19 generic smp mod module acpi baridationally register
        driver proc acpi disabled acpi install notify acpi baridationally get
        status cache caches create proc entry baridationally generate proc event
        acpi evaluate object acpi remove notify remove proc entry acpi
        baridationally driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3
        ubuntu 4 2 3 current stack pointer current stack pointer this module end
        usr src modules acpi include linux include asm include asm generic
        include acpi acpi c posix types 32 h types h types h h h h h
    '''.split())
    idx = cache.get_index()
    result = Query(query_string=query_s, idx=idx)
    assert 1 == len(result.query_runs)
    qr = result.query_runs[0]
    # NOTE: this is not a token present in any rules or licenses
    unknown_tokens = ('baridationally',)
    # FIX: the original `assert unknown_tokens not in idx.dictionary` tested
    # membership of the tuple itself (always true for a dict keyed by
    # strings); check each token individually instead.
    for tok in unknown_tokens:
        assert tok not in idx.dictionary
    # the run must contain the whole query minus the unknown tokens
    expected = u' '.join(t for t in query_s.split() if t not in unknown_tokens)
    assert expected == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
def test_match_handles_negative_rules_and_does_not_match_negative_regions_properly(self):
    """
    Regions of the query covered by a negative rule must never appear in
    the lines of any returned match.
    """
    # note: this test relies on the negative rule: not-a-license_busybox_2.RULE
    # with this text:
    # "libbusybox is GPL, not LGPL, and exports no stable API that might act as a copyright barrier."
    # and relies on the short rules that detect GPL and LGPL
    idx = cache.get_index()
    # lines 3 and 4 should NOT be part of any matches
    # they should match the negative "not-a-license_busybox_2.RULE"
    negative_lines_not_to_match = 3, 4
    # NOTE(review): this literal appears to have lost its original line
    # breaks; the line numbers above assume the query spans several lines,
    # with the busybox sentence on lines 3-4 -- confirm against history.
    querys = u''' licensed under the LGPL license libbusybox is GPL, not LGPL, and exports no stable API that might act as a copyright barrier. for the license license: dual BSD/GPL '''
    matches = idx.match(query_string=querys)
    for match in matches:
        for line in negative_lines_not_to_match:
            assert line not in match.lines()
def test_query_run_tokens_matchable(self):
    """
    A single query run keeps all known query tokens; the unknown token is
    excluded and only 'gpl' qualifies as a high matchable here.
    """
    idx = cache.get_index()
    # NOTE: this token is not present in any rules or licenses
    missing = u'baridationally'
    assert missing not in idx.dictionary
    query_s = u' '.join(u'''
        3 unable to create proc entry license gpl description driver author eric
        depends 2 6 24 19 generic smp mod module acpi baridationally register
        driver proc acpi disabled acpi install notify acpi baridationally get
        status cache caches create proc entry baridationally generate proc event
        acpi evaluate object acpi remove notify remove proc entry acpi
        baridationally driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3
        ubuntu 4 2 3 current stack pointer current stack pointer this module end
        usr src modules acpi include linux include asm include asm generic
        include acpi acpi c posix types 32 h types h types h h h h h
    '''.split())
    qry = Query(query_string=query_s, idx=idx)
    assert len(qry.query_runs) == 1
    run = qry.query_runs[0]

    def decoded(token_ids):
        # map token ids back to their text, space-separated
        return u' '.join(idx.tokens_by_tid[tid] for tid in token_ids)

    # the run text is the full query minus the unknown token
    expected_text = u' '.join(t for t in query_s.split() if t != missing)
    assert decoded(run.tokens) == expected_text
    matchable = [tid for pos, tid in enumerate(run.tokens) if pos in run.matchables]
    assert decoded(matchable) == expected_text
    # only gpl is in high matchables
    high = [tid for pos, tid in enumerate(run.tokens) if pos in run.high_matchables]
    assert decoded(high) == u'gpl'
def test_match_has_correct_line_positions_for_query_with_repeats(self):
    """
    When the same notice repeats in a query, each match must report the
    line pair of its own occurrence, not that of the first one.
    """
    # the Apache-2.0 notice repeats five times, two lines per occurrence
    repeat_qtext = u'The Apache Software License Version 2 0 http www apache org licenses LICENSE 2 0 txt'
    # each entry: (licenses, match.lines(), qtext)
    expected = [
        ([u'apache-2.0'], (start, start + 1), repeat_qtext)
        for start in (1, 3, 5, 7, 9)
    ]
    test_location = self.get_test_loc('positions/license1.txt')
    idx = cache.get_index()
    matches = idx.match(test_location)
    for pos, match in enumerate(matches):
        qtext, _itext = get_texts(match, location=test_location, idx=idx)
        found = (match.rule.licenses, match.lines(), qtext)
        assert expected[pos] == found
def test_match_license_performance_profiling_on_full_index_mixed_matching_long(self):
    """Profile mixed-type matching over several files on the full index."""
    # build the index up front: only detection is profiled, not indexing
    idx = cache.get_index()
    test_files = ('perf/test1.txt', 'perf/whatever.py', 'perf/udll.cxx')
    locations = [self.get_test_loc(path) for path in test_files]
    self.profile_match(idx, locations, 'license_match_mixed_matching_full_index_profile_log2.txt')
def scan(input_path, scanners, verbose=False, quiet=False, processes=1, timeout=DEFAULT_TIMEOUT, diag=False, scans_cache_class=None, strip_root=False, full_root=False, pre_scan_plugins=()):
    """
    Return a tuple of (files_count, scan_results, success) where scan_results
    is an iterable and success is a boolean.

    Run each requested scan proper: each individual file scan is cached on disk
    to free memory. Then the whole set of scans is loaded from the cache and
    streamed at the end.

    `input_path` is the path to scan. `scanners` maps a scan name to a
    (enabled_flag, ...) tuple; only scans whose first tuple item is truthy
    are run. `processes` is the number of worker processes (0 disables
    multiprocessing). `timeout` is the per-file scan timeout.
    `scans_cache_class` (required) is a factory for the on-disk scan cache.
    `strip_root`/`full_root` control how the root directory is reported.
    `pre_scan_plugins` are applied while enumerating resources.
    """
    assert scans_cache_class
    scan_summary = OrderedDict()
    scan_summary['scanned_path'] = input_path
    scan_summary['processes'] = processes

    # Display scan start details
    ############################
    # FIXME: it does not make sense to use tuple and positional values
    scans = [k for k, v in scanners.items() if v[0]]
    _scans = ', '.join(scans)
    if not quiet:
        echo_stderr('Scanning files for: %(_scans)s with %(processes)d process(es)...' % locals())
    scan_summary['scans'] = scans[:]
    scan_start = time()
    indexing_time = 0
    # FIXME: It does not make sense to use tuple and positional values
    with_licenses, _ = scanners.get('licenses', (False, ''))
    if with_licenses:
        # build index outside of the main loop for speed
        # this also ensures that forked processes will get the index on POSIX naturally
        if not quiet:
            echo_stderr('Building license detection index...', fg='green', nl=False)
        from licensedcode.cache import get_index
        get_index(False)
        indexing_time = time() - scan_start
        if not quiet:
            echo_stderr('Done.', fg='green', nl=True)
    scan_summary['indexing_time'] = indexing_time

    pool = None
    resources = resource_paths(input_path, diag, scans_cache_class, pre_scan_plugins=pre_scan_plugins)
    paths_with_error = []
    files_count = 0

    # log each scanned path to a file; binary mode on Linux, UTF-8 text elsewhere
    logfile_path = scans_cache_class().cache_files_log
    if on_linux:
        file_logger = partial(open, logfile_path, 'wb')
    else:
        file_logger = partial(codecs.open, logfile_path, 'w', encoding='utf-8')

    with file_logger() as logfile_fd:
        logged_resources = _resource_logger(logfile_fd, resources)
        # bind the fixed scan arguments so workers only receive a resource
        scanit = partial(_scanit, scanners=scanners, scans_cache_class=scans_cache_class, diag=diag, timeout=timeout, processes=processes)
        max_file_name_len = compute_fn_max_len()
        # do not display a file name in progress bar if there is less than 5 chars available.
        display_fn = bool(max_file_name_len > 10)
        try:
            if processes:
                # maxtasksperchild helps with recycling processes in case of leaks
                pool = get_pool(processes=processes, maxtasksperchild=1000)
                # Using chunksize is documented as much more efficient in the Python doc.
                # Yet "1" still provides a better and more progressive feedback.
                # With imap_unordered, results are returned as soon as ready and out of order.
                scanned_files = pool.imap_unordered(scanit, logged_resources, chunksize=1)
                pool.close()
            else:
                # no multiprocessing with processes=0
                scanned_files = imap(scanit, logged_resources)
                if not quiet:
                    echo_stderr('Disabling multi-processing and multi-threading...', fg='yellow')

            if not quiet:
                echo_stderr('Scanning files...', fg='green')

            def scan_event(item):
                """Progress event displayed each time a file is scanned"""
                if quiet or not item or not display_fn:
                    return ''
                _scan_success, _scanned_path = item
                _scanned_path = unicode(toascii(_scanned_path))
                if verbose:
                    _progress_line = _scanned_path
                else:
                    _progress_line = fixed_width_file_name(_scanned_path, max_file_name_len)
                return style('Scanned: ') + style(_progress_line, fg=_scan_success and 'green' or 'red')

            # NOTE(review): scanning_errors is never used below -- presumably
            # leftover from an earlier revision; confirm before removing.
            scanning_errors = []
            files_count = 0
            with progressmanager(scanned_files, item_show_func=scan_event, show_pos=True, verbose=verbose, quiet=quiet, file=sys.stderr) as scanned:
                while True:
                    try:
                        result = scanned.next()
                        scan_success, scanned_rel_path = result
                        if not scan_success:
                            paths_with_error.append(scanned_rel_path)
                        files_count += 1
                    except StopIteration:
                        break
                    except KeyboardInterrupt:
                        print('\nAborted with Ctrl+C!')
                        if pool:
                            pool.terminate()
                        break
        finally:
            if pool:
                # ensure the pool is really dead to work around a Python 2.7.3 bug:
                # http://bugs.python.org/issue15101
                pool.terminate()

    # TODO: add stats to results somehow
    # Compute stats
    ##########################
    scan_summary['files_count'] = files_count
    scan_summary['files_with_errors'] = paths_with_error
    total_time = time() - scan_start
    scanning_time = total_time - indexing_time
    scan_summary['total_time'] = total_time
    scan_summary['scanning_time'] = scanning_time
    # NOTE(review): raises ZeroDivisionError if scanning_time is 0 (e.g. an
    # empty, instantaneous scan) -- confirm whether this can happen in practice
    files_scanned_per_second = round(float(files_count) / scanning_time, 2)
    scan_summary['files_scanned_per_second'] = files_scanned_per_second

    if not quiet:
        # Display stats
        ##########################
        echo_stderr('Scanning done.', fg=paths_with_error and 'red' or 'green')
        if paths_with_error:
            if diag:
                echo_stderr('Some files failed to scan properly:', fg='red')
                # iterate cached results to collect all scan errors
                cached_scan = scans_cache_class()
                root_dir = _get_root_dir(input_path, strip_root, full_root)
                scan_results = cached_scan.iterate(scans, root_dir, paths_subset=paths_with_error)
                for scan_result in scan_results:
                    errored_path = scan_result.get('path', '')
                    echo_stderr('Path: ' + errored_path, fg='red')
                    for error in scan_result.get('scan_errors', []):
                        for emsg in error.splitlines(False):
                            echo_stderr(' ' + emsg)
                    echo_stderr('')
            else:
                echo_stderr('Some files failed to scan properly. Use the --diag option for additional details:', fg='red')
                for errored_path in paths_with_error:
                    echo_stderr(' ' + errored_path, fg='red')
        echo_stderr('Scan statistics: %(files_count)d files scanned in %(total_time)ds.' % locals())
        echo_stderr('Scan options: %(_scans)s with %(processes)d process(es).' % locals())
        echo_stderr('Scanning speed: %(files_scanned_per_second)s files per sec.' % locals())
        echo_stderr('Scanning time: %(scanning_time)ds.' % locals())
        echo_stderr('Indexing time: %(indexing_time)ds.' % locals(), reset=True)

    success = not paths_with_error
    # finally return an iterator on cached results
    cached_scan = scans_cache_class()
    root_dir = _get_root_dir(input_path, strip_root, full_root)
    return files_count, cached_scan.iterate(scans, root_dir), success
def test_query_from_binary_lkms_1(self):
    """A query from a binary kernel module yields a bounded number of runs."""
    idx = cache.get_index()
    lkm_location = self.get_test_loc('query/ath_pci.ko')
    qry = Query(lkm_location, idx=idx)
    assert len(qry.query_runs) < 15
def test_match_license_performance_profiling_on_full_index_with_spurious_filtered_seq_matches(self):
    """Profile matching of a text that triggers many filtered sequence matches."""
    # index is built first: only detection is being profiled, not indexing
    idx = cache.get_index()
    locations = [self.get_test_loc('perf/bsd-new_37.txt')]
    self.profile_match(idx, locations, 'license_match_mixed_matching_full_index_profile_filtered_seq_matches_log.txt')
def test_match_license_performance_profiling_on_full_index_small_binary_lkm2(self):
    """Profile matching of a small binary kernel module on the full index."""
    # prepare the index up front so the profile covers detection only
    idx = cache.get_index()
    self.profile_match(idx, [self.get_test_loc('perf/ath_pci.ko')], 'license_match_full_index_profile_log.txt')