def debug_matches(self, matches, message, location=None, query_string=None, with_text=False, qry=None):
    """
    Log debug-level data for each match in `matches`, prefixed by `message`.

    When `qry` is provided, line numbers are set on the matches first to ease
    debugging. When `with_text` is True, also print the matched query and rule
    texts for every match.
    """
    logger_debug(message + ':', len(matches))

    if qry:
        # set line early to ease debugging
        match.set_lines(matches, qry.line_by_pos)

    if with_text:
        logger_debug(message + ' MATCHED TEXTS')
        # imported lazily: the tracing helpers are only needed when texts are requested
        from licensedcode.tracing import get_texts
        for mtch in matches:
            logger_debug(mtch)
            qt, it = get_texts(mtch)
            print(' MATCHED QUERY TEXT:', qt)
            print(' MATCHED RULE TEXT:', it)
            print()
    else:
        for mtch in matches:
            logger_debug(mtch)
def debug_matches(self, matches, message, location=None, query_string=None, with_text=False, query=None):
    """
    Log debug-level data for a list of `matches`, prefixed by `message`,
    honoring the module-level TRACE* flags.

    When `query` is provided, line numbers are set on the matches first to
    ease debugging. When `with_text` is True (and the corresponding TRACE
    flag is on), also print the matched query and rule texts.
    """
    if TRACE or TRACE_NEGATIVE:
        logger_debug(message + ':', len(matches))
        if query:
            # set line early to ease debugging
            match.set_lines(matches, query.line_by_pos)

    if TRACE_MATCHES or TRACE_NEGATIVE:
        # BUG FIX: the original used `map(logger_debug, matches)`; map() is
        # lazy in Python 3, so nothing was ever logged. Use an explicit loop.
        for m in matches:
            logger_debug(m)

    if (TRACE_MATCHES_TEXT or TRACE_NEGATIVE) and with_text:
        logger_debug(message + ' MATCHED TEXTS')
        # imported lazily: tracing helpers are only needed when texts are requested
        from licensedcode.tracing import get_texts
        for m in matches:
            logger_debug(m)
            qt, it = get_texts(m, location, query_string, self)
            print(' MATCHED QUERY TEXT:', qt)
            print(' MATCHED RULE TEXT:', it)
            print()
def debug_matches(self, matches, message, location=None, query_string=None, with_text=False, query=None):
    """
    Log debug-level data for a list of `matches`, prefixed by `message`,
    honoring the module-level TRACE* flags.

    When `query` is provided, line numbers are set on the matches first to
    ease debugging. When `with_text` is True (and the corresponding TRACE
    flag is on), also print the matched query and rule texts.
    """
    if TRACE or TRACE_NEGATIVE:
        logger_debug(message + ':', len(matches))
        if query:
            # set line early to ease debugging
            match.set_lines(matches, query.line_by_pos)

    if TRACE_MATCHES or TRACE_NEGATIVE:
        # BUG FIX: the original used `map(logger_debug, matches)`; map() is
        # lazy in Python 3, so nothing was ever logged. Use an explicit loop.
        for m in matches:
            logger_debug(m)

    if (TRACE_MATCHES_TEXT or TRACE_NEGATIVE) and with_text:
        logger_debug(message + ' MATCHED TEXTS')
        for m in matches:
            logger_debug(m)
            qt, it = match.get_texts(m, location, query_string, self)
            print(' MATCHED QUERY TEXT:', qt)
            print(' MATCHED RULE TEXT:', it)
            print()
def match(self, location=None, query_string=None, min_score=0, as_expression=False, deadline=sys.maxsize, _skip_hash_match=False, **kwargs):
    """
    This is the main entry point to match licenses.

    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX license
    expression and use only expression matching.

    `deadline` is a time.time() value in seconds by which the processing
    should stop and return whatever was matched so far.

    `_skip_hash_match` is used only for testing.
    """
    # NOTE(review): this body was reconstructed from a flattened (newline-stripped)
    # source; indentation follows the obvious control flow — confirm against upstream.
    assert 0 <= min_score <= 100

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self, text_line_threshold=15, bin_line_threshold=50)

    if TRACE:
        logger_debug('match: for:', location, 'query:', qry)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # Whole-file hash matching first: an exact hash match short-circuits
    # everything else.
    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    # TODO: add match to degenerated expressions with custom symbols
    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # Negative rules: matched spans are subtracted from the query run so they
    # cannot produce false license detections later.
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(matches=negative_matches, message='negative_matches', location=location, query_string=query_string)  # , with_text, query)

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_spdx_id_matches, True, 'spdx_lid'),
        (self.get_exact_matches, False, 'aho'),
        (approx, False, 'seq'),
    ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(qry, matched_qspans=already_matched_qspans, existing_matches=matches, deadline=deadline)
        if TRACE:
            self.debug_matches(matches=matched, message='matched with: ' + matcher_name, location=location, query_string=query_string)  # , with_text, query)

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # subtract whole text matched if this is long enough
        for m in matched:
            if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                qry.subtract(m.qspan)

        # check if we have some matchable left
        # do not match futher if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(matches=matches, message='matches before final merge', location=location, query_string=query_string, with_text=True, qry=qry)

    # Final refinement: merge, score-filter and false-positive-filter the
    # accumulated matches from all matchers.
    matches, _discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        print()
        self.debug_matches(matches=matches, message='final matches', location=location, query_string=query_string, with_text=True, qry=qry)

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    `detect_negative` is for testing purpose only.
    """
    # NOTE(review): this body was reconstructed from a flattened (newline-stripped)
    # source; indentation follows the obvious control flow — confirm against upstream.
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash, negative and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: an exact whole-file hash match short-circuits everything else
    hash_matches = match_hash(self, whole_query_run)
    if hash_matches:
        self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: matched spans are subtracted from the
    # query run so they cannot produce false detections later
    negative = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative = self.negative_match(whole_query_run)
        for neg in negative:
            if TRACE_NEGATIVE:
                self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            logger_debug(' #match: NEGATIVE found', negative)

    # exact matches
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = refine_matches(exact_matches, self, query=qry)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
    # do not match futher if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = compute_candidates(query_run, self, rules_subset=rules_subset, top=40)
            if TRACE_QUERY_RUN:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', candidate)
                start_offset = 0
                # repeatedly seq-match the same candidate further down the
                # query run until no more matches are found past the last end
                while True:
                    rule_matches = match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate')
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True)

            run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
        if TRACE_MATCHES_DISCARD:
            self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)
    return matches
def match(self, location=None, query_string=None, min_score=0, as_expression=False, **kwargs):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX license
    expression and use only expression matching.
    """
    # NOTE(review): this body was reconstructed from a flattened (newline-stripped)
    # source; indentation follows the obvious control flow — confirm against upstream.
    assert 0 <= min_score <= 100

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self, text_line_threshold=15, bin_line_threshold=50)
    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # Whole-file hash matching first: an exact hash match short-circuits
    # everything else.
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # Negative rules: matched spans are subtracted from the query run so they
    # cannot produce false detections later.
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(negative_matches, 'negative_matches', location, query_string)  # , with_text, query)

    matches = []
    # matchers are tried in order; matching stops early once nothing
    # matchable remains in the query run
    matchers = [self.get_spdx_id_matches, self.get_exact_matches, self.get_approximate_matches]

    for matcher in matchers:
        matched = matcher(qry)
        if TRACE:
            logger_debug('matching with matcher:', matcher)
            self.debug_matches(matched, 'matched', location, query_string)  # , with_text, query)
        matches.extend(matched)

        # check if we have some matchable left
        # do not match futher if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [m.qspan for m in matches if m.coverage() == 100]
        if not whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
            break

    if not matches:
        return []

    # Final refinement: merge, score-filter and false-positive-filter the
    # accumulated matches from all matchers.
    matches, _discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
    matches.sort()
    match.set_lines(matches, qry.line_by_pos)
    return matches
def match_query(
    self,
    qry,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    Return a sequence of LicenseMatch by matching the `qry` Query against
    this index. See Index.match() for arguments documentation.
    """
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # Whole-query hash matching first: an exact hash match short-circuits
    # everything else.
    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    get_spdx_id_matches = partial(
        self.get_spdx_id_matches,
        expression_symbols=expression_symbols,
    )

    if as_expression:
        matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_exact_matches, False, 'aho'),
        (get_spdx_id_matches, True, 'spdx_lid'),
    ]

    if approximate:
        matchers += [
            (approx, False, 'seq'),
        ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry,
            matched_qspans=already_matched_qspans,
            existing_matches=matches,
            deadline=deadline,
        )

        if TRACE:
            # BUG FIX: the original passed location=location and
            # query_string=query_string here, but neither name exists in
            # match_query's scope (unlike in match()), raising a NameError
            # whenever TRACE is enabled. debug_matches defaults both to None.
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
            )

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # Subtract whole text matched if this is long enough
        for m in matched:
            if (m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98):
                qry.subtract(m.qspan)

        # Check if we have some matchable left do not match futher if we do
        # not need to collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match
        # fragments (unused for now).
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        # BUG FIX: same out-of-scope location/query_string removed here.
        self.debug_matches(
            matches=matches,
            message='matches before final merge',
            with_text=True,
            qry=qry,
        )

    # Final refinement: merge, score-filter and false-positive-filter the
    # accumulated matches from all matchers.
    matches, _discarded = match.refine_matches(
        matches=matches,
        idx=self,
        query=qry,
        min_score=min_score,
        filter_false_positive=True,
        merge=True,
    )

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        # BUG FIX: same out-of-scope location/query_string removed here.
        self.debug_matches(
            matches=matches,
            message='final matches',
            with_text=True,
            qry=qry,
        )

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    `detect_negative` is for testing purpose only.
    """
    # NOTE(review): this body was reconstructed from a flattened (newline-stripped)
    # source; indentation follows the obvious control flow — confirm against upstream.
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: an exact whole-file hash match short-circuits everything else
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: matched spans are subtracted from the
    # query run so they cannot produce false detections later
    negative_matches = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)

    # exact matches
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
    # do not match futher if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        # FIXME: we should exclude small and "weak" rules from the subset entirely
        # they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:query_run:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do not we aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be needed at all
            # alternatively we should consider aho matches to excludes them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)
            if TRACE_CANDIDATES:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    # candidate appears to be a 3-tuple whose middle item is
                    # the rule — TODO confirm against match_set.compute_candidates
                    _, canrule, _ = candidate
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                start_offset = 0
                # repeatedly seq-match the same candidate further down the
                # query run until no more matches are found past the last end
                while True:
                    rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
        if TRACE_MATCHES_DISCARD:
            self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)
    return matches