def get_spdx_id_matches(self, query, from_spdx_id_lines=True, **kwargs):
    """
    Matching strategy for SPDX-Licensed-Identifier style of expressions. If
    `from_spdx_id_lines` is True detect only in the SPDX license identifier
    lines found in the query. Otherwise use the whole query for detection.

    Return a list of SPDX license matches found in ``query``.
    """
    matches = []

    if from_spdx_id_lines:
        qrs_and_texts = query.spdx_lid_query_runs_and_text()
    else:
        # If we are not specifically looking at a single SPDX-Licene-
        # identifier line, then use the whole query run with the whole text.
        # Note this can only work for small texts or this will likely make
        # the expression parser choke if you feed it large texts
        query_lines = [ln for _, ln in tokenize.query_lines(query.location, query.query_string)]
        qrs_and_texts = query.whole_query_run(), u'\n'.join(query_lines)
        qrs_and_texts = [qrs_and_texts]

    for query_run, detectable_text in qrs_and_texts:
        if not query_run.matchables:
            # this could happen if there was some negative match applied
            continue
        spdx_match = match_spdx_lid.spdx_id_match(self, query_run, detectable_text)
        # spdx_id_match can return a falsy result when no expression could be
        # detected in this run: guard before dereferencing .qspan to avoid an
        # AttributeError on None.
        if spdx_match:
            query_run.subtract(spdx_match.qspan)
            matches.append(spdx_match)

    return matches
def get_exact_matches(self, query, deadline=sys.maxsize, **kwargs):
    """
    Extract matching strategy using an automaton for multimatching at once.

    Run the rules automaton over the whole query run of ``query`` and return
    the resulting matches after a light refinement pass (no false-positive
    filtering, no merging). Stop early if the ``deadline`` time is reached.
    """
    whole_run = query.whole_query_run()

    raw_matches = match_aho.exact_match(
        self,
        whole_run,
        self.rules_automaton,
        deadline=deadline,
    )

    refined, _discarded = match.refine_matches(
        raw_matches,
        self,
        query=query,
        filter_false_positive=False,
        merge=False,
    )
    return refined
def get_spdx_id_matches(
    self,
    query,
    from_spdx_id_lines=True,
    expression_symbols=None,
    **kwargs,
):
    """
    Matching strategy for SPDX-Licensed-Identifier style of expressions. If
    `from_spdx_id_lines` is True detect only in the SPDX license identifier
    lines found in the query. Otherwise use the whole query for detection.

    Use the ``expression_symbols`` mapping of {lowered key: LicenseSymbol}
    if provided. Otherwise use the standard SPDX license symbols.
    """
    collected = []

    if from_spdx_id_lines:
        runs_with_text = query.spdx_lid_query_runs_and_text()
    else:
        # If we are not specifically looking at a single SPDX-Licene-
        # identifier line, then use the whole query run with the whole text.
        # Note this can only work for small texts or this will likely make
        # the expression parser choke if you feed it large texts
        lines = [ln for _, ln in tokenize.query_lines(query.location, query.query_string)]
        runs_with_text = [(query.whole_query_run(), u'\n'.join(lines))]

    for run, detectable_text in runs_with_text:
        if not run.matchables:
            continue

        if TRACE_SPDX_LID:
            logger_debug(
                'get_spdx_id_matches:',
                'query_run:',
                run,
                'detectable_text:',
                detectable_text,
            )

        found = match_spdx_lid.spdx_id_match(
            idx=self,
            query_run=run,
            text=detectable_text,
            expression_symbols=expression_symbols,
        )

        if not found:
            continue
        run.subtract(found.qspan)
        collected.append(found)

    return collected
def get_approximate_matches(self, query, matched_qspans, existing_matches, deadline=sys.maxsize, **kwargs):
    """
    Approximate matching strategy breaking a query in query_runs and using
    multiple local alignments (aka. diff). Return a list of matches.

    ``matched_qspans`` is a list of qspans already matched by earlier
    strategies; a copy is used so the caller's list is not mutated by the
    near-dupe phase. ``existing_matches`` is accepted but not used here.
    Matching stops early once ``deadline`` (a time() value) has passed.
    """
    matches = []
    matchable_rids = self.approx_matchable_rids
    # copy so appends below do not mutate the caller-provided list
    already_matched_qspans = matched_qspans[:]

    MAX_NEAR_DUPE_CANDIDATES = 10

    # first check if the whole file may be close, near-dupe match
    whole_query_run = query.whole_query_run()
    near_dupe_candidates = match_set.compute_candidates(
        query_run=whole_query_run,
        idx=self,
        matchable_rids=matchable_rids,
        top=MAX_NEAR_DUPE_CANDIDATES,
        high_resemblance=True,
        _use_bigrams=USE_BIGRAM_MULTISETS,
    )

    # if near duplicates, we only match the whole file at once against these
    # candidates
    if near_dupe_candidates:
        if TRACE_APPROX_CANDIDATES:
            logger_debug('get_query_run_approximate_matches: near dupe candidates:')
            for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(near_dupe_candidates, 1):
                logger_debug(rank, sv1, sv2, can.identifier)

        matched = self.get_query_run_approximate_matches(
            whole_query_run, near_dupe_candidates, already_matched_qspans, deadline)

        matches.extend(matched)

        # subtract these
        # remove matched positions from the query so the per-run phase below
        # does not re-match them, and track the qspans as already matched
        for match in matched:
            qspan = match.qspan
            query.subtract(qspan)
            already_matched_qspans.append(qspan)

        # break if deadline has passed
        if time() > deadline:
            return matches

    # otherwise, and in all cases we break things in smaller query runs and
    # match each separately
    if USE_RULE_STARTS:
        query.refine_runs()

    if TRACE_APPROX:
        logger_debug('get_approximate_matches: len(query.query_runs):', len(query.query_runs))

    MAX_CANDIDATES = 70
    for query_run in query.query_runs:
        # inverted index match and ranking, query run-level
        # NOTE(review): this phase passes the original ``matched_qspans`` (not
        # the near-dupe-augmented copy) to the per-run matcher — presumably
        # intentional since the near-dupe positions were already subtracted
        # from the query; confirm against get_query_run_approximate_matches.
        candidates = match_set.compute_candidates(
            query_run=query_run,
            idx=self,
            matchable_rids=matchable_rids,
            top=MAX_CANDIDATES,
            high_resemblance=False,
            _use_bigrams=USE_BIGRAM_MULTISETS,
        )

        if TRACE_APPROX_CANDIDATES:
            logger_debug('get_query_run_approximate_matches: candidates:')
            for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(candidates, 1):
                logger_debug(rank, sv1, sv2, can.identifier)

        matched = self.get_query_run_approximate_matches(
            query_run, candidates, matched_qspans, deadline)

        matches.extend(matched)

        # break if deadline has passed
        if time() > deadline:
            break

    return matches