def test_merge_does_not_merge_overlapping_matches_in_sequence_with_asymmetric_overlap(self):
    r1 = Rule(text_file='r1', license_expression=u'lgpl-2.0-plus')
    # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28),
    #   'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5,
    #   qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142),
    #   qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200),
    #   ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)|Span(108, 128)|Span(130, 142),
    #   hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> merge_matches: next: LicenseMatch<'2-aho', lines=(28, 44),
    #   'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0,
    #   qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143),
    #   qspan=Span(198, 341), ispan=Span(0, 143),
    #   hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> ###merge_matches: next overlaps in sequence current, merged as new:
    #   LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE',
    #   u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144,
    #   hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143),
    #   qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341),
    #   ispan=Span(0, 143), his
    # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142)
    # ---> merge_matches: next:    qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143)

    m1 = LicenseMatch(
        rule=r1,
        qspan=Span(50, 90) | Span(92, 142) | Span(151, 182) | Span(199, 200),
        ispan=Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
            | Span(95, 100) | Span(108, 128) | Span(130, 142),
        hispan=Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
            | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
            | Span(70) | Span(80) | Span(88) | Span(96) | Span(111)
            | Span(113) | Span(115) | Span(131) | Span(141),
    )
    m2 = LicenseMatch(
        rule=r1,
        qspan=Span(198, 341),
        ispan=Span(0, 143),
        hispan=Span(1) | Span(10) | Span(14) | Span(18) | Span(24)
            | Span(27) | Span(52) | Span(57) | Span(61) | Span(65, 66)
            | Span(68) | Span(70) | Span(80) | Span(88) | Span(96)
            | Span(111) | Span(113) | Span(115) | Span(131) | Span(141),
    )
    matches = merge_matches([m1, m2])
    assert [m1, m2] == matches
def test_merge_merges_duplicate_matches(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0')
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
    m2 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
    matches = merge_matches([m1, m2])
    assert ([m1] == matches) or ([m2] == matches)
def test_merge_merges_duplicate_matches(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
    m2 = LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))
    matches = merge_matches([m1, m2])
    assert ([m1] == matches) or ([m2] == matches)
def test_merge_does_not_merge_overlapping_matches_in_sequence_with_asymmetric_overlap(self):
    r1 = Rule(text_file='r1', licenses=[u'lgpl-2.0-plus'])
    # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28),
    #   'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5,
    #   qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142),
    #   qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200),
    #   ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)|Span(108, 128)|Span(130, 142),
    #   hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> merge_matches: next: LicenseMatch<'2-aho', lines=(28, 44),
    #   'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0,
    #   qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143),
    #   qspan=Span(198, 341), ispan=Span(0, 143),
    #   hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> ###merge_matches: next overlaps in sequence current, merged as new:
    #   LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE',
    #   u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144,
    #   hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143),
    #   qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341),
    #   ispan=Span(0, 143), his
    # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142)
    # ---> merge_matches: next:    qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143)

    m1 = LicenseMatch(
        rule=r1,
        qspan=Span(50, 90) | Span(92, 142) | Span(151, 182) | Span(199, 200),
        ispan=Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
            | Span(95, 100) | Span(108, 128) | Span(130, 142),
        hispan=Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
            | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
            | Span(70) | Span(80) | Span(88) | Span(96) | Span(111)
            | Span(113) | Span(115) | Span(131) | Span(141),
    )
    m2 = LicenseMatch(
        rule=r1,
        qspan=Span(198, 341),
        ispan=Span(0, 143),
        hispan=Span(1) | Span(10) | Span(14) | Span(18) | Span(24)
            | Span(27) | Span(52) | Span(57) | Span(61) | Span(65, 66)
            | Span(68) | Span(70) | Span(80) | Span(88) | Span(96)
            | Span(111) | Span(113) | Span(115) | Span(131) | Span(141),
    )
    matches = merge_matches([m1, m2])
    assert [m1, m2] == matches
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    m2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == merge_matches([m1, m2])
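The expected merged spans in the tests above are plain Span unions. A quick sanity check of that arithmetic, assuming the same licensedcode.spans.Span these tests import:

from licensedcode.spans import Span

# two overlapping in-sequence spans union into one contiguous span
assert Span(0, 5) | Span(1, 6) == Span(0, 6)
# touching spans union the same way
assert Span(0, 2) | Span(3, 6) | Span(7, 8) == Span(0, 8)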
def get_fragments_matches(
    self,
    query,
    matched_qspans,
    deadline=sys.maxsize,
    **kwargs,
):
    """
    Approximate matching strategy breaking a query in query_runs and using
    fragment matching. Return a list of matches.
    """
    matches = []

    for query_run in query.query_runs:
        # we cannot do a sequence match in a query run without some high tokens left
        if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
            continue
        qrun_matches = match_aho.match_fragments(self, query_run)
        matches.extend(match.merge_matches(qrun_matches))
        # break if deadline has passed
        if time() > deadline:
            break

    return matches
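A minimal usage sketch for the deadline convention above, assuming an already-built index `idx` and a Query `qry` (both hypothetical here, not constructed in this snippet): the deadline is an absolute time() value, so a 10-second budget is expressed as "now plus ten".

from time import time

deadline = time() + 10  # stop matching roughly 10 seconds from now
fragment_matches = idx.get_fragments_matches(
    qry, matched_qspans=[], deadline=deadline)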
def test_merge_overlapping_matches(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    matches = merge_matches([m1, m2])
    assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == matches
def test_merge_should_not_merge_repeated_matches_out_of_sequence(self):
    rule = Rule(text_file='gpl-2.0_49.RULE', licenses=[u'gpl-2.0'])
    rule.rid = 2615
    m1 = LicenseMatch(rule=rule, matcher='chunk1', qspan=Span(0, 7), ispan=Span(0, 7))
    m2 = LicenseMatch(rule=rule, matcher='chunk2', qspan=Span(8, 15), ispan=Span(0, 7))
    m3 = LicenseMatch(rule=rule, matcher='chunk3', qspan=Span(16, 23), ispan=Span(0, 7))
    result = merge_matches([m1, m2, m3])
    assert [m1, m2, m3] == result
def test_merge_contiguous_contained_matches(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m2 = LicenseMatch(rule=r1, qspan=Span(3, 6), ispan=Span(3, 6))
    m5 = LicenseMatch(rule=r1, qspan=Span(7, 8), ispan=Span(7, 8))
    result = merge_matches([m1, m2, m5])
    assert [LicenseMatch(rule=r1, qspan=Span(0, 8), ispan=Span(0, 8))] == result
def test_merge_contiguous_touching_matches_in_sequence(self):
    r1 = Rule(_text='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m2 = LicenseMatch(rule=r1, qspan=Span(3, 6), ispan=Span(3, 6))
    result = merge_matches([m1, m2])
    match = result[0]
    assert LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)) == match
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_not_in_sequence(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
    m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(1, 3))
    matches = merge_matches([m1, m2])
    assert sorted([m1, m2]) == sorted(matches)
def test_merge_does_not_merge_overlapping_matches_of_different_rules_with_different_licensing(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl2'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))
    assert [m1, m2] == merge_matches([m1, m2])
def test_merge_does_merge_non_contiguous_matches_in_sequence(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m2 = LicenseMatch(rule=r1, qspan=Span(4, 6), ispan=Span(4, 6))
    m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    results = merge_matches([m1, m2, m5])
    assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == results
def test_merge_does_not_merge_contained_matches_of_different_rules_with_same_licensing(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))
    matches = merge_matches([m1, m2])
    assert sorted([m1, m2]) == sorted(matches)
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    r1.length = 20
    m1 = LicenseMatch(rule=r1, qspan=Span(1, 10), ispan=Span(1, 10))
    m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(14, 20))
    expected = [
        LicenseMatch(
            rule=r1,
            qspan=Span(1, 10) | Span(14, 20),
            ispan=Span(1, 10) | Span(14, 20),
        )
    ]
    results = merge_matches([m1, m2])
    assert expected == results
def test_merge_merges_contained_and_overlapping_match(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    contained = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
    overlapping = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    assert contained in overlapping
    assert contained in m1
    result = merge_matches([m1, contained, overlapping])
    expected = [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == result
def test_merge_does_not_merge_matches_with_same_spans_if_rules_are_different(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    result = merge_matches([m1, m2, m5])
    assert sorted([LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)), m2]) == sorted(result)
def test_merge_merges_contained_and_overlapping_match(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    contained = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
    overlapping = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    assert contained in overlapping
    assert contained in m1
    result = merge_matches([m1, contained, overlapping])
    expected = [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == result
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
    m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(4, 10))
    expected = [LicenseMatch(rule=r1, qspan=Span(1, 3) | Span(14, 20), ispan=Span(1, 10))]
    assert expected == merge_matches([m1, m2])
def test_merge_does_not_merge_matches_with_same_spans_if_licenses_are_the_same_but_have_different_license_ordering(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    r2 = Rule(text_file='r2', license_expression='gpl OR apache-2.0')
    m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    result = merge_matches([m1, m2, m5])
    assert sorted([LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)), m2]) == sorted(result)
def test_merge_does_not_merge_matches_with_same_spans_if_licenses_are_identical_but_rules_differ(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0')
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    r2 = Rule(text_file='r2', license_expression='apache-2.0')
    m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    matches = merge_matches([m1, m2, m5])
    assert sorted([LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)), m2]) == sorted(matches)
def test_merge_then_filter_matches_with_same_spans_if_licenses_are_identical_but_rules_differ(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m5 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    r2 = Rule(text_file='r2', licenses=['apache-2.0'])
    m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    matches = merge_matches([m1, m2, m5])
    matches, discarded = filter_contained_matches(matches)
    assert [LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6))] == matches
    assert discarded
def test_merge_does_not_merge_overlapping_matches_with_same_licensings(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    overlap = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    same_span1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    same_span2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))
    result = merge_matches([overlap, same_span1, same_span2])
    expected = [
        LicenseMatch(rule=r1, qspan=Span(0, 6), ispan=Span(0, 6)),
        LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6)),
    ]
    assert sorted(expected) == sorted(result)
def get_approximate_matches(self, query):
    """
    Approximate matching strategy using query_runs and multiple local
    alignments (aka. diff). Return a list of matches.
    """
    matches = []
    # we exclude small and "weak" rules from the subset entirely: they are
    # unlikely to be matchable with a seq match
    rules_subset = (self.regular_rids | self.small_rids).difference(self.weak_rids)

    for query_run in query.query_runs:
        if not query_run.is_matchable(include_low=True):
            continue

        # per query run hash matching just in case we are lucky
        hash_matches = match_hash.hash_match(self, query_run)
        if hash_matches:
            matches.extend(hash_matches)
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to exclude them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        run_matches = []
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                else:
                    matches_end = max(m.qend for m in rule_matches)
                    run_matches.extend(rule_matches)
                    if matches_end + 1 < query_run.end:
                        start_offset = matches_end + 1
                        continue
                    else:
                        break

        matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

    return matches
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    r1.length = 50
    m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
    m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(4, 10))
    expected = [LicenseMatch(rule=r1, qspan=Span(1, 3) | Span(14, 20), ispan=Span(1, 10))]
    results = merge_matches([m1, m2])
    assert expected == results
def test_merge_does_not_merge_multiple_contained_matches_across_rules(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    r2 = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    contained1 = LicenseMatch(rule=r2, qspan=Span(1, 2), ispan=Span(1, 2))
    r3 = Rule(text_file='r3', licenses=['apache-2.0', 'gpl'])
    contained2 = LicenseMatch(rule=r3, qspan=Span(3, 4), ispan=Span(3, 4))
    r5 = Rule(text_file='r5', licenses=['apache-2.0', 'gpl'])
    m5 = LicenseMatch(rule=r5, qspan=Span(1, 6), ispan=Span(1, 6))
    result = merge_matches([m1, contained1, contained2, m5])
    assert sorted([m1, contained1, contained2, m5]) == sorted(result)
def get_approximate_matches(self, query, matched_qspans=None, **kwargs):
    """
    Approximate matching strategy breaking a query in query_runs and using
    exact matching then multiple local alignments (aka. diff).
    Return a list of matches.
    """
    matches = []
    rules_subset = self.approx_matchable_rules_subset

    for query_run in query.query_runs:
        if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to exclude them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        run_matches = []
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                else:
                    matches_end = max(m.qend for m in rule_matches)
                    run_matches.extend(rule_matches)
                    if matches_end + 1 < query_run.end:
                        start_offset = matches_end + 1
                        continue
                    else:
                        break

        matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

    return matches
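The per-candidate loop above repeatedly re-runs sequence matching further down the query run until nothing more is found. A distilled, self-contained sketch of that control flow (the names here are placeholders for illustration, not the licensedcode API):

def match_repeatedly(match_once, run_end):
    """Call `match_once(start_offset)` until it returns nothing, advancing
    past the end of the last batch of (start, end) matches each time."""
    all_matches = []
    start_offset = 0
    while True:
        found = match_once(start_offset)
        if not found:
            break
        all_matches.extend(found)
        matches_end = max(end for _start, end in found)
        if matches_end + 1 >= run_end:
            break
        start_offset = matches_end + 1
    return all_matches

# with a fake single-pass matcher yielding one batch of matches per call:
batches = iter([[(0, 3)], [(5, 9)], []])
assert match_repeatedly(lambda offset: next(batches), run_end=20) == [(0, 3), (5, 9)]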
def get_query_run_approximate_matches(
    self,
    query_run,
    candidates,
    matched_qspans,
    deadline=sys.maxsize,
    **kwargs,
):
    """
    Return a list of approximate matches for a single query run.
    """
    matches = []

    # we cannot do a sequence match in a query run without some high tokens left
    if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
        if TRACE_APPROX:
            logger_debug(
                'get_query_run_approximate_matches: query_run not matchable:',
                query_run)
        return matches

    # Perform multiple sequence matching/alignment for each candidate at the
    # query run level, for as long as we get more non-overlapping matches back
    for _score_vecs, rid, candidate_rule, high_intersection in candidates:
        if USE_DMP:
            # Myers diff works best when the differences are small; otherwise
            # it performs rather poorly as it is not aware of legalese
            match_blocks = match_blocks_dmp
            high_postings = None
        else:
            # we prefer the high-token-aware seq matching only when the
            # matches are not clear: it works best when things are farther apart
            match_blocks = match_blocks_seq
            high_postings = self.high_postings_by_rid[rid]
            high_postings = {
                tid: postings
                for tid, postings in high_postings.items()
                if tid in high_intersection}

        start_offset = 0
        while True:
            rule_matches = match_seq.match_sequence(
                self,
                candidate_rule,
                query_run,
                high_postings=high_postings,
                start_offset=start_offset,
                match_blocks=match_blocks,
            )
            if TRACE_APPROX_MATCHES:
                self.debug_matches(
                    matches=rule_matches,
                    message='get_query_run_approximate_matches: rule_matches:',
                    with_text=True,
                    qry=query_run.query,
                )

            if not rule_matches:
                break

            matches_end = max(m.qend for m in rule_matches)
            matches.extend(rule_matches)

            if matches_end + 1 < query_run.end:
                start_offset = matches_end + 1
            else:
                break

            # break if deadline has passed
            if time() > deadline:
                break

        # break if deadline has passed
        if time() > deadline:
            break

    # FIXME: is this really needed here?
    matches = match.merge_matches(matches)

    return matches
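To make the `match_blocks` plumbing above concrete: both block-matching strategies produce aligned block triples. The stdlib difflib is used here only as an illustrative stand-in (match_blocks_dmp and match_blocks_seq are the licensedcode implementations; difflib merely shows the (query_pos, index_pos, size) block shape):

from difflib import SequenceMatcher

qtokens = [3, 7, 7, 9, 2, 5]
itokens = [7, 9, 2, 8, 5]
blocks = SequenceMatcher(a=qtokens, b=itokens).get_matching_blocks()
# -> [Match(a=2, b=0, size=3), Match(a=5, b=4, size=1), Match(a=6, b=5, size=0)]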
def match_query(
    self,
    qry,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    unknown_licenses=False,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    Return a sequence of LicenseMatch by matching the ``qry`` Query against
    this index. See Index.match() for arguments documentation.
    """
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_matched_lines(matches, qry.line_by_pos)
            return matches

    get_spdx_id_matches = partial(
        self.get_spdx_id_matches,
        expression_symbols=expression_symbols,
    )

    if as_expression:
        matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_matched_lines(matches, qry.line_by_pos)
        return matches

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_exact_matches, False, 'aho'),
        (get_spdx_id_matches, True, 'spdx_lid'),
    ]
    if approximate:
        matchers += [(approx, False, 'seq')]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry,
            matched_qspans=already_matched_qspans,
            existing_matches=matches,
            deadline=deadline,
        )
        if TRACE:
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
                location=qry.location,
                query_string=qry.query_string,
            )

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # Subtract whole text matched if this is long enough
        for m in matched:
            if (m.rule.is_license_text
                and m.rule.length > 120
                and m.coverage() > 98
            ):
                qry.subtract(m.qspan)

        # Check if we have some matchable left; do not match further if we do
        # not. We collect qspans matched exactly, e.g. with coverage 100%.
        # This coverage check is because we have provision to match fragments
        # (unused for now).
        already_matched_qspans.extend(
            m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
            include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    # refine matches without filtering false positives
    matches, _discarded = match.refine_matches(
        matches=matches,
        query=qry,
        min_score=min_score,
        filter_false_positive=False,
        merge=True,
    )

    if unknown_licenses:
        good_matches, weak_matches = match.split_weak_matches(matches)

        # Collect the positions that are "good matches" to exclude from
        # matching for unknown licenses. Create a Span to check for unknowns
        # based on this.
        original_qspan = Span(0, len(qry.tokens) - 1)
        good_qspans = (m.qspan for m in good_matches)
        good_qspan = Span().union(*good_qspans)
        unmatched_qspan = original_qspan.difference(good_qspan)

        # for each subspan, run unknown license detection
        unknown_matches = []
        for unspan in unmatched_qspan.subspans():
            unquery_run = query.QueryRun(
                query=qry,
                start=unspan.start,
                end=unspan.end,
            )
            unknown_match = match_unknown.match_unknowns(
                idx=self,
                query_run=unquery_run,
                automaton=self.unknown_automaton,
            )
            if unknown_match:
                unknown_matches.append(unknown_match)

        unknown_matches = match.filter_invalid_contained_unknown_matches(
            unknown_matches=unknown_matches,
            good_matches=good_matches,
        )
        matches.extend(unknown_matches)

        # reinject weak matches and let refine_matches keep the best
        matches.extend(weak_matches)

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(
            matches=matches,
            message='matches before final merge',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry,
        )

    matches, _discarded = match.refine_matches(
        matches=matches,
        query=qry,
        min_score=min_score,
        filter_false_positive=True,
        merge=True,
    )

    matches.sort()

    if TRACE:
        self.debug_matches(
            matches=matches,
            message='final matches',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry,
        )

    return matches
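A small check of the span arithmetic used above to carve out the unmatched regions, assuming licensedcode.spans.Span with the union/difference/subspans methods the code relies on:

from licensedcode.spans import Span

original = Span(0, 9)
good = Span(2, 4).union(Span(7, 8))
unmatched = original.difference(good)
# the remaining positions split into contiguous runs to scan for unknowns
assert list(unmatched.subspans()) == [Span(0, 1), Span(5, 6), Span(9)]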
def match_fragments(idx, query_run):
    """
    Return a list of LicenseMatch by matching the `query_run` against the
    `automaton` and `idx` index.

    This uses a BLAST-like matching approach: we match ngram fragments of
    rules (e.g. a seed) and then extend left and right.
    """
    if TRACE_FRAG:
        logger_debug('-------------->match_fragments')

    # Get matches using the AHO Fragments automaton
    matches = exact_match(
        idx, query_run, automaton=idx.fragments_automaton, matcher=MATCH_AHO_FRAG)
    if TRACE_FRAG:
        logger_debug('match_fragments')
        for m in matches:
            print(m)

    # Discard fragments that have any already matched positions in previous matches
    from licensedcode.match import filter_already_matched_matches
    matches, _discarded = filter_already_matched_matches(matches, query_run.query)

    # Merge matches with a zero max distance, e.g. contiguous or overlapping
    # matches to the same rule
    from licensedcode.match import merge_matches
    matches = merge_matches(matches, max_dist=0)

    # Extend matched fragments left and right. We group by rule.
    from licensedcode.seq import extend_match

    rules_by_rid = idx.rules_by_rid
    tids_by_rid = idx.tids_by_rid
    len_legalese = idx.len_legalese

    alo = qbegin = query_run.start
    ahi = query_run.end
    query = query_run.query
    qtokens = query.tokens
    matchables = query_run.matchables

    frag_matches = []

    keyf = lambda m: m.rule.rid
    matches.sort(key=keyf)
    matches_by_rule = groupby(matches, key=keyf)

    for rid, rule_matches in matches_by_rule:
        itokens = tids_by_rid[rid]
        blo, bhi = 0, len(itokens)
        rule = rules_by_rid[rid]

        for match in rule_matches:
            i, j, k = match.qstart, match.istart, match.len()
            # extend the alignment left and right as long as we have matchables
            qpos, ipos, mlen = extend_match(
                i, j, k, qtokens, itokens, alo, ahi, blo, bhi, matchables)

            qspan = Span(range(qpos, qpos + mlen))
            ispan = Span(range(ipos, ipos + mlen))
            hispan = Span(p for p in ispan if itokens[p] < len_legalese)
            match = LicenseMatch(
                rule, qspan, ispan, hispan, qbegin,
                matcher=MATCH_AHO_FRAG, query=query)
            frag_matches.append(match)

    # Merge the extended fragment matches as usual. (The original code merged
    # the pre-extension `matches` and discarded the result: a bug, fixed here.)
    frag_matches = merge_matches(frag_matches)

    return frag_matches
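The "extend left and right" step of the BLAST-like approach above can be illustrated on plain token lists. This is a simplification for illustration, not the licensedcode `extend_match` signature:

def extend_seed(qtokens, itokens, i, j, k):
    """Grow a matched seed qtokens[i:i+k] == itokens[j:j+k] outward while
    the surrounding tokens keep agreeing; return the widened (i, j, k)."""
    # extend left
    while i > 0 and j > 0 and qtokens[i - 1] == itokens[j - 1]:
        i, j, k = i - 1, j - 1, k + 1
    # extend right
    while i + k < len(qtokens) and j + k < len(itokens) and qtokens[i + k] == itokens[j + k]:
        k += 1
    return i, j, k

# the 'bc' seed extends to 'abcd' on both sides
assert extend_seed(list('xabcdy'), list('zabcdw'), 2, 2, 2) == (1, 1, 4)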
def match_query(
    self,
    qry,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    Return a sequence of LicenseMatch by matching the `qry` Query against
    this index. See Index.match() for arguments documentation.
    """
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    get_spdx_id_matches = partial(
        self.get_spdx_id_matches,
        expression_symbols=expression_symbols,
    )

    if as_expression:
        matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_exact_matches, False, 'aho'),
        (get_spdx_id_matches, True, 'spdx_lid'),
    ]
    if approximate:
        matchers += [(approx, False, 'seq')]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry,
            matched_qspans=already_matched_qspans,
            existing_matches=matches,
            deadline=deadline,
        )
        if TRACE:
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
                location=qry.location,
                query_string=qry.query_string,
            )

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # Subtract whole text matched if this is long enough
        for m in matched:
            if (m.rule.is_license_text
                and m.rule.length > 120
                and m.coverage() > 98):
                qry.subtract(m.qspan)

        # Check if we have some matchable left; do not match further if we do
        # not. We collect qspans matched exactly, e.g. with coverage 100%.
        # This coverage check is because we have provision to match fragments
        # (unused for now).
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
            include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(
            matches=matches,
            message='matches before final merge',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry)

    matches, _discarded = match.refine_matches(
        matches=matches,
        idx=self,
        query=qry,
        min_score=min_score,
        filter_false_positive=True,
        merge=True,
    )

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        self.debug_matches(
            matches=matches,
            message='final matches',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry,
        )

    return matches
def match(
    self,
    location=None,
    query_string=None,
    min_score=0,
    as_expression=False,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    This is the main entry point to match licenses.

    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater than or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX license
    expression and use only expression matching.

    `deadline` is a time.time() value in seconds by which the processing
    should stop and return whatever was matched so far.

    `_skip_hash_match` is used only for testing.
    """
    assert 0 <= min_score <= 100

    if not location and not query_string:
        return []

    qry = query.build_query(
        location, query_string, idx=self,
        text_line_threshold=15, bin_line_threshold=50)

    if TRACE:
        logger_debug('match: for:', location, 'query:', qry)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    # TODO: add match to degenerated expressions with custom symbols
    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(
                matches=negative_matches, message='negative_matches',
                location=location, query_string=query_string)

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_spdx_id_matches, True, 'spdx_lid'),
        (self.get_exact_matches, False, 'aho'),
        (approx, False, 'seq'),
    ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry, matched_qspans=already_matched_qspans,
            existing_matches=matches, deadline=deadline)
        if TRACE:
            self.debug_matches(
                matches=matched, message='matched with: ' + matcher_name,
                location=location, query_string=query_string)

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # subtract whole text matched if this is long enough
        for m in matched:
            if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                qry.subtract(m.qspan)

        # Check if we have some matchable left; do not match further if we do
        # not. We collect qspans matched exactly, e.g. with coverage 100%.
        # This coverage check is because we have provision to match fragments
        # (unused for now).
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
            include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(
            matches=matches, message='matches before final merge',
            location=location, query_string=query_string,
            with_text=True, qry=qry)

    matches, _discarded = match.refine_matches(
        matches, idx=self, query=qry, min_score=min_score,
        max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        print()
        self.debug_matches(
            matches=matches, message='final matches',
            location=location, query_string=query_string,
            with_text=True, qry=qry)

    return matches
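A minimal usage sketch for this entry point, assuming an index `idx` built elsewhere (the construction of a LicenseIndex is not shown in these snippets, and the attribute names on the returned matches are assumptions for illustration):

# `idx` is assumed to be a built LicenseIndex; match a file and inspect results
for m in idx.match(location='/path/to/NOTICE', min_score=50):
    print(m.rule.identifier, m.score(), m.qspan)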
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater than or equal to `min_score`.

    `detect_negative` is for testing purposes only.
    """
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash, negative and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash matching
    hash_matches = match_hash(self, whole_query_run)
    if hash_matches:
        self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching
    negative = []
    # note: detect_negative is False only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative = self.negative_match(whole_query_run)
        for neg in negative:
            if TRACE_NEGATIVE:
                self.debug_matches(
                    negative, '   ##match: NEGATIVE subtracting #:',
                    location, query_string)
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            logger_debug('     #match: NEGATIVE found', negative)

    # exact matching
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, '  #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = refine_matches(exact_matches, self, query=qry)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, '   #match: ===> exact matches refined')
        self.debug_matches(exact_discarded, '   #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # Check if we have some matchable left. We collect qspans matched exactly,
    # e.g. with coverage 100%. This coverage check is because we have
    # provision to match fragments (unused for now).
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]

    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug('  #match:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(
                        hash_matches, '  #match Query run matches (hash)',
                        location, query_string)
                matches.extend(hash_matches)
                continue

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug('  #match: Query run MATCHING proper....')

            run_matches = []
            candidates = compute_candidates(
                query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_QUERY_RUN:
                logger_debug(
                    '      #match: query_run: number of candidates for seq match #',
                    len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    logger_debug(
                        '  #match: query_run: seq matching candidate#:',
                        candidate_num, 'candidate:', candidate)

                start_offset = 0
                while True:
                    rule_matches = match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(
                            rule_matches,
                            '    #match: query_run: seq matches for candidate')
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(
                    run_matches, '  #match: ===> Query run matches',
                    location, query_string, with_text=True)

            run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(
                    run_matches, '  #match: Query run matches merged',
                    location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = refine_matches(
            matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)

        matches.sort()
        set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater than or equal to `min_score`.

    `detect_negative` is for testing purposes only.
    """
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash matching
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching
    negative_matches = []
    # note: detect_negative is False only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)

    # exact matching
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, '  #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = match.refine_matches(
        exact_matches, self, query=qry, filter_false_positive=False, merge=False)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, '   #match: ===> exact matches refined')
        self.debug_matches(exact_discarded, '   #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # Check if we have some matchable left. We collect qspans matched exactly,
    # e.g. with coverage 100%. This coverage check is because we have
    # provision to match fragments (unused for now).
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]

    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        # FIXME: we should exclude small and "weak" rules from the subset
        # entirely: they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug('  #match:query_run:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(
                        hash_matches, '  #match Query run matches (hash)',
                        location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do we not aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be
            # needed at all; alternatively we should consider aho matches
            # to exclude them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug('  #match: Query run MATCHING proper....')

            run_matches = []
            candidates = match_set.compute_candidates(
                query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_CANDIDATES:
                logger_debug(
                    '      #match: query_run: number of candidates for seq match #',
                    len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    _, canrule, _ = candidate
                    logger_debug(
                        '  #match: query_run: seq matching candidate#:',
                        candidate_num, 'candidate:', canrule)

                start_offset = 0
                while True:
                    rule_matches = match_seq.match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(
                            rule_matches,
                            '    #match: query_run: seq matches for candidate',
                            with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(
                    run_matches, '  #match: ===> Query run matches',
                    location, query_string, with_text=True, query=qry)

            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(
                    run_matches, '  #match: Query run matches merged',
                    location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = match.refine_matches(
            matches, idx=self, query=qry, min_score=min_score,
            max_dist=MAX_DIST // 2, filter_false_positive=True)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)

        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(
        matches, '#match: FINAL MATCHES', location, query_string,
        with_text=True, query=qry)

    return matches