def test_LicenseMatch_score_100_non_contiguous(self):
    """Check the score of a match whose query span is made of two pieces."""
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 100
    rule.length = 42

    # The query span leaves a gap between positions 19 and 30 while the
    # rule-side span is one continuous range.
    split_qspan = Span(0, 19) | Span(30, 51)
    match = LicenseMatch(rule=rule, qspan=split_qspan, ispan=Span(0, 41))

    assert match.score() == 80.77
def test_merge_contiguous_touching_matches_in_sequence(self):
    """Two touching matches of the same rule are merged into one match.

    Only the first element returned by merge_matches() is checked.
    """
    rule = Rule(stored_text='r1', license_expression='apache-2.0 OR gpl')
    head = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    tail = LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6))

    first_merged = merge_matches([head, tail])[0]

    whole = LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))
    assert whole == first_merged
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
    """Merge two in-sequence matches of one rule that are separated by a gap.

    Despite the test name, the expected outcome is a single match whose
    query and rule spans are the union of the two input spans.
    """
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule.length = 20

    before_gap = LicenseMatch(rule=rule, qspan=Span(1, 10), ispan=Span(1, 10))
    after_gap = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(14, 20))

    union_match = LicenseMatch(
        rule=rule,
        qspan=Span(1, 10) | Span(14, 20),
        ispan=Span(1, 10) | Span(14, 20),
    )
    expected = [union_match]

    results = merge_matches([before_gap, after_gap])
    assert expected == results
def test_merge_contiguous_contained_matches(self):
    """Three back-to-back matches of one rule merge into a single match."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    pieces = [
        LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2)),
        LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6)),
        LicenseMatch(rule=rule, qspan=Span(7, 8), ispan=Span(7, 8)),
    ]

    merged = merge_matches(pieces)

    whole = LicenseMatch(rule=rule, qspan=Span(0, 8), ispan=Span(0, 8))
    assert [whole] == merged
def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(
        self):
    """Two overlapping, in-sequence matches of one rule merge into one."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    left = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    right = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == merge_matches([left, right])
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_not_sequence(
        self):
    """Matches sharing a rule span at distant query spans stay separate."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    # Both matches cover the same rule positions (1-3) but sit at different
    # places in the query.
    near = LicenseMatch(rule=rule, qspan=Span(1, 3), ispan=Span(1, 3))
    far = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(1, 3))

    merged = merge_matches([near, far])

    # Compare order-independently: both inputs must come back.
    assert sorted([near, far]) == sorted(merged)
def test_combine_matches_with_same_rules(self):
    """combine() of two overlapping matches spans both inputs."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    left = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    right = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    combined = left.combine(right)

    assert Span(0, 6) == combined.qspan
    assert Span(0, 6) == combined.ispan
def test_merge_does_merge_non_contiguous_matches_in_sequence(self):
    """Two gapped matches plus a bridging match merge into a single match."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    start_piece = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    end_piece = LicenseMatch(rule=rule, qspan=Span(4, 6), ispan=Span(4, 6))
    # This match overlaps both pieces above.
    bridge = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    results = merge_matches([start_piece, end_piece, bridge])

    whole = LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))
    assert [whole] == results
def test_match_does_not_return_incorrect_matches(self):
    """Texts unrelated to the single indexed rule must produce no match."""
    rule_location = self.create_test_file('A one. A two. A three.')
    license_index = detect.LicenseIndex([Rule(text_file=rule_location)])

    unrelated_texts = (
        u'some other path',
        u'some junk',
        u'some path',
        u'some other junk',
    )
    for text in unrelated_texts:
        # Each text is matched on its own, wrapped in a one-item list.
        self.assertEqual([], license_index.match([text]))
def test_filter_matches_filters_multiple_nested_contained_matches_and_large_overlapping(self):
    """Only the first match survives; nested and overlapping ones are dropped."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

    def match_at(start, end):
        # Same positions on the query side and on the rule side.
        return LicenseMatch(rule=rule, qspan=Span(start, end), ispan=Span(start, end))

    outer = match_at(0, 5)
    large_overlap = match_at(1, 6)
    contained = match_at(1, 4)
    in_contained = match_at(2, 3)

    kept, discarded = filter_contained_matches(
        [outer, contained, in_contained, large_overlap])

    assert [outer] == kept
    # Something must have been reported as discarded.
    assert discarded
def check_detection(self, doc_file, rule_file, expected_matches):
    """Match `doc_file` against an index built from the single `rule_file`.

    Both arguments are resolved through self.get_test_loc(). The rule is
    indexed with the 'mit' license. Exactly one match is expected and its
    query_position must equal `expected_matches`.
    """
    rule_location = self.get_test_loc(rule_file)
    mit_rule = Rule(text_file=rule_location, licenses=['mit'])
    license_index = detect.LicenseIndex([mit_rule])

    doc_location = self.get_test_loc(doc_file)
    found = license_index.match(doc_location)

    assert 1 == len(found)
    assert expected_matches == found[0].query_position
def test_get_full_matched_text(self):
    """Check get_full_matched_text() for one match of a templated rule.

    A single rule containing {{...}} template regions is indexed and matched
    against a query string; exactly one match is expected. The joined output
    of get_full_matched_text() is then compared with an expected string in
    three calls: with default arguments, with a custom highlight_not_matched
    template, and with whole_lines=True.
    """
    # NOTE(review): the triple-quoted literals below appear with their line
    # breaks and indentation collapsed in this view. The exact-equality
    # assertions depend on that whitespace - confirm against the pristine file.
    rule_text = u''' Copyright {{some copyright}} THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE '''
    rule = Rule(
        _text=rule_text,
        licenses=['test'],
    )
    idx = index.LicenseIndex([rule])
    querys = u''' foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC '''
    result = idx.match(query_string=querys)
    assert 1 == len(result)
    match = result[0]

    # default call: the expected text wraps some tokens in square brackets
    expected = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. THIS IS FROM [THE] [CODEHAUS] AND CONTRIBUTORS IN NO EVENT SHALL [THE] [best] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE"""
    matched_text = u''.join(
        get_full_matched_text(match, query_string=querys, idx=idx))
    assert expected == matched_text

    # test again using a template
    expected = u"""Copyright <br>2003</br> (<br>C</br>) <br>James</br>. <br>All</br> <br>Rights</br> <br>Reserved</br>. THIS IS FROM <br>THE</br> <br>CODEHAUS</br> AND CONTRIBUTORS IN NO EVENT SHALL <br>THE</br> <br>best</br> <br>CODEHAUS</br> OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE <br>POSSIBILITY</br> <br>OF</br> <br>SUCH</br> DAMAGE"""
    matched_text = u''.join(
        get_full_matched_text(match, query_string=querys, idx=idx, highlight_not_matched=u'<br>%s</br>'))
    assert expected == matched_text

    # test again using whole_lines
    expected = u""" foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
chabada DAMAGE 12 ABC\n"""
    matched_text = u''.join(
        get_full_matched_text(match, query_string=querys, idx=idx, highlight_not_matched=u'%s', whole_lines=True))
    assert expected == matched_text
def test_match_return_correct_positions_with_short_index_and_queries(self):
    """Check matched texts and spans for a two-word rule on short queries.

    The index holds a single 'MIT License' rule. Queries cover an exact
    query, a query with a repeated leading word, and two longer queries
    that each contain the rule text twice.
    """
    idx = index.LicenseIndex([Rule(_text='MIT License', licenses=['mit'])])

    def check(match, query, expected_qtext, expected_qspan):
        # The rule side is the same for every match: the whole rule text
        # over rule positions 0-1.
        qtext, itext = get_texts(match, query_string=query, idx=idx)
        assert expected_qtext == qtext
        assert 'MIT License' == itext
        assert expected_qspan == match.qspan
        assert Span(0, 1) == match.ispan

    # Query identical to the rule text.
    exact_query = 'MIT License'
    found = idx.match(query_string=exact_query)
    assert 1 == len(found)
    assert {'_tst_11_0': {'mit': [0]}} == idx.to_dict()
    check(found[0], exact_query, 'MIT License', Span(0, 1))

    # A repeated leading word shifts the match by one query position.
    shifted_query = 'MIT MIT License'
    found = idx.match(query_string=shifted_query)
    assert 1 == len(found)
    check(found[0], shifted_query, 'MIT License', Span(1, 2))

    query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
    query_doc2 = '''do you think I am a mit license MIT License yes, I think so'''
    for query in (query_doc1, query_doc2):
        # Expected query spans: 'mit license' at 0-1, 'MIT License' at 2-3.
        found = idx.match(query_string=query)
        assert 2 == len(found)
        check(found[0], query, 'mit license', Span(0, 1))
        check(found[1], query, 'MIT License', Span(2, 3))
def test_merge_merges_contained_and_overlapping_match(self):
    """A base, a contained and an overlapping match merge into one match."""
    rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    base = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
    contained = LicenseMatch(rule=rule, qspan=Span(1, 4), ispan=Span(1, 4))
    overlapping = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

    # Check the containment relations of the fixture before merging.
    assert contained in overlapping
    assert contained in base

    merged = merge_matches([base, contained, overlapping])

    expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
    assert expected == merged
def test_rule_cannot_contain_extra_unknown_attributes(self):
    """Building a Rule from a data file with an unknown attribute must raise.

    The raised exception message must mention the unknown attribute name.
    """
    data_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.yml')
    text_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.RULE')
    expected = 'data file has unknown attributes: license_expressionnotuce'
    try:
        Rule(data_file=data_file, text_file=text_file)
    except Exception as e:
        assert expected in str(e)
    else:
        # Kept out of the try body: self.fail() raises an AssertionError,
        # which the broad except above would otherwise catch and report as
        # a confusing "expected in str(e)" failure.
        self.fail('Exception not raised.')
def get_test_rules(self, base, subset=None):
    """Return one Rule per entry of the test directory `base`.

    `base` is resolved through self.get_test_loc(). Entries are taken in
    sorted name order and each entry name doubles as the rule's license
    expression. When `subset` is truthy, only entries whose name is in
    `subset` are used.
    """
    rules_dir = self.get_test_loc(base)
    names = sorted(os.listdir(rules_dir))
    if subset:
        names = [name for name in names if name in subset]

    rules = []
    for name in names:
        location = os.path.join(rules_dir, name)
        rules.append(Rule(text_file=location, license_expression=name))
    return rules
def test_QueryRun(self):
    """A query equal to the rule text yields one run holding all its tokens."""
    text = 'redistributions in binary form must redistributions in'
    idx = index.LicenseIndex([Rule(_text=text)])
    query = Query(query_string=text, idx=idx)

    runs = query.query_runs
    assert 1 == len(runs)
    run = runs[0]

    # Translate the run's token ids back to token strings through the index.
    result = []
    for tid in run.tokens:
        result.append(idx.tokens_by_tid[tid])

    expected = [
        'redistributions',
        'in',
        'binary',
        'form',
        'must',
        'redistributions',
        'in',
    ]
    assert expected == result
def test_non_contiguous_matches_are_not_filtered(self):
    """filter_matches() keeps the first and third match and drops the second."""
    rule = Rule(licenses=['apache-2.0', 'gpl'])

    def match_at(start, end):
        return LicenseMatch(
            rule=rule, query_position=analysis.Token(start=start, end=end))

    m1 = match_at(0, 2)
    m2 = match_at(4, 6)
    m5 = match_at(1, 6)

    filtered = detect.filter_matches([m1, m2, m5])
    self.assertEqual([m1, m5], filtered)
def test_overlap_detection3(self):
    """Detect the outer license when the query wraps nested license texts.

    Containment relationship between test and index licenses:

    * Index licenses:
        +-license 2 --------+
        |  +-license 1 --+  |
        +-------------------+

    * License texts to detect:
        +- license 3 -----------+
        | +-license 2 --------+ |
        | |  +-license 1 --+  | |
        | +-------------------+ |
        +-----------------------+
    """
    # setup index
    license1 = '''Redistribution and use permitted.'''
    license2 = '''Redistributions of source must retain copyright. Redistribution and use permitted. Redistributions in binary form is permitted.'''
    inner_rule = Rule(_text=license1, licenses=['overlap'])
    outer_rule = Rule(_text=license2, licenses=['overlap'])
    idx = index.LicenseIndex([inner_rule, outer_rule])

    querys = '''My source. Redistributions of source must retain copyright. Redistribution and use permitted. Redistributions in binary form is permitted. My code.'''

    # querys contains license2, which contains license1: a single match
    # on the license2 rule is expected.
    found = idx.match(query_string=querys)
    assert 1 == len(found)
    only_match = found[0]
    assert outer_rule == only_match.rule

    qtext, _itext = get_texts(only_match, query_string=querys, idx=idx)
    expected = ''' Redistributions of source must retain copyright Redistribution and use permitted Redistributions in binary form is permitted'''.split()
    # Compare word by word so whitespace differences do not matter.
    assert expected == qtext.split()
def test_QueryRun_repr(self):
    """Check the plain and the trace representations of a QueryRun."""
    text = 'redistributions in binary form must redistributions in'
    idx = index.LicenseIndex([Rule(_text=text)])
    first_run = Query(query_string=text, idx=idx).query_runs[0]

    # The default repr has positions and line numbers only.
    plain = 'QueryRun(start=0, len=7, start_line=1, end_line=1)'
    assert plain == repr(first_run)

    # With trace_repr=True the repr also carries the run tokens.
    traced = 'QueryRun(start=0, len=7, start_line=1, end_line=1, tokens="redistributions in binary form must redistributions in")'
    assert traced == first_run.__repr__(trace_repr=True)
def test_single_contained_matche_is_filtered(self):
    """filter_matches() drops the match nested inside the first match."""
    rule = Rule(licenses=['apache-2.0', 'gpl'])

    def match_at(start, end):
        return LicenseMatch(
            rule=rule, query_position=analysis.Token(start=start, end=end))

    m1 = match_at(0, 5)
    contained = match_at(1, 4)
    m5 = match_at(1, 6)

    filtered = detect.filter_matches([m1, contained, m5])
    self.assertEqual([m1, m5], filtered)
def test_simple_detection_against_same_text(self):
    """Matching a rule file against itself gives one match over 0-86."""
    location = self.get_test_loc('detect/mit/mit.c')
    mit_rule = Rule(text_file=location, licenses=['mit'])
    license_index = detect.LicenseIndex([mit_rule])

    # The same file serves as both the rule and the query.
    found = license_index.match(location)

    assert 1 == len(found)
    only_match = found[0]
    assert mit_rule == only_match.rule
    assert 0 == only_match.span.start
    assert 86 == only_match.span.end
def test_filter_matches_filters_non_contiguous_or_overlapping__but_contained_matches(self):
    """Only the match spanning 0-7 is kept; the other four lie inside it."""
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

    def match_at(start, end):
        # Same positions on the query side and on the rule side.
        return LicenseMatch(rule=rule, qspan=Span(start, end), ispan=Span(start, end))

    m1 = match_at(1, 2)
    m2 = match_at(3, 6)
    m3 = match_at(1, 6)
    widest = match_at(0, 7)
    # Same span as m3.
    m5 = match_at(1, 6)

    kept, discarded = filter_contained_matches([m1, m2, m3, widest, m5])

    assert [widest] == kept
    # Something must have been reported as discarded.
    assert discarded
def test_match_matches_correctly_simple_exact_query_1(self):
    """Matching mit2.c against the mit.c rule gives one match over 0-86."""
    rule_location = self.get_test_loc('detect/mit/mit.c')
    mit_rule = Rule(text_file=rule_location, licenses=['mit'])
    idx = index.LicenseIndex([mit_rule])

    query_location = self.get_test_loc('detect/mit/mit2.c')
    found = idx.match(query_location)

    assert 1 == len(found)
    only_match = found[0]
    assert mit_rule == only_match.rule
    # Query-side and rule-side spans are expected to be the same.
    assert Span(0, 86) == only_match.qspan
    assert Span(0, 86) == only_match.ispan
def test_overlapping_matches_are_filtered(self):
    """Of two matches at the same position only the first one is kept."""
    rule = Rule(licenses=['apache-2.0', 'gpl'])

    def match_at(start, end):
        return LicenseMatch(
            rule=rule, query_position=analysis.Token(start=start, end=end))

    m1 = match_at(0, 5)
    same_span = match_at(1, 6)
    same_span_too = match_at(1, 6)

    filtered = detect.filter_matches([m1, same_span, same_span_too])
    self.assertEqual([m1, same_span], filtered)
def test_special_characters_detection(self):
    """Each kerberos test file matches exactly once against itself."""
    relative_paths = (
        'detect/specialcharacter/kerberos.txt',
        'detect/specialcharacter/kerberos1.txt',
        'detect/specialcharacter/kerberos2.txt',
        'detect/specialcharacter/kerberos3.txt',
    )
    locations = [self.get_test_loc(path) for path in relative_paths]

    for location in locations:
        # Build a fresh single-rule index from the file, then query it with
        # the very same file.
        kerberos_rule = Rule(text_file=location, licenses=['kerberos'])
        license_index = detect.LicenseIndex([kerberos_rule])
        found = license_index.match(location)
        self.assertEqual(1, len(found))
def test_simple_detection1(self):
    """Matching mit2.c against the mit.c rule gives one match over 5-91."""
    rule_location = self.get_test_loc('detect/mit/mit.c')
    mit_rule = Rule(text_file=rule_location, licenses=['mit'])
    license_index = detect.LicenseIndex([mit_rule])

    query_location = self.get_test_loc('detect/mit/mit2.c')
    found = license_index.match(query_location)

    assert 1 == len(found)
    only_match = found[0]
    assert mit_rule == only_match.rule
    assert 5 == only_match.span.start
    assert 91 == only_match.span.end
def test_match_index_return_one_match_with_correct_offsets(self):
    """The single match reports the character offsets of the rule text."""
    rule_location = self.create_test_file('A one. A two. A three.')
    license_index = detect.LicenseIndex([Rule(text_file=rule_location)])

    # (path, text) pair: only the text is matched.
    _path, text = (u'/some/path/', u'some junk. A one. A two. A three.')
    # In the text above, 'A one.' starts at character 11 and the last
    # character is at offset 32.
    found = license_index.match([text])

    self.assertEqual(1, len(found))
    position = found[0].query_position
    self.assertEqual(11, position.start_char)
    self.assertEqual(32, position.end_char)
def test_match_can_match_with_plain_rule_simple2(self):
    """Match a plain (non-templated) X Consortium rule against a test file.

    Exactly one match is expected. The matched query text returned by
    get_texts() is compared word by word (via split()) with the expected
    words, so whitespace differences do not affect the comparison.
    """
    # NOTE(review): the triple-quoted literals below appear with their line
    # breaks collapsed in this view - confirm against the pristine file.
    rule_text = u'''X11 License Copyright (C) 1996 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. X Window System is a trademark of X Consortium, Inc. 
'''
    rule = Rule(_text=rule_text, licenses=['x-consortium'])
    idx = index.LicenseIndex([rule])

    # the query is read from a test file location rather than from a string
    query_loc = self.get_test_loc(
        'detect/simple_detection/x11-xconsortium_text.txt')
    matches = idx.match(location=query_loc)
    assert 1 == len(matches)

    # the expected words carry no punctuation, unlike the rule text above
    expected_qtext = u''' X11 License Copyright C 1996 X Consortium Permission is hereby granted free of charge to any person obtaining a copy of this software and associated documentation files the Software to deal in the Software without restriction including without limitation the rights to use copy modify merge publish distribute sublicense and or sell copies of the Software and to permit persons to whom the Software is furnished to do so subject to the following conditions The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software THE SOFTWARE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE Except as contained in this notice the name of the X Consortium shall not be used in advertising or otherwise to promote the sale use or other dealings in this Software without prior written authorization from the X Consortium X Window System is a trademark of X Consortium Inc '''.split()
    match = matches[0]
    qtext, _itext = get_texts(match, location=query_loc, idx=idx)
    assert expected_qtext == qtext.split()
def test_filter_multiple_contained_matches(self):
    """Only the first match is kept; the other three are discarded.

    Every match belongs to its own rule. All rules share the same license
    expression.
    """
    def match_for(rule_name, start, end):
        # One dedicated rule per match, with identical query and rule spans.
        rule = Rule(text_file=rule_name, license_expression='apache-2.0 OR gpl')
        return LicenseMatch(rule=rule, qspan=Span(start, end), ispan=Span(start, end))

    m1 = match_for('r1', 0, 5)
    contained1 = match_for('r2', 1, 2)
    contained2 = match_for('r3', 3, 4)
    m5 = match_for('r5', 1, 6)

    matches, discarded = filter_contained_matches(
        [m1, contained1, contained2, m5])

    assert [m1] == matches
    # The order of the discarded matches is not checked.
    assert sorted([
        m5,
        contained1,
        contained2,
    ]) == sorted(discarded)