def add_rule(spdx_text, license_obj):
    """
    Add a new rule with text `spdx_text` for the `license_obj` License.

    The rule text is written to a `.RULE` file under `rules_data_dir`, then
    a Rule is created for it, assigned a sibling `.yml` data file path and
    dumped with `rule.dump()`.

    Raise an Exception if either the text file or the data file already
    exists.
    """
    rule_base_name = (
        'spdx_license_id_' + spdx_text.lower() + '_for_' + license_obj.key)
    text_file = os.path.join(rules_data_dir, rule_base_name + '.RULE')
    data_file = os.path.join(rules_data_dir, rule_base_name + '.yml')

    if os.path.exists(text_file) or os.path.exists(data_file):
        # Pass the placeholders explicitly: formatting with **locals() and a
        # `{text}` placeholder raised a KeyError since no local has that name.
        raise Exception(
            'Cannot create new SPDX rules text file for {spdx_text}. '
            'File already exists at: {text_file}'.format(
                spdx_text=spdx_text, text_file=text_file))

    # The file is opened in binary mode: encode text first so that writing
    # does not fail when `spdx_text` is a text string rather than bytes.
    if isinstance(spdx_text, bytes):
        payload = spdx_text
    else:
        payload = spdx_text.encode('utf-8')
    with open(text_file, 'wb') as tf:
        tf.write(payload)

    rule = Rule(
        text_file=text_file,
        license_expression=license_obj.key,
        relevance=80,
        minimum_coverage=100,
        notes='Used to detect a bare SPDX license id',
    )
    rule.data_file = data_file
    rule.dump()
    click.echo('Added new rule: ' + repr(rule))
def test_Query_tokens_by_line_from_string(self):
    """
    Check that Query.tokens_by_line() yields one list of token ids per
    query line, with None for tokens that are not known to the index, and
    that Query.line_by_pos tracks the line of each known token.
    """
    rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
    rule = Rule(stored_text=rule_text, license_expression='bsd')
    legalese = set(['redistribution', 'form', ])
    idx = index.LicenseIndex([rule], _legalese=legalese)

    # The expected results below are per line: the query must span seven
    # lines (first and fourth empty) for them and for line_by_pos to hold.
    querys = '\n'.join([
        '',
        'The',
        'Redistribution and use in source and binary are permitted',
        '',
        'Athena capital of Grece',
        'Paris and Athene',
        'Always',
    ])

    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [
        [],
        [None],
        [1, 2, 3, 4, 5, 2, 6, 12, 13],
        [],
        [None, None, None, None],
        [None, 2, None],
        [None],
    ]
    assert expected == result

    def qtbl_as_str(qtbl):
        # Convert token ids to actual token strings. `idx` is looked up at
        # call time, so this uses whichever index is current when called.
        return [
            [None if tid is None else idx.tokens_by_tid[tid] for tid in tids]
            for tids in qtbl
        ]

    result_str = qtbl_as_str(result)
    expected_str = [
        [],
        [None],
        ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted'],
        [],
        [None, None, None, None],
        [None, 'and', None],
        [None],
    ]
    assert expected_str == result_str

    assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

    # Same rule text, indexed this time without a custom legalese set.
    idx = index.LicenseIndex(
        [Rule(stored_text=rule_text, license_expression='bsd')])
    querys = 'and this is not a license'
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [['and', None, None, None, 'license']]
    assert expected == qtbl_as_str(result)
def add_rule(spdx_text, license_obj):
    """
    Add a new rule with text `spdx_text` for the `license_obj` License.

    The rule text is written as UTF-8 to a `.RULE` file under
    `rules_data_dir`, then a Rule is created for it, assigned a sibling
    `.yml` data file path and dumped with `rule.dump()`.

    Raise an Exception if either the text file or the data file already
    exists.
    """
    rule_base_name = (
        "spdx_license_id_" + spdx_text.lower() + "_for_" + license_obj.key)
    text_file = os.path.join(rules_data_dir, rule_base_name + ".RULE")
    data_file = os.path.join(rules_data_dir, rule_base_name + ".yml")

    if os.path.exists(text_file) or os.path.exists(data_file):
        # Pass the placeholders explicitly: formatting with **locals() and a
        # `{text}` placeholder raised a KeyError since no local has that name.
        raise Exception(
            "Cannot create new SPDX rules text file for {spdx_text}. "
            "File already exists at: {text_file}".format(
                spdx_text=spdx_text, text_file=text_file))

    with io.open(text_file, "w", encoding="utf-8") as tf:
        tf.write(spdx_text)

    rule = Rule(
        text_file=text_file,
        license_expression=license_obj.key,
        relevance=80,
        minimum_coverage=100,
        notes="Used to detect a bare SPDX license id",
    )
    rule.data_file = data_file
    rule.dump()
    click.echo("Added new rule: " + repr(rule))
def test_multiple_contained_matches_are_filtered(self):
    """
    Check that detect.filter_overlapping_matches() returns only the two
    wider matches out of four matches with the same licenses and score.
    """

    def make_match(rule_name, start, end):
        # Each match gets its own Rule, as with hand-written setup.
        rule = Rule(text_file=rule_name, licenses=['apache-2.0', 'gpl'])
        position = analysis.Token(start=start, end=end)
        return LicenseMatch(rule=rule, query_position=position, score=100)

    m1 = make_match('r1', 0, 5)
    contained1 = make_match('r2', 1, 2)
    contained2 = make_match('r3', 3, 4)
    m5 = make_match('r5', 1, 6)

    candidates = [m1, contained1, contained2, m5]
    result = detect.filter_overlapping_matches(candidates)
    assert [m1, m5] == result
def test_LicenseMatch_small(self):
    """
    Check that LicenseMatch.small() is true for each of the matches built
    below against a short rule and a ten-times repeated rule.
    """
    short_text = u'licensed under the GPL, licensed under the GPL'
    short_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=short_text)
    long_text = u'licensed under the GPL, licensed under the GPL' * 10
    long_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=long_text)
    # Both rules are indexed before any match is built.
    _idx = index.LicenseIndex([short_rule, long_rule])

    match = LicenseMatch(
        rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(
        rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(
        rule=short_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12),
        hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(rule=short_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert match.small()

    match = LicenseMatch(
        rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(
        rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6))
    assert match.small()

    match = LicenseMatch(rule=long_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert match.small()
def test_LicenseMatch_score_0_relevance(self):
    """
    Check that a match on a rule whose relevance is 0 has a score of 0.
    """
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 0
    rule.length = 6

    match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    score = match.score()
    assert score == 0
def test_LicenseMatch_score_0(self):
    """
    Check that a match built with empty spans on a rule whose relevance is
    0 has a score of 0.
    """
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 0
    rule.length = 6

    empty_match = LicenseMatch(rule=rule, qspan=Span(), ispan=Span())
    score = empty_match.score()
    assert score == 0
def test_LicenseMatch_score_0_relevance(self):
    """
    Check that a match on a rule whose relevance is 0 has a score of 0.
    """
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 0
    rule.length = 6

    match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    score = match.score()
    assert score == 0
def test_LicenseMatch_small(self):
    """
    Check LicenseMatch.small() on matches against a short rule and a
    ten-times repeated rule: every match but the last one is small.
    """
    short_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
    small_rule = Rule(text_file='small_rule', licenses=['apache-1.1'], _text=short_text)
    long_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
    long_rule = Rule(text_file='long_rule', licenses=['apache-1.1'], _text=long_text)
    # Both rules are indexed before any match is built.
    _idx = index.LicenseIndex([small_rule, long_rule])

    match = LicenseMatch(
        rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(
        rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(
        rule=small_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12),
        hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(rule=small_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert match.small()

    match = LicenseMatch(
        rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(
        rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6))
    assert match.small()

    # The only case asserted as not small.
    match = LicenseMatch(
        rule=small_rule, qspan=Span(1, 10), ispan=Span(1, 10), hispan=Span(3, 6))
    assert not match.small()
def test_LicenseMatch_score_100_contiguous(self):
    """
    Check that a match spanning positions 0 to 41 of a rule of length 42
    and relevance 100 has a score of 100.
    """
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 100
    rule.length = 42

    full_match = LicenseMatch(rule=rule, qspan=Span(0, 41), ispan=Span(0, 41))
    score = full_match.score()
    assert score == 100
def test_overlap_detection5(self):
    """
    Check that a query containing only the inner license text is matched
    once, to the rule built from that inner text.
    """
    # Containment relationship between the indexed licenses:
    #
    #   +-license 2 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+
    #
    #   +-license 4 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+

    # Index setup: the text of license 1 is embedded in license 2.
    inner_text = '''Redistribution and use permitted for MIT license.'''
    outer_text = '''Redistributions of source must retain copyright. Redistribution and use permitted for MIT license. Redistributions in binary form is permitted.'''
    rule1 = Rule(_text=inner_text, licenses=['overlap'])
    rule2 = Rule(_text=outer_text, licenses=['overlap'])
    idx = index.LicenseIndex([rule1, rule2])

    querys = '''My source. Redistribution and use permitted for MIT license. My code.'''

    # The query contains license 1: expect a single match to rule1.
    matches = idx.match(query_string=querys)
    assert 1 == len(matches)

    only_match = matches[0]
    assert rule1 == only_match.rule

    qtext, _itext = get_texts(only_match, query_string=querys, idx=idx)
    assert 'Redistribution and use permitted for MIT license' == qtext
def test_LicenseMatch_score_100_non_contiguous(self):
    """
    Check the score of a match whose query span is the union of two
    separate spans while its rule span is one contiguous span.
    """
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 100
    rule.length = 42

    split_qspan = Span(0, 19) | Span(30, 51)
    match = LicenseMatch(rule=rule, qspan=split_qspan, ispan=Span(0, 41))
    score = match.score()
    assert score == 80.77
def test_overlap_detection2_exact(self):
    """
    Check that a query made of the inner license text plus one extra word
    is matched once, to the rule built from that inner text.
    """
    # Containment relationship between the indexed licenses:
    #
    #   +-license 2 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+

    # Index setup: the text of license 1 is embedded in license 2.
    inner_text = '''Redistribution and use permitted.'''
    outer_text = '''Redistributions of source must retain copyright. Redistribution and use permitted. Redistributions in binary form is permitted.'''
    rule1 = Rule(stored_text=inner_text, license_expression='overlap')
    rule2 = Rule(stored_text=outer_text, license_expression='overlap')
    idx = index.LicenseIndex([rule1, rule2])

    # The query is license 1 with an extra "bla" word: expect a single
    # match to rule1, with the extra word bracketed in the matched text.
    querys = 'Redistribution and use bla permitted.'
    matches = idx.match(query_string=querys)
    assert 1 == len(matches)

    only_match = matches[0]
    assert rule1 == only_match.rule

    qtext, _itext = get_texts(only_match, query_string=querys, idx=idx)
    assert 'Redistribution and use [bla] permitted' == qtext
def test_LicenseMatch_score_25_with_stored_relvance(self):
    """
    Check the score of a match spanning positions 0 to 2 of a rule of
    length 6 whose relevance is set to 50.
    """
    # NOTE(review): the test name mentions 25 but the asserted score is 50;
    # confirm which value is intended.
    rule = Rule(text_file='r1', licenses=['apache-2.0'])
    rule.relevance = 50
    rule.length = 6

    match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    # NB we do not have a query here
    score = match.score()
    assert score == 50
def test_merge_should_not_merge_repeated_matches_out_of_sequence(self):
    """
    Check that merge_matches() returns unchanged three matches of the same
    rule that share the same rule span but have successive query spans.
    """
    rule = Rule(text_file='gpl-2.0_49.RULE', licenses=[u'gpl-2.0'])
    rule.rid = 2615

    first = LicenseMatch(
        rule=rule, matcher='chunk1', qspan=Span(0, 7), ispan=Span(0, 7))
    second = LicenseMatch(
        rule=rule, matcher='chunk2', qspan=Span(8, 15), ispan=Span(0, 7))
    third = LicenseMatch(
        rule=rule, matcher='chunk3', qspan=Span(16, 23), ispan=Span(0, 7))

    merged = merge_matches([first, second, third])
    assert [first, second, third] == merged
def test_merge_does_not_merge_overlapping_matches_of_different_rules_with_different_licensing(self):
    """
    Check that merge_matches() returns unchanged two overlapping matches
    that belong to two rules with different licenses.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl2'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    merged = merge_matches([match_a, match_b])
    assert [match_a, match_b] == merged
def test_LicenseMatch_score_25_with_stored_relevance(self):
    """
    Check that a match spanning positions 0 to 2 of a rule of length 6
    whose relevance is set to 50 has a score of 25.
    """
    rule = Rule(text_file='r1', license_expression='apache-2.0')
    rule.relevance = 50
    rule.length = 6

    match = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
    # NB we do not have a query here
    score = match.score()
    assert score == 25
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
    """
    Check the result of merge_matches() for two in-sequence matches of the
    same rule separated by a gap.
    """
    # NOTE(review): the name says "does_not_merge" but the expected result
    # is a single match whose spans are the union of both inputs; confirm
    # which is intended.
    rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule.length = 20

    head = LicenseMatch(rule=rule, qspan=Span(1, 10), ispan=Span(1, 10))
    tail = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(14, 20))

    combined = LicenseMatch(
        rule=rule,
        qspan=Span(1, 10) | Span(14, 20),
        ispan=Span(1, 10) | Span(14, 20),
    )
    expected = [combined]

    results = merge_matches([head, tail])
    assert expected == results
def test_merge_does_not_merge_contained_matches_of_different_rules_with_same_licensing(self):
    """
    Check that merge_matches() keeps both of two matches with identical
    spans that belong to two rules with the same licenses.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    merged = merge_matches([match_a, match_b])
    # Compare sorted lists: the order of the results is not asserted.
    assert sorted([match_a, match_b]) == sorted(merged)
def test_overlap_detection(self):
    """
    Check that matching each of four test files against an index of two
    rules returns a single match to the expected rule.
    """
    # Containment relationship between test and index licenses:
    #
    # * Index licenses:
    #
    #   +-license 2 --------+
    #   |  +-license 1 --+  |
    #   |  +-------------+  |
    #   +-------------------+
    #
    # * License texts to detect:
    #
    #   +- license 3 -----------+
    #   | +-license 2 --------+ |
    #   | |  +-license 1 --+  | |
    #   | |  +-------------+  | |
    #   | +-------------------+ |
    #   +-----------------------+
    #
    #   +-license 4 --------+
    #   |  +-license 1 --+  |
    #   |  +-------------+  |
    #   +-------------------+
    loc1 = self.get_test_loc('detect/overlap/license.txt')
    loc2 = self.get_test_loc('detect/overlap/license2.txt')
    loc3 = self.get_test_loc('detect/overlap/license3.txt')
    loc4 = self.get_test_loc('detect/overlap/license4.txt')

    # Index setup: only licenses 1 and 2 are indexed.
    rule1 = Rule(text_file=loc1, licenses=['overlap_license'])
    rule2 = Rule(text_file=loc2, licenses=['overlap_license'])
    idx = detect.LicenseIndex([rule1, rule2])

    scenarios = [
        (loc1, rule1),  # 1 contains nothing: return 1
        (loc2, rule2),  # 2 contains 1: return 2
        (loc3, rule2),  # 3 contains 2 that contains 1: return 2
        (loc4, rule1),  # 4 contains 1: return 1
    ]
    for location, expected_rule in scenarios:
        matches = idx.match(location)
        self.assertEqual(1, len(matches))
        self.assertEqual(expected_rule, matches[0].rule)
def test_files_does_filter_contained_matches_of_different_rules_with_same_licensing(self):
    """
    Check that filter_contained_matches() keeps the second and discards
    the first of two matches with identical spans that belong to two rules
    with the same licenses.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    kept, discarded = filter_contained_matches([match_a, match_b])
    assert [match_b] == kept
    assert [match_a] == discarded
def test_merge_does_not_merges_matches_with_same_spans_if_rules_are_different(self):
    """
    Check that merge_matches() merges the two overlapping matches of one
    rule and keeps apart a match of another rule that has the same spans
    as one of them.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    short_a = LicenseMatch(rule=rule_a, qspan=Span(0, 2), ispan=Span(0, 2))
    long_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))

    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
    short_b = LicenseMatch(rule=rule_b, qspan=Span(0, 2), ispan=Span(0, 2))

    merged = merge_matches([short_a, short_b, long_a])

    merged_a = LicenseMatch(rule=rule_a, qspan=Span(0, 6), ispan=Span(0, 6))
    # Compare sorted lists: the order of the results is not asserted.
    assert sorted([merged_a, short_b]) == sorted(merged)
def test_match_is_same(self):
    """
    Check that is_same() is true in both directions for two matches at the
    same query position whose rules list the same licenses in a different
    order.
    """
    rule_a = Rule(licenses=['apache-2.0', 'gpl'])
    match_a = LicenseMatch(
        rule=rule_a, query_position=analysis.Token(start=0, end=2))

    rule_b = Rule(licenses=['gpl', 'apache-2.0'])
    match_b = LicenseMatch(
        rule=rule_b, query_position=analysis.Token(start=0, end=2))

    self.assertTrue(match_a.is_same(match_b))
    self.assertTrue(match_b.is_same(match_a))
def test_filter_prefers_longer_overlaping_matches(self):
    """
    Check that filter_contained_matches() keeps only the longest of three
    overlapping matches and reports some discarded matches.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    overlap = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    shorter = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    longest = LicenseMatch(rule=rule_b, qspan=Span(1, 8), ispan=Span(1, 8))

    kept, discarded = filter_contained_matches([overlap, shorter, longest])
    assert [longest] == kept
    assert discarded
def test_matches_with_same_span_are_kept_if_licenses_are_different(self):
    """
    Check that detect.filter_matches() returns unchanged three matches,
    two of which share a query position but have different licenses.
    """
    rule_a = Rule(licenses=['apache-2.0'])
    match_a = LicenseMatch(
        rule=rule_a, query_position=analysis.Token(start=0, end=2))

    rule_b = Rule(licenses=['apache-1.1'])
    match_b = LicenseMatch(
        rule=rule_b, query_position=analysis.Token(start=0, end=2))

    wider_a = LicenseMatch(
        rule=rule_a, query_position=analysis.Token(start=1, end=6))

    expected = [match_a, match_b, wider_a]
    filtered = detect.filter_matches([match_a, match_b, wider_a])
    self.assertEqual(expected, filtered)
def test_combine_raise_TypeError_for_matches_of_different_rules(self):
    """
    Check that combining two matches that belong to different rules raises
    a TypeError.
    """
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    r2 = Rule(text_file='r2', license_expression='apache-2.0 OR gpl2')

    m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

    try:
        m1.combine(m2)
    except TypeError:
        pass
    else:
        # Without this branch the test passed even when nothing was raised.
        raise AssertionError(
            'combine() should raise TypeError for matches of different rules')
def test_filter_matches_filters_matches_with_medium_overlap_only_if_license_are_the_same(self):
    """
    Check that filter_contained_matches() keeps the first match of a rule
    and the match of another rule, and reports some discarded matches.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-1.1'])
    first_a = LicenseMatch(rule=rule_a, qspan=Span(0, 10), ispan=Span(0, 10))
    second_a = LicenseMatch(rule=rule_a, qspan=Span(3, 11), ispan=Span(3, 11))

    rule_b = Rule(text_file='r2', licenses=['gpl', 'apache-2.0'])
    only_b = LicenseMatch(rule=rule_b, qspan=Span(7, 15), ispan=Span(7, 15))

    kept, discarded = filter_contained_matches([first_a, second_a, only_b])
    # Compare sorted lists: the order of the results is not asserted.
    assert sorted([first_a, only_b]) == sorted(kept)
    assert discarded
def test_filter_does_filter_overlaping_matches_with_same_licensings(self):
    """
    Check that filter_contained_matches() keeps only the first of three
    overlapping matches with the same license expression and reports some
    discarded matches.
    """
    rule_a = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    rule_b = Rule(text_file='r2', license_expression='apache-2.0 OR gpl')

    overlap = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    same_span_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))
    same_span_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    candidates = [overlap, same_span_a, same_span_b]
    kept, discarded = filter_contained_matches(candidates)
    assert [overlap] == kept
    assert discarded
def test_filter_filters_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(self):
    """
    Check that filter_contained_matches() keeps only the widest of three
    matches with the same license and reports some discarded matches.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0'])
    short_a = LicenseMatch(rule=rule_a, qspan=Span(0, 2), ispan=Span(0, 2))
    wide_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))

    rule_b = Rule(text_file='r2', licenses=['apache-2.0'])
    short_b = LicenseMatch(rule=rule_b, qspan=Span(0, 2), ispan=Span(0, 2))

    kept, discarded = filter_contained_matches([short_a, short_b, wide_a])
    assert [wide_a] == kept
    assert discarded
def test_combine_matches_cannot_combine_matches_with_same_licensing_and_different_rules(self):
    """
    Check that combining two matches that belong to different rules with
    the same licenses raises a TypeError.
    """
    rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

    match_a = LicenseMatch(rule=rule_a, qspan=Span(0, 5), ispan=Span(0, 5))
    match_b = LicenseMatch(rule=rule_b, qspan=Span(1, 6), ispan=Span(1, 6))

    try:
        match_a.combine(match_b)
    except TypeError:
        pass
    else:
        # Reached only when combine() did not raise.
        self.fail('Should fail')
def test_merge_does_not_merges_matches_with_same_spans_if_licenses_are_the_same_but_have_different_licenses_ordering(self):
    """
    Check that merge_matches() merges the two overlapping matches of one
    rule and keeps apart a match with the same spans from a rule whose
    license expression lists the same licenses in another order.
    """
    rule_a = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    short_a = LicenseMatch(rule=rule_a, qspan=Span(0, 2), ispan=Span(0, 2))
    long_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))

    rule_b = Rule(text_file='r2', license_expression='gpl OR apache-2.0')
    short_b = LicenseMatch(rule=rule_b, qspan=Span(0, 2), ispan=Span(0, 2))

    merged = merge_matches([short_a, short_b, long_a])

    merged_a = LicenseMatch(rule=rule_a, qspan=Span(0, 6), ispan=Span(0, 6))
    # Compare sorted lists: the order of the results is not asserted.
    assert sorted([merged_a, short_b]) == sorted(merged)
def test_merge_does_not_merge_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(self):
    """
    Check that merge_matches() merges the two overlapping matches of one
    rule and keeps apart a match with the same spans from another rule
    that has the same license expression.
    """
    rule_a = Rule(text_file='r1', license_expression='apache-2.0')
    short_a = LicenseMatch(rule=rule_a, qspan=Span(0, 2), ispan=Span(0, 2))
    long_a = LicenseMatch(rule=rule_a, qspan=Span(1, 6), ispan=Span(1, 6))

    rule_b = Rule(text_file='r2', license_expression='apache-2.0')
    short_b = LicenseMatch(rule=rule_b, qspan=Span(0, 2), ispan=Span(0, 2))

    merged = merge_matches([short_a, short_b, long_a])

    merged_a = LicenseMatch(rule=rule_a, qspan=Span(0, 6), ispan=Span(0, 6))
    # Compare sorted lists: the order of the results is not asserted.
    assert sorted([merged_a, short_b]) == sorted(merged)
def test_index_starters_with_inter_gap_equal_to_ngram_length(self):
    """
    Check the rule tokens, the rule gaps and the starter ngrams returned by
    match_chunk.index_starters() for a rule text with two {{gap}} markers
    and an ngram length of 4.
    """
    test_text = '''I hereby abandon any{{SAX 2.0 (the)}}, and release all of {{the SAX 2.0 }}source code of his'''
    rule = Rule(_text=test_text, licenses=['public-domain'])

    rule_tokens = list(rule.tokens())
    expected_tokens = [
        'i', 'hereby', 'abandon', 'any',
        'and', 'release', 'all', 'of',
        'source', 'code', 'of', 'his',
    ]
    assert expected_tokens == rule_tokens

    gaps = rule.gaps
    assert set([3, 7]) == gaps

    starters = match_chunk.index_starters(rule_tokens, gaps, _ngram_length=4)
    expected = [
        (('i', 'hereby', 'abandon', 'any'), 0),
        (('and', 'release', 'all', 'of'), 4),
        (('source', 'code', 'of', 'his'), 8),
    ]
    assert expected == list(starters)
def test_index_starters_with_multiple_gaps_and_short_start(self):
    """
    Check the rule gaps and the starter ngrams returned by
    match_chunk.index_starters() for a rule text with many {{gap}} markers,
    including one after the first token and one at the very end.
    """
    # Adjacent literals are joined into one string by the parser.
    test_text = (
        " Copyright {{10 Copyright}}. All Rights Reserved. "
        "Redistribution materials provided "
        "The name {{5 Author}} must not be used to endorse or promote "
        "{{5 Author}}. For written permission, please contact "
        "{{5 Author Contact}}. "
        "4. Products derived from this Software may not be called "
        "{{5 Product}} nor may {{5 Product}} appear in their names without "
        "prior {{10 Author}} is a registered trademark of {{5 Author}}. "
        "5. Due credit should be given to {{10 Author and URL}} "
        "THIS SOFTWARE IS PROVIDED BY {{10 org}} ``AS IS'' AND ANY "
        "EXPRESSED OR IMPLIED "
        "IN NO EVENT SHALL {{5 Author}} OR ITS CONTRIBUTORS BE LIABLE "
        "{{tail gap}}"
    )
    rule = Rule(_text=test_text, licenses=['public-domain'])

    rule_tokens = list(rule.tokens())
    gaps = rule.gaps
    assert set([0, 8, 16, 21, 31, 33, 39, 44, 51, 56, 67]) == gaps

    starters = match_chunk.index_starters(rule_tokens, gaps, _ngram_length=4)
    expected = [
        (('all', 'rights', 'reserved', 'redistribution'), 1),
        (('must', 'not', 'be', 'used'), 9),
        (('for', 'written', 'permission', 'please'), 17),
        (('4', 'products', 'derived', 'from'), 22),
        (('appear', 'in', 'their', 'names'), 34),
        (('is', 'a', 'registered', 'trademark'), 40),
        (('5', 'due', 'credit', 'should'), 45),
        (('this', 'software', 'is', 'provided'), 52),
        (('as', 'is', 'and', 'any'), 57),
        (('or', 'its', 'contributors', 'be'), 68),
    ]
    assert expected == list(starters)