def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self): _text = u'copyright reserved mit is license, {{}} copyright reserved mit is license' # 0 1 2 3 4 5 6 7 8 9 licenses = ['tst'] rule = models.Rule(licenses=licenses, _text=_text) idx = index.LicenseIndex([rule]) expected_idx = {'_tst_73_0': {u'copyright': [0, 5], u'license': [4, 9], u'mit': [2, 7]}} assert expected_idx == idx.to_dict() querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.' # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 qry = Query(query_string=querys, idx=idx) # convert tid to actual token strings tks_as_str = lambda tks: [None if tid is None else idx.tokens_by_tid[tid] for tid in tks] expected = [None, None, u'copyright', u'reserved', u'mit', u'is', u'license', u'is', None, u'copyright', u'reserved', u'mit', u'is', u'license', None] # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 assert expected == tks_as_str(qry.tokens_with_unknowns()) result = idx.match(query_string=querys) assert 1 == len(result) match = result[0] assert Span(0, 4) | Span(6, 10) == match.qspan assert Span(0, 9) == match.ispan assert 100 == match.coverage() qtext, itext = get_texts(match, query_string=querys, idx=idx) assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext assert 'copyright reserved mit is license copyright reserved mit is license' == itext
def test_Query_tokens_by_line_from_string(self): rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted' idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])]) querys = ''' The Redistribution and use in source and binary are permitted Athena capital of Grece Paris and Athene Always''' qry = Query(query_string=querys, idx=idx, _test_mode=True) result = list(qry.tokens_by_line()) expected = [ [], [None], [11, 0, 6, 4, 3, 0, 1, 9, 2], [], [None, None, None, None], [None, 0, None], [None], ] assert expected == result # convert tid to actual token strings qtbl_as_str = lambda qtbl: [[None if tid is None else idx.tokens_by_tid[tid] for tid in tids] for tids in qtbl] result_str = qtbl_as_str(result) expected_str = [ [], [None], ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted'], [], [None, None, None, None], [None, 'and', None], [None], ] assert expected_str == result_str assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])]) querys = 'and this is not a license' qry = Query(query_string=querys, idx=idx, _test_mode=True) result = list(qry.tokens_by_line()) expected = [['and', None, None, None, None, None]] assert expected == qtbl_as_str(result)
def test_query_and_index_tokens_are_identical_for_same_text(self): rule_dir = self.get_test_loc('query/rtos_exact/') from licensedcode.models import load_rules idx = index.LicenseIndex(load_rules(rule_dir)) query_loc = self.get_test_loc('query/rtos_exact/gpl-2.0-freertos.RULE') index_text_tokens = [idx.tokens_by_tid[t] for t in idx.tids_by_rid[0]] qry = Query(location=query_loc, idx=idx, line_threshold=4) wqry = qry.whole_query_run() query_text_tokens = [idx.tokens_by_tid[t] for t in wqry.tokens] assert index_text_tokens == query_text_tokens assert u' '.join(index_text_tokens) == u' '.join(query_text_tokens)
def test_query_run_has_correct_offset(self): rule_dir = self.get_test_loc('query/runs/rules') rules = list(models.load_rules(rule_dir)) idx = index.LicenseIndex(rules) query_doc = self.get_test_loc('query/runs/query.txt') q = Query(location=query_doc, idx=idx, line_threshold=4) result = [qr.to_dict() for qr in q.query_runs] expected = [{ u'end': 0, u'start': 0, u'tokens': u'inc' }, { u'end': 121, u'start': 1, u'tokens': (u'this library is free software you can redistribute it and or modify ' u'it under the terms of the gnu library general public license as ' u'published by the free software foundation either version 2 of the ' u'license or at your option any later version this library is ' u'distributed in the hope that it will be useful but without any ' u'warranty without even the implied warranty of merchantability or ' u'fitness for particular purpose see the gnu library general public ' u'license for more details you should have received copy of the gnu ' u'library general public license along with this library see the file ' u'copying lib if not write to the free software foundation inc 51 ' u'franklin street fifth floor boston ma 02110 1301 usa') }] assert result == expected
def test_Query_from_real_index_and_location(self): idx = index.LicenseIndex(self.get_test_rules('index/bsd')) query_loc = self.get_test_loc('index/querytokens') qry = Query(location=query_loc, idx=idx, line_threshold=4) result = [qr.to_dict() for qr in qry.query_runs] expected = [{ 'end': 35, 'start': 0, 'tokens': (u'redistribution and use in source and binary forms ' u'redistributions of source code must the this that is not ' u'to redistributions in binary form must this software is ' u'provided by the copyright holders and contributors as is') }, { 'end': 36, 'start': 36, 'tokens': u'redistributions' }] assert result == expected expected_lbp = [ 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15 ] assert qry.line_by_pos == expected_lbp
def test_query_runs_with_plain_rule(self): rule_text = u'''X11 License Copyright (C) 1996 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. X Window System is a trademark of X Consortium, Inc. ''' rule = Rule(stored_text=rule_text, license_expression='x-consortium') idx = index.LicenseIndex([rule]) query_loc = self.get_test_loc( 'detect/simple_detection/x11-xconsortium_text.txt') qry = Query(location=query_loc, idx=idx) result = [q.to_dict(brief=False) for q in qry.query_runs] expected = [{ 'start': 0, 'end': 213, 'tokens': (u'x11 license copyright c 1996 x consortium permission is hereby ' u'granted free of charge to any person obtaining copy of this ' u'software and associated documentation files the software to deal in ' u'the software without restriction including without limitation the ' u'rights to use copy modify merge publish distribute sublicense and or ' u'sell copies of the software and to permit persons to whom the ' u'software is furnished to do so subject to the following conditions ' u'the above copyright notice and this permission notice shall be ' u'included in all copies or substantial portions of the software the ' u'software is provided as is without warranty of any kind express or ' u'implied including but not limited to the warranties of ' u'merchantability fitness for particular purpose and noninfringement ' u'in no event shall the x consortium be liable for any claim damages or ' u'other liability whether in an action of contract tort or otherwise ' u'arising from out of or in connection with the software or the use or ' u'other dealings in the software except as contained in this notice the ' u'name of the x consortium shall not be used in advertising or ' u'otherwise to promote the sale use or other dealings in this software ' u'without prior written authorization from the x consortium x window ' u'system is trademark of x consortium inc') }] assert len(qry.query_runs[0].tokens) == 214 assert result == expected
def test_QueryRuns_tokens_with_unknowns(self): rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted' idx = index.LicenseIndex( [Rule(stored_text=rule_text, license_expression='bsd')]) querys = ''' The Redistribution and use in source and binary are permitted. Athena capital of Grece Paris and Athene Always''' qry = Query(query_string=querys, idx=idx) assert set(qry.matchables) == set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) assert len(qry.query_runs) == 1 qrun = qry.query_runs[0] expected = [ 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and' ] assert tks_as_str(qrun.tokens, idx=idx) == expected expected = [ None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and' ] assert tks_as_str(query_run_tokens_with_unknowns(qrun), idx=idx) == expected assert qrun.start == 0 assert qrun.end == 9
def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self): rule_dir = self.get_test_loc('query/run_breaking/rules') rules = list(models.load_rules(rule_dir)) idx = index.LicenseIndex(rules) query_doc = self.get_test_loc('query/run_breaking/query.txt') q = Query(query_doc, idx=idx) result = [qr.to_dict() for qr in q.query_runs] expected = [ {'end': 119, 'start': 0, 'tokens': 'this library is free software you can redistribute it ' 'and or modify it under the terms of the gnu library ' 'general public license as published by the free software ' 'foundation either version 2 of the license or at your ' 'option any later version this library is distributed in ' 'the hope that it will be useful but without any warranty ' 'without even the implied warranty of merchantability or ' 'fitness for particular purpose see the gnu library ' 'general public license for more details you should have ' 'received copy of the gnu library general public ' 'license along with this library see the file copying lib ' 'if not write to the free software foundation 51 franklin ' 'street fifth floor boston ma 02110 1301 usa'} ] assert result == expected # check rules token are the same exact set as the set of the last query run txtid = idx.tokens_by_tid qrt = [txtid[t] for t in q.query_runs[-1].tokens] irt = [txtid[t] for t in idx.tids_by_rid[0]] assert irt == qrt
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self): from licensedcode_test_utils import query_tokens_with_unknowns # NOQA _stored_text = 'copyright reserved mit is license, copyright reserved mit is license' # 0 1 2 3 4 5 6 7 8 9 license_expression = 'tst' rule = models.Rule(license_expression=license_expression, stored_text=_stored_text) idx = MiniLicenseIndex([rule]) querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.' # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 qry = Query(query_string=querys, idx=idx) # convert tid to actual token strings tks_as_str = lambda tks: [None if tid is None else idx.tokens_by_tid[tid] for tid in tks] expected = [None, None, u'copyright', u'reserved', u'mit', u'is', u'license', u'is', None, u'copyright', u'reserved', u'mit', u'is', u'license', None] # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 assert tks_as_str(query_tokens_with_unknowns(qry)) == expected result = idx.match(query_string=querys) assert len(result) == 1 match = result[0] assert match.qspan == Span(0, 4) | Span(6, 10) assert match.ispan == Span(0, 9) assert match.coverage() == 100 qtext, itext = get_texts(match) assert qtext == 'copyright reserved mit is license [is] [the] copyright reserved mit is license' assert itext == 'copyright reserved mit is license copyright reserved mit is license'
def test_QueryRun_does_not_end_with_None(self): rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted' idx = index.LicenseIndex( [Rule(stored_text=rule_text, license_expression='bsd')]) querys = ''' The Redistribution and use in source and binary forms, with or without modification, are permitted. Always bar modification foo ''' # convert tid to actual token strings tks_as_str = lambda tks: [ None if tid is None else idx.tokens_by_tid[tid] for tid in tks ] qry = Query(query_string=querys, idx=idx) expected = [ None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted', None, None, 'modification', None ] assert [x for x in expected if x] == tks_as_str(qry.tokens) assert expected == tks_as_str(qry.tokens_with_unknowns()) assert 2 == len(qry.query_runs) qrun = qry.query_runs[0] expected = [ 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted' ] assert expected == tks_as_str(qrun.tokens) assert 0 == qrun.start assert 13 == qrun.end qrun = qry.query_runs[1] expected = ['modification'] assert expected == tks_as_str(qrun.tokens) assert 14 == qrun.start assert 14 == qrun.end
def test_QueryRun_with_all_digit_lines(self): rule = Rule(stored_text=''' redistributions 0 1 2 3 4 1568 5 6 7 368 8 9 10 80 12213 232312 in binary 345 in 256 free 1953 software 406 foundation 1151 free 429 software 634 foundation 1955 free 724 software 932 foundation 234 software 694 foundation 110 ''') legalese = set(['binary', 'redistributions', 'foundation']) idx = index.LicenseIndex([rule], _legalese=legalese) qs = ''' 25 17 1 -80.00000 .25000 37.00000 .25000 0: 5107 -2502 -700 496 -656 468 -587 418 -481 347 -325 256 -111 152 166 50 493 -37 854 -96 1221 -118 1568 -125 1953 -143 2433 -195 2464 -281 2529 -395 1987 -729 447 -916 -3011 -1181 -5559 -406 -6094 541 -5714 1110 -5247 1289 -4993 1254 -4960 1151 1: 4757 -1695 -644 429 -627 411 -602 368 -555 299 -470 206 -328 96 -125 -15 126 -105 391 -146 634 -120 762 -58 911 -13 1583 -8 1049 -28 1451 123 1377 -464 907 -603 -4056 -1955 -6769 -485 -5797 929 -4254 1413 -3251 1295 -2871 993 -2899 724 2: 4413 -932 -563 355 -566 354 -582 322 -597 258 -579 164 -499 45 -341 -84 -127 -192 93 -234 288 -157 190 -25 -145 65 1065 74 -1087 -40 -877 1058 -994 18 1208 694 -5540 -3840 -7658 -332 -4130 1732 -1668 1786 -634 1127 -525 501 -856 110 ''' qry = Query(query_string=qs, idx=idx) result = [qr.to_dict() for qr in qry.query_runs] # FIXME: we should not even have a query run for things that are all digits expected = [ { 'end': 5, 'start': 0, 'tokens': '1 80 0 256 1568 1953' }, { 'end': 12, 'start': 6, 'tokens': '406 1151 1 429 368 634 8' }, { 'end': 17, 'start': 13, 'tokens': '1955 724 2 932 234' }, ] assert result == expected assert not any(qr.is_matchable() for qr in qry.query_runs)
def test_query_from_binary_lkms_3(self): location = self.get_test_loc('query/wlan_xauth.ko') idx = cache.get_index() result = Query(location, idx=idx) assert len(result.query_runs) < 900 qr = result.query_runs[0] assert 'license dual bsd gpl' in u' '.join( idx.tokens_by_tid[t] for t in qr.matchable_tokens())
def test_query_from_binary_lkms_2(self): location = self.get_test_loc('query/eeepc_acpi.ko') idx = cache.get_index() result = Query(location, idx=idx) assert len(result.query_runs) < 500 qrs = result.query_runs[5:10] assert any('license gpl' in u' '.join(idx.tokens_by_tid[t] for t in qr.matchable_tokens()) for qr in qrs)
def test_query_unknowns_by_pos_and_stopwords_are_not_set_on_last_query_position(self): print('\nINDEX') idx = index.LicenseIndex( [Rule(stored_text='is the binary a')], _legalese=set(['binary']), _spdx_tokens=set() ) print('\nQUERY') q = Query(query_string='a bar binary that was a binary a is the foo bar a', idx=idx) tids = list(q.tokens_by_line()) assert tids == [[None, 0, None, None, 0, 1, 2, None, None]] # word: a bar binary that was a binary a is the foo bar a # tids: [ None 0, None, None, 0, 1, 2, None, None ] # known: st uk kn uk uk st kn st kn kn uk uk st # pos: 0 1 2 3 assert q.unknowns_by_pos == {-1: 1, 0: 2, 3: 2} assert q.stopwords_by_pos == {-1: 1, 0: 1, 1: 1, 3: 1}
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match( self): _stored_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license' # 0 1 2 3 4 5 6 7 8 9 license_expression = 'tst' rule = models.Rule(license_expression=license_expression, stored_text=_stored_text) idx = index.LicenseIndex([rule]) expected_idx = { '_tst_73_0': { u'copyright': [0, 5], u'license': [4, 9], u'mit': [2, 7] } } assert expected_idx == idx.to_dict() querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.' # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 qry = Query(query_string=querys, idx=idx) # convert tid to actual token strings tks_as_str = lambda tks: [ None if tid is None else idx.tokens_by_tid[tid] for tid in tks ] expected = [ None, None, u'copyright', u'reserved', u'mit', u'is', u'license', u'is', None, u'copyright', u'reserved', u'mit', u'is', u'license', None ] # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 assert expected == tks_as_str(qry.tokens_with_unknowns()) result = idx.match(query_string=querys) assert 1 == len(result) match = result[0] assert Span(0, 4) | Span(6, 10) == match.qspan assert Span(0, 9) == match.ispan assert 100 == match.coverage() qtext, itext = get_texts(match, query_string=querys, idx=idx) assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext assert 'copyright reserved mit is license copyright reserved mit is license' == itext
def test_QueryRun(self): idx = index.LicenseIndex([Rule(_text='redistributions in binary form must redistributions in')]) qry = Query(query_string='redistributions in binary form must redistributions in', idx=idx) qruns = qry.query_runs assert 1 == len(qruns) qr = qruns[0] # test result = [idx.tokens_by_tid[tid] for tid in qr.tokens] expected = ['redistributions', 'in', 'binary', 'form', 'must', 'redistributions', 'in'] assert expected == result
def test_Query_known_and_unknown_positions(self): rule_text = 'Redistribution and use in source and binary forms' rule = Rule(stored_text=rule_text, license_expression='bsd') legalese = set([ 'redistribution', 'form', ]) idx = index.LicenseIndex([rule], _legalese=legalese) querys = 'The new Redistribution and use in other form always' qry = Query(query_string=querys, idx=idx, _test_mode=False) # we have only 4 known positions in this query, hence only 4 entries there on a single line # "Redistribution and use in" assert [1, 1, 1, 1, 1] == qry.line_by_pos # this show our 4 known token in this query with their known position # "Redistribution and use in" assert [1, 2, 3, 4, 0] == qry.tokens # the first two tokens are unknown, then starting after "in" we have three trailing unknown. assert {3: 1, 4: 1, -1: 2} == qry.unknowns_by_pos # This shows how knowns and unknowns are blended result = list(qry.tokens_with_unknowns()) expected = [ # The new None, None, # Redistribution 1, # and 2, # use 3, # in 4, # other form always' None, 0, None ] assert expected == result
def test_QueryRun_does_not_end_with_None(self): rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted' idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])]) querys = ''' The Redistribution and use in source and binary forms, with or without modification, are permitted. Always bar modification foo ''' # convert tid to actual token strings tks_as_str = lambda tks: [None if tid is None else idx.tokens_by_tid[tid] for tid in tks] qry = Query(query_string=querys, idx=idx) expected = [ None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted', None, None, 'modification', None ] assert [x for x in expected if x] == tks_as_str(qry.tokens) assert expected == tks_as_str(qry.tokens_with_unknowns()) assert 2 == len(qry.query_runs) qrun = qry.query_runs[0] expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted'] assert expected == tks_as_str(qrun.tokens) assert 0 == qrun.start assert 13 == qrun.end qrun = qry.query_runs[1] expected = ['modification'] assert expected == tks_as_str(qrun.tokens) assert 14 == qrun.start assert 14 == qrun.end
def test_QueryRun_repr(self): idx = index.LicenseIndex([Rule(_text='redistributions in binary form must redistributions in')]) qry = Query(query_string='redistributions in binary form must redistributions in', idx=idx) qruns = qry.query_runs qr = qruns[0] # test expected = 'QueryRun(start=0, len=7, start_line=1, end_line=1)' assert expected == repr(qr) expected = 'QueryRun(start=0, len=7, start_line=1, end_line=1, tokens="redistributions in binary form must redistributions in")' assert expected == qr.__repr__(trace_repr=True)
def test_query_tokens_are_same_for_different_text_formatting(self): test_files = [self.get_test_loc(f) for f in [ 'queryformat/license2.txt', 'queryformat/license3.txt', 'queryformat/license4.txt', 'queryformat/license5.txt', 'queryformat/license6.txt', ]] rule_file = self.get_test_loc('queryformat/license1.txt') idx = index.LicenseIndex([Rule(text_file=rule_file, licenses=['mit'])]) q = Query(location=rule_file, idx=idx) assert 1 == len(q.query_runs) expected = q.query_runs[0] for tf in test_files: q = Query(tf, idx=idx) qr = q.query_runs[0] assert expected.tokens == qr.tokens
def test_query_from_binary_lkms_2(self): location = self.get_test_loc('query/eeepc_acpi.ko') idx = cache.get_index() result = Query(location, idx=idx) assert len(result.query_runs) < 500 qrs = result.query_runs[:10] # for i, qr in enumerate(qrs): # print('qr:', i, # 'qr_text:', ' '.join(idx.tokens_by_tid[t] for t in qr.matchable_tokens())) assert any('license gpl' in ' '.join(idx.tokens_by_tid[t] for t in qr.matchable_tokens()) for qr in qrs)
def test_Query_tokens_by_line_from_string(self): rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted' rule = Rule(stored_text=rule_text, license_expression='bsd') legalese = set([ 'redistribution', 'form', ]) idx = index.LicenseIndex([rule], _legalese=legalese) querys = ''' The Redistribution and use in source and binary are permitted Athena capital of Grece Paris and Athene Always''' qry = Query(query_string=querys, idx=idx, _test_mode=True) result = list(qry.tokens_by_line()) expected = [ [], [None], [1, 2, 3, 4, 5, 2, 6, 12, 13], [], [None, None, None, None], [None, 2, None], [None], ] assert expected == result # convert tid to actual token strings qtbl_as_str = lambda qtbl: [[ None if tid is None else idx.tokens_by_tid[tid] for tid in tids ] for tids in qtbl] result_str = qtbl_as_str(result) expected_str = [ [], [None], [ 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted' ], [], [None, None, None, None], [None, 'and', None], [None], ] assert expected_str == result_str assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos idx = index.LicenseIndex( [Rule(stored_text=rule_text, license_expression='bsd')]) querys = 'and this is not a license' qry = Query(query_string=querys, idx=idx, _test_mode=True) result = list(qry.tokens_by_line()) expected = [['and', None, None, None, 'license']] assert expected == qtbl_as_str(result)
def test_Query_known_and_unknown_positions(self): rule_text = 'Redistribution and use in source and binary forms' idx = index.LicenseIndex( [Rule(stored_text=rule_text, license_expression='bsd')]) querys = 'The new Redistribution and use in other form always' qry = Query(query_string=querys, idx=idx, _test_mode=False) # we have only 4 known positions in this query, hence only 4 entries there on a single line # "Redistribution and use in" assert [1, 1, 1, 1] == qry.line_by_pos # this show our 4 known token in this query with their known position # "Redistribution and use in" assert [6, 0, 3, 5] == qry.tokens # the first two tokens are unknown, then starting after "in" we have three trailing unknown. assert { -1: 2, 3: 3, } == qry.unknowns_by_pos # This shows how knowns and unknowns are blended result = list(qry.tokens_with_unknowns()) expected = [ # The new None, None, # Redistribution 6, # and 0, # use 3, # in 5, # other form always' None, None, None ] assert result == expected
def test_Query_tokenize_from_string(self): rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted' idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])]) querys = ''' The Redistribution and use in source and binary are permitted. Athena capital of Grece Paris and Athene Always''' qry = Query(query_string=querys, idx=idx, _test_mode=True) qry.tokenize_and_build_runs(qry.tokens_by_line()) # convert tid to actual token strings tks_as_str = lambda tks: [None if tid is None else idx.tokens_by_tid[tid] for tid in tks] expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and'] result = tks_as_str(qry.tokens) assert expected == result expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and', None, None] result = tks_as_str(qry.tokens_with_unknowns()) assert expected == result assert 1 == len(qry.query_runs) qr1 = qry.query_runs[0] assert 0 == qr1.start assert 9 == qr1.end assert 10 == len(qr1) expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and'] result = tks_as_str(qr1.tokens) assert expected == result expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and'] result = tks_as_str(qr1.tokens_with_unknowns()) assert expected == result
def test_method(self): idx = cache.get_index() qry = Query(location=test_loc, idx=idx) results = [list(l) for l in qry.spdx_lines] if regen: with open(expected_loc, 'w') as ef: json.dump(results, ef, indent=2) expected = results else: with open(expected_loc) as ef: expected = json.load(ef) assert results == expected
def test_method(self): idx = cache.get_index() qry = Query(location=test_loc, idx=idx) results = [list(l) for l in qry.spdx_lines] if regen: with open(expected_loc, 'wb') as ef: json.dump(results, ef, indent=2) expected = results else: with open(expected_loc, 'rb') as ef: expected = json.load(ef, object_pairs_hook=OrderedDict) assert expected == results
def test_query_run_unknowns(self): legalese = set(['binary']) idx = index.LicenseIndex([Rule(stored_text='a is the binary')], _legalese=legalese) assert idx.dictionary == {'binary': 0, 'is': 1, 'the': 2} assert idx.len_legalese == 1 # multiple unknowns at start, middle and end q = Query(query_string='that new binary was sure a kind of the real mega deal', idx=idx) # known pos 0 1 2 # abs pos 0 1 2 3 4 5 6 7 8 9 10 11 expected = {-1: 2, 0: 4, 1: 3} assert dict(q.unknowns_by_pos) == expected
def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query( self): idx = index.LicenseIndex([Rule(stored_text='a is the binary')], _legalese=set(['binary']), _spdx_tokens=set()) q = Query(query_string='binary that was a binary', idx=idx) list(q.tokens_by_line()) assert q.unknowns_by_pos == {0: 2} assert q.stopwords_by_pos == {0: 1} assert not isinstance(q.unknowns_by_pos, defaultdict) assert not isinstance(q.stopwords_by_pos, defaultdict) try: q.unknowns_by_pos[1] assert q.unknowns_by_pos == {0: 2} except KeyError: pass try: q.stopwords_by_pos[1] assert q.stopwords_by_pos == {0: 1} except KeyError: pass
def test_query_run_unknowns(self): idx = index.LicenseIndex([Rule(stored_text='a is the binary')]) assert {u'binary': 0, u'is': 1, u'the': 2} == idx.dictionary assert 1 == idx.len_junk # multiple unknowns at start, middle and end q = Query(query_string= 'that new binary was sure a kind of the real mega deal', idx=idx) # known pos 0 1 2 # abs pos 0 1 2 3 4 5 6 7 8 9 10 11 expected = {-1: 2, 0: 4, 1: 3} assert expected == dict(q.unknowns_by_pos)
def test_method(self): idx = cache.get_index() qry = Query(location=test_loc, idx=idx) results = [list(l) for l in qry.spdx_lines] if regen: wmode = 'w' with open(expected_loc, wmode) as ef: json.dump(results, ef, indent=2) expected = results else: with open(expected_loc, 'rb') as ef: expected = json.load(ef, encoding='utf-8') assert expected == results
def test_query_runs_three_runs(self): idx = index.LicenseIndex(self.get_test_rules('index/bsd')) query_loc = self.get_test_loc('index/queryruns') qry = Query(location=query_loc, idx=idx) expected = [ {'end': 84, 'start': 0, 'tokens': u'the redistribution and use in ... 2 1 3 c 4'}, {'end': 97, 'start': 85, 'tokens': u'this software is provided by ... holders and contributors as is'}, {'end': 98, 'start': 98, 'tokens': u'redistributions'} ] result = [q.to_dict(brief=True) for q in qry.query_runs] assert expected == result
def test_query_runs_from_location(self): idx = index.LicenseIndex(self.get_test_rules('index/bsd')) query_loc = self.get_test_loc('index/querytokens') qry = Query(location=query_loc, idx=idx, line_threshold=3) result = [q.to_dict(brief=True) for q in qry.query_runs] expected = [ { 'start': 0, 'end': 35, 'tokens': u'redistribution and use in source ... holders and contributors as is'}, { 'start': 36, 'end': 36, 'tokens': u'redistributions'} ] assert expected == result
def test_Query_with_spdx_basic(self): idx = cache.get_index() querys = ''' * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT) * SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0 Always From uboot: the first two lines are patch-like: * SPDX-License-Identifier: GPL-2.0+ BSD-2-Clause ''' qry = Query(query_string=querys, idx=idx) expected = [ ('SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)', 0, 15), ('SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exception-2.0',16, 34), ('SPDX-License-Identifier: GPL-2.0+ BSD-2-Clause', 45, 53)] assert qry.spdx_lines == expected
def test_query_run_tokens_matchable(self): idx = cache.get_index() # NOTE: this is not a token present in any rules or licenses unknown_token = u'baridationally' assert unknown_token not in idx.dictionary query_s = u' '.join(u''' 3 unable to create proc entry license gpl description driver author eric depends 2 6 24 19 generic smp mod module acpi baridationally register driver proc acpi disabled acpi install notify acpi baridationally get status cache caches create proc entry baridationally generate proc event acpi evaluate object acpi remove notify remove proc entry acpi baridationally driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack pointer current stack pointer this module end usr src modules acpi include linux include asm include asm generic include acpi acpi c posix types 32 h types h types h h h h h '''.split()) result = Query(query_string=query_s, idx=idx) assert 1 == len(result.query_runs) qr = result.query_runs[0] expected_qr0 = u' '.join(u''' 3 unable to create proc entry license gpl description driver author eric depends 2 6 24 19 generic smp mod module acpi register driver proc acpi disabled acpi install notify acpi get status cache caches create proc entry generate proc event acpi evaluate object acpi remove notify remove proc entry acpi driver acpi acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack pointer current stack pointer this module end usr src modules acpi include linux include asm include asm generic include acpi acpi c posix types 32 h types h types h h h h h '''.split()) assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens) assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.matchables) # only gpl and gnu are is in high matchables expected = u'license gpl gnu gnu' assert expected == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.high_matchables)