def test_match_exact_from_string_twice_with_repeated_text(self):
    """
    Match the same query twice against a single-rule index and check that
    each run returns one match with the same texts and spans.
    """
    # the rule text holds eight tokens, numbered 0 to 7
    _stored_text = u'licensed under the GPL, licensed under the GPL'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = index.LicenseIndex([rule])

    # the query holds ten tokens, numbered 0 to 9
    querys = u'Hi licensed under the GPL, licensed under the GPL yes.'

    first_run = idx.match(query_string=querys)
    assert len(first_run) == 1
    first = first_run[0]
    qtext, itext = get_texts(first, query_string=querys, idx=idx)
    assert qtext == 'licensed under the GPL licensed under the GPL'
    assert itext == 'licensed under the gpl licensed under the gpl'
    assert first.qspan == Span(0, 7)
    assert first.ispan == Span(0, 7)

    # match again to ensure that there are no state side effects
    second_run = idx.match(query_string=querys)
    assert len(second_run) == 1
    second = second_run[0]
    assert second.qspan == Span(0, 7)
    assert second.ispan == Span(0, 7)
    qtext, itext = get_texts(second, query_string=querys, idx=idx)
    assert qtext == u'licensed under the GPL licensed under the GPL'
    assert itext == u'licensed under the gpl licensed under the gpl'
def test_match_exact_from_string_twice_with_repeated_text(self):
    """
    Match the same query twice against a single-rule mini index and check
    that each run returns one match with the same texts and spans.
    """
    # the rule text holds eight tokens, numbered 0 to 7
    _stored_text = u'licensed under the GPL, licensed under the GPL'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = MiniLicenseIndex([rule])

    # the query holds ten tokens, numbered 0 to 9
    querys = u'Hi licensed under the GPL, licensed under the GPL yes.'

    first_run = idx.match(query_string=querys)
    assert len(first_run) == 1
    first = first_run[0]
    qtext, itext = get_texts(first)
    assert qtext == 'licensed under the GPL, licensed under the GPL'
    assert itext == 'licensed under the gpl licensed under the gpl'
    assert first.qspan == Span(0, 7)
    assert first.ispan == Span(0, 7)

    # match again to ensure that there are no state side effects
    second_run = idx.match(query_string=querys)
    assert len(second_run) == 1
    second = second_run[0]
    assert second.qspan == Span(0, 7)
    assert second.ispan == Span(0, 7)
    qtext, itext = get_texts(second)
    assert qtext == u'licensed under the GPL, licensed under the GPL'
    assert itext == u'licensed under the gpl licensed under the gpl'
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    """
    Check that a rule repeating the same tokens on both sides of a template
    marker is detected as a single match with 100 coverage.
    """
    # the rule text holds ten tokens, numbered 0 to 9
    _stored_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = MiniLicenseIndex([rule])

    # the query holds fifteen tokens, numbered 0 to 14
    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def tks_as_str(tks):
        # convert tid to actual token strings
        return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

    # one entry per query position, 0 to 14; None stands for a token that has no token id
    expected = [
        None, None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        u'is', None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        None,
    ]
    assert tks_as_str(qry.tokens_with_unknowns()) == expected

    matches = idx.match(query_string=querys)
    assert len(matches) == 1

    match = matches[0]
    assert match.qspan == Span(0, 4) | Span(6, 10)
    assert match.ispan == Span(0, 9)
    assert match.coverage() == 100
    qtext, itext = get_texts(match)
    assert qtext == 'copyright reserved mit is license [is] [the] copyright reserved mit is license'
    assert itext == 'copyright reserved mit is license copyright reserved mit is license'
def test_match_can_match_with_simple_rule_template2(self):
    """
    Check that a query with two extra words ("Y CORP") in the middle of the
    rule text still yields a single match, and that the extra words are
    reported in brackets in the matched query text.
    """
    rule_text = u''' IN NO EVENT SHALL THE BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. '''
    rule = Rule(stored_text=rule_text, license_expression='x-consortium')
    idx = index.LicenseIndex([rule])

    query_string = u''' IN NO EVENT SHALL THE Y CORP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. '''

    found = idx.match(query_string=query_string)
    assert len(found) == 1
    match = found[0]
    qtext, itext = get_texts(match, query_string=query_string, idx=idx)

    expected_qtokens = u''' IN NO EVENT SHALL THE [Y] [CORP] BE LIABLE FOR ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE '''.split()
    expected_itokens = u''' IN NO EVENT SHALL THE BE LIABLE FOR ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE '''.lower().split()

    # compare token by token so that whitespace differences are ignored
    assert qtext.split() == expected_qtokens
    assert itext.split() == expected_itokens
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    """
    Check that a rule repeating the same token sequence twice is detected as
    a single match with 100 coverage.
    """
    from licensedcode_test_utils import query_tokens_with_unknowns  # NOQA

    # the rule text holds ten tokens, numbered 0 to 9
    _stored_text = 'copyright reserved mit is license, copyright reserved mit is license'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = MiniLicenseIndex([rule])

    # the query holds fifteen tokens, numbered 0 to 14
    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def tks_as_str(tks):
        # convert tid to actual token strings
        return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

    # one entry per query position, 0 to 14; None stands for a token that has no token id
    expected = [
        None, None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        u'is', None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        None,
    ]
    assert tks_as_str(query_tokens_with_unknowns(qry)) == expected

    matches = idx.match(query_string=querys)
    assert len(matches) == 1

    match = matches[0]
    assert match.qspan == Span(0, 4) | Span(6, 10)
    assert match.ispan == Span(0, 9)
    assert match.coverage() == 100
    qtext, itext = get_texts(match)
    assert qtext == 'copyright reserved mit is license [is] [the] copyright reserved mit is license'
    assert itext == 'copyright reserved mit is license copyright reserved mit is license'
def debug_matches(self, matches, message, location=None, query_string=None, with_text=False, qry=None):
    """
    Log debug-level data for a list of `matches` under the `message` heading.

    When `qry` is provided, the matched lines are set on the matches first.
    With `with_text`, the matched query and rule texts are logged after each
    match. `location` and `query_string` are accepted but not used here.
    """
    logger_debug(message + ':', len(matches))

    if qry:
        # set line early to ease debugging
        match.set_matched_lines(matches, qry.line_by_pos)

    if not with_text:
        # short form: one log entry per match and nothing else
        for m in matches:
            logger_debug(m)
        return

    logger_debug(message + ' MATCHED TEXTS')

    from licensedcode.tracing import get_texts

    for m in matches:
        logger_debug(m)
        query_text, rule_text = get_texts(m)
        logger_debug(' MATCHED QUERY TEXT:', query_text)
        logger_debug(' MATCHED RULE TEXT:', rule_text)
def debug_matches(self, matches, message, location=None, query_string=None, with_text=False, query=None):
    """
    Trace a list of `matches` under the `message` heading.

    Nothing is done unless TRACE or TRACE_NEGATIVE is set. When `query` is
    provided, lines are set on the matches first. Each match is logged when
    TRACE_MATCHES or TRACE_NEGATIVE is set. When `with_text` is requested and
    TRACE_MATCHES_TEXT or TRACE_NEGATIVE is set, the matched query and rule
    texts are also printed for each match.
    """
    if not (TRACE or TRACE_NEGATIVE):
        return

    logger_debug(message + ':', len(matches))
    if query:
        # set line early to ease debugging
        match.set_lines(matches, query.line_by_pos)

    if TRACE_MATCHES or TRACE_NEGATIVE:
        # use an explicit loop: on Python 3 map() is lazy and a bare
        # map(logger_debug, matches) would never log anything
        for m in matches:
            logger_debug(m)

    if (TRACE_MATCHES_TEXT or TRACE_NEGATIVE) and with_text:
        logger_debug(message + ' MATCHED TEXTS')

        from licensedcode.tracing import get_texts

        for m in matches:
            logger_debug(m)
            qt, it = get_texts(m, location, query_string, self)
            print(' MATCHED QUERY TEXT:', qt)
            print(' MATCHED RULE TEXT:', it)
            print()
def test_match_matches_correctly_simple_exact_query_across_query_runs(self):
    """
    Index the `mit.c` test file as a single rule and check that `mit3.c`
    yields one match with the expected matched query and rule tokens.
    """
    tf1 = self.get_test_loc('detect/mit/mit.c')
    ftr = Rule(text_file=tf1, license_expression='mit')
    idx = index.LicenseIndex([ftr])

    query_doc = self.get_test_loc('detect/mit/mit3.c')
    found = idx.match(query_doc)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match, location=query_doc, idx=idx)

    # both texts are compared token by token so that whitespace is ignored
    expected_qtext = u''' Permission is hereby granted free of charge to any person obtaining copy of this software and associated documentation files the Software to deal in THE SOFTWARE WITHOUT RESTRICTION INCLUDING WITHOUT LIMITATION THE RIGHTS TO USE COPY MODIFY MERGE PUBLISH DISTRIBUTE SUBLICENSE AND OR SELL COPIES of the Software and to permit persons to whom the Software is furnished to do so subject to the following conditions The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software '''.split()
    assert qtext.split() == expected_qtext

    expected_itext = u''' Permission is hereby granted free of charge to any person obtaining copy of this software and associated documentation files the Software to deal in the Software without restriction including without limitation the rights to use copy modify merge publish distribute sublicense and or sell copies of the Software and to permit persons to whom the Software is furnished to do so subject to the following conditions The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software '''.lower().split()
    assert itext.split() == expected_itext
def test_overlap_detection5(self):
    """
    Check that a query containing only the text of the inner rule is matched
    to that rule and not to the larger rule that contains it.
    """
    # test this containment relationship between test and index licenses:
    #   * Index licenses:
    #   +-license 2 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+
    #
    #   +-license 4 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+

    # setup index
    license1 = '''Redistribution and use permitted for MIT license.'''
    license2 = '''Redistributions of source must retain copyright. Redistribution and use permitted for MIT license. Redistributions in binary form is permitted.'''

    rule1 = Rule(stored_text=license1, license_expression='overlap')
    rule2 = Rule(stored_text=license2, license_expression='overlap')
    idx = index.LicenseIndex([rule1, rule2])

    querys = '''My source. Redistribution and use permitted for MIT license. My code.'''

    # test : querys contains license1: return license1 as exact coverage
    found = idx.match(query_string=querys)
    assert len(found) == 1

    match = found[0]
    assert match.rule == rule1
    qtext, _itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == 'Redistribution and use permitted for MIT license'
def test_match_seq_are_correct_on_apache(self):
    """
    Check that the `match_seq/query` test file yields a single sequence
    match against the `match_seq/rules` rules, with the expected matched
    query text.
    """
    rule_dir = self.get_test_loc('match_seq/rules')

    # extra words added to the mini legalese set for this index
    extra_legalese = set([
        'redistributions',
        'written',
        'registered',
        'derived',
        'damage',
        'due',
        'alternately',
        'nor',
    ])
    legalese = mini_legalese | extra_legalese

    idx = index.LicenseIndex(load_rules(rule_dir), _legalese=legalese)

    query_loc = self.get_test_loc('match_seq/query')
    found = idx.match(location=query_loc)
    assert len(found) == 1

    match = found[0]
    assert match.matcher == match_seq.MATCH_SEQ
    qtext, _itext = get_texts(match)

    # bracketed tokens are query tokens that are not part of the match
    expected = u''' The OpenSymphony Group. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: [4]. "[This] [product] [includes] [software] [developed] [by] [the] [OpenSymphony] [Group] ([http]://[www].[opensymphony].[com]/)." [5]. Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. The names "OpenSymphony" and "The OpenSymphony Group" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact [email protected] . Products derived from this software may not be called "OpenSymphony" or "[OsCore]", nor may "OpenSymphony" or "[OsCore]" appear in their name, without prior written permission of the OpenSymphony Group. 
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. '''

    # compare token by token so that whitespace differences are ignored
    assert qtext.split() == expected.split()
def closure_test_function(*args, **kwargs):
    """
    Detect licenses in the enclosing scope's `test_file` and check the
    detected license expressions against `expected_expressions`.

    With `regen` set, the detected expressions are stored on `license_test`
    (unless `expected_failure` is set), the test is dumped back and nothing
    is asserted. On a mismatch, a second assertion is made on a detailed
    trace so that the failure report shows the matches and related files.
    """
    idx = cache.get_index()
    matches = idx.match(location=test_file, min_score=0)
    if not matches:
        matches = []

    detected_expressions = [match.rule.license_expression for match in matches]

    # use detection as expected and dump test back
    if regen:
        if not expected_failure:
            license_test.license_expressions = detected_expressions
        # NOTE(review): dump() is assumed to run on every regen, expected
        # failure or not -- confirm against the original layout.
        license_test.dump()
        return

    try:
        assert expected_expressions == detected_expressions
    except AssertionError:
        # Only the assertion failure is intercepted; any other exception
        # propagates unchanged.
        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        results = expected_expressions + ['======================', '']
        failure_trace = detected_expressions[:] + ['======================', '']
        for match in matches:
            qtext, itext = get_texts(match)
            rule_text_file = match.rule.text_file
            if match.rule.is_license:
                rule_data_file = rule_text_file.replace('LICENSE', 'yml')
            else:
                rule_data_file = match.rule.data_file

            # the format() calls below look names up in locals(): keep the
            # local variable names in sync with the placeholders
            failure_trace.extend([
                '',
                '======= MATCH ====',
                repr(match),
                '======= Matched Query Text for:',
                'file://{test_file}'.format(**locals())
            ])
            if test_data_file:
                failure_trace.append('file://{test_data_file}'.format(**locals()))
            failure_trace.append('')
            failure_trace.append(qtext)
            failure_trace.extend([
                '',
                '======= Matched Rule Text for:',
                'file://{rule_text_file}'.format(**locals()),
                'file://{rule_data_file}'.format(**locals()),
                '',
                itext,
            ])

        if not matches:
            failure_trace.extend([
                '',
                '======= NO MATCH ====',
                '======= Not Matched Query Text for:',
                'file://{test_file}'.format(**locals())
            ])
            if test_data_file:
                failure_trace.append('file://{test_data_file}'.format(**locals()))

        # this assert will always fail and provide a detailed failure trace
        assert '\n'.join(results) == '\n'.join(failure_trace)
def test_match_in_binary_lkms_1(self):
    """
    Check that the `ath_pci.ko` test file yields a single match for the
    bsd-new and gpl-2.0 license keys, with the expected matched texts.
    """
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/ath_pci.ko')

    found = idx.match(location=qloc)
    assert len(found) == 1

    match = found[0]
    assert match.rule.license_keys() == ['bsd-new', 'gpl-2.0']

    qtext, itext = get_texts(match, location=qloc, idx=idx)
    assert qtext == 'license Dual BSD GPL'
    assert itext == 'license dual bsd gpl'
def test_match_in_binary_lkms_2(self):
    """
    Check that the `eeepc_acpi.ko` test file yields a single match for the
    gpl-1.0-plus license key, with the expected rule span and matched texts.
    """
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/eeepc_acpi.ko')

    found = idx.match(location=qloc)
    assert len(found) == 1

    match = found[0]
    assert match.rule.license_keys() == ['gpl-1.0-plus']
    assert match.ispan == Span(0, 1)

    qtext, itext = get_texts(match, location=qloc, idx=idx)
    assert qtext == 'license GPL'
    assert itext == 'license gpl'
def test_match_can_match_with_plain_rule_simple2(self):
    """
    Index the X11 license text as a single rule and check that the
    `x11-xconsortium_text.txt` test file yields one match with the expected
    matched query tokens.
    """
    rule_text = u'''X11 License Copyright (C) 1996 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. X Window System is a trademark of X Consortium, Inc. 
'''
    rule = Rule(stored_text=rule_text, license_expression='x-consortium')
    idx = index.LicenseIndex([rule])

    query_loc = self.get_test_loc('detect/simple_detection/x11-xconsortium_text.txt')
    found = idx.match(location=query_loc)
    assert len(found) == 1

    expected_qtext = u''' X11 License Copyright C 1996 X Consortium Permission is hereby granted free of charge to any person obtaining copy of this software and associated documentation files the Software to deal in the Software without restriction including without limitation the rights to use copy modify merge publish distribute sublicense and or sell copies of the Software and to permit persons to whom the Software is furnished to do so subject to the following conditions The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software THE SOFTWARE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY FITNESS FOR PARTICULAR PURPOSE AND NONINFRINGEMENT IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE Except as contained in this notice the name of the X Consortium shall not be used in advertising or otherwise to promote the sale use or other dealings in this Software without prior written authorization from the X Consortium X Window System is trademark of X Consortium Inc '''.split()

    match = found[0]
    qtext, _itext = get_texts(match, location=query_loc, idx=idx)
    # compare token by token so that whitespace differences are ignored
    assert qtext.split() == expected_qtext
def test_match_in_binary_lkms_3(self):
    """
    Check that the `wlan_xauth.ko` test file yields a single match for the
    bsd-new and gpl-2.0 license keys, with 100 coverage and score and the
    expected matched texts and rule span.
    """
    idx = cache.get_index()
    qloc = self.get_test_loc('positions/wlan_xauth.ko')

    found = idx.match(location=qloc)
    assert len(found) == 1

    match = found[0]
    assert match.rule.license_keys() == ['bsd-new', 'gpl-2.0']
    assert match.coverage() == 100
    assert match.score() == 100

    qtext, itext = get_texts(match, location=qloc, idx=idx)
    assert qtext == 'license Dual BSD GPL'
    assert itext == 'license dual bsd gpl'
    assert match.ispan == Span(0, 3)
def test_match_template_with_few_tokens_around_gaps_is_wholly_seq_matched(
        self):
    """
    Check that a query differing from the rule by a few inserted and missing
    words yields a single sequence match with at least 70 coverage and the
    expected matched query and rule tokens.
    """
    # was failing when a gapped token (from a template) starts at a
    # beginning of an index doc. We may still skip that, but capture a large match anyway.

    rule_text = u''' Copyright THIS IS FROM [[THE OLD CODEHAUS]] AND CONTRIBUTORS IN NO EVENT SHALL [[THE OLD CODEHAUS]] OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE [[POSSIBILITY OF NEW SUCH]] DAMAGE '''

    rule = Rule(stored_text=rule_text, license_expression='test')

    # extra words added to the mini legalese set for this index
    legalese = (mini_legalese | set([
        'copyright',
        'reserved',
        'advised',
        'liable',
        'damage',
        'contributors',
        'alternately',
        'possibility',
    ]))

    idx = index.LicenseIndex([rule], _legalese=legalese)

    querys = u''' Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. '''

    result = idx.match(query_string=querys)
    assert len(result) == 1
    match = result[0]
    assert match.matcher == match_seq.MATCH_SEQ

    exp_qtext = u""" Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. [THIS] [IS] [FROM] [THE] CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """.split()

    exp_itext = u""" Copyright <THIS> <IS> <FROM> <THE> <OLD> CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE <OLD> CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF <NEW> SUCH DAMAGE """.lower().split()

    qtext, itext = get_texts(match)
    # compare token by token so that whitespace differences are ignored
    assert qtext.split() == exp_qtext
    assert itext.split() == exp_itext
    assert match.coverage() >= 70
def test_overlap_detection1(self):
    """
    Check that, with four nested rules in the index, a query made of the
    smallest rule text plus one extra word is matched to that smallest rule.
    """
    # test this containment relationship between test and index licenses:
    #   * Index licenses:
    #   +-license 2 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+
    #
    #   * License texts to detect:
    #   +- license 3 -----------+
    #   | +-license 2 --------+ |
    #   | |  +-license 1 --+  | |
    #   | +-------------------+ |
    #   +-----------------------+
    #
    #   +-license 4 --------+
    #   |  +-license 1 --+  |
    #   +-------------------+

    # setup index
    license1 = '''Redistribution and use permitted.'''
    license2 = '''Redistributions of source must retain copyright. Redistribution and use permitted. Redistributions in binary form is permitted.'''
    license3 = ''' this license source Redistributions of source must retain copyright. Redistribution and use permitted. Redistributions in binary form is permitted. has a permitted license'''
    license4 = '''My Redistributions is permitted. Redistribution and use permitted. Use is permitted too.'''

    rule1 = Rule(stored_text=license1, license_expression='overlap')
    rule2 = Rule(stored_text=license2, license_expression='overlap')
    rule3 = Rule(stored_text=license3, license_expression='overlap')
    rule4 = Rule(stored_text=license4, license_expression='overlap')
    idx = index.LicenseIndex([rule1, rule2, rule3, rule4])

    querys = 'Redistribution and use bla permitted.'

    # test : license1 is in the index and contains no other rule. should return rule1 at exact coverage.
    found = idx.match(query_string=querys)
    assert len(found) == 1

    match = found[0]
    assert match.qspan == Span(0, 3)
    assert match.rule == rule1
    qtext, _itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == 'Redistribution and use [bla] permitted'
def test_match_return_correct_positions_with_short_index_and_queries(self):
    """
    Check matched texts and spans for a two-token rule against several
    short queries, including queries that contain the rule text twice.
    """
    idx = index.LicenseIndex(
        [Rule(stored_text='MIT License', license_expression='mit')]
    )
    assert idx.to_dict(True) == {'_tst_11_0': {'license': [1]}}

    # exact query
    found = idx.match(query_string='MIT License')
    assert len(found) == 1

    qtext, itext = get_texts(found[0], query_string='MIT License', idx=idx)
    assert qtext == 'MIT License'
    assert itext == 'mit license'
    assert found[0].qspan == Span(0, 1)
    assert found[0].ispan == Span(0, 1)

    # query with a repeated leading token
    found = idx.match(query_string='MIT MIT License')
    assert len(found) == 1

    qtext, itext = get_texts(found[0], query_string='MIT MIT License', idx=idx)
    assert qtext == 'MIT License'
    assert itext == 'mit license'
    assert found[0].qspan == Span(1, 2)
    assert found[0].ispan == Span(0, 1)

    # query containing the rule text twice: the asserted query spans put
    # "mit license" at positions 0-1 and "MIT License" at positions 2-3
    query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
    found = idx.match(query_string=query_doc1)
    assert len(found) == 2

    qtext, itext = get_texts(found[0], query_string=query_doc1, idx=idx)
    assert qtext == 'mit license'
    assert itext == 'mit license'
    assert found[0].qspan == Span(0, 1)
    assert found[0].ispan == Span(0, 1)

    qtext, itext = get_texts(found[1], query_string=query_doc1, idx=idx)
    assert qtext == 'MIT License'
    assert itext == 'mit license'
    assert found[1].qspan == Span(2, 3)
    assert found[1].ispan == Span(0, 1)

    # same as above, from a triple-quoted query string
    query_doc2 = '''do you think I am a mit license MIT License yes, I think so'''
    found = idx.match(query_string=query_doc2)
    assert len(found) == 2

    qtext, itext = get_texts(found[0], query_string=query_doc2, idx=idx)
    assert qtext == 'mit license'
    assert itext == 'mit license'
    assert found[0].qspan == Span(0, 1)
    assert found[0].ispan == Span(0, 1)

    qtext, itext = get_texts(found[1], query_string=query_doc2, idx=idx)
    assert qtext == 'MIT License'
    assert itext == 'mit license'
    assert found[1].qspan == Span(2, 3)
    assert found[1].ispan == Span(0, 1)
def test_match_exact_from_file(self):
    """
    Check that the `queryperfect-mini` test file yields a single match
    against the `index/mini` rules, with the expected texts and spans.
    """
    idx = index.LicenseIndex(self.get_test_rules('index/mini'))
    query_loc = self.get_test_loc('index/queryperfect-mini')

    found = idx.match(location=query_loc)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match, location=query_loc, idx=idx)
    assert qtext == 'Redistribution and use in source and binary forms with or without modification are permitted'
    assert itext == 'redistribution and use in source and binary forms with or without modification are permitted'

    assert match.qspan == Span(0, 13)
    assert match.ispan == Span(0, 13)
def test_match_exact_from_file(self):
    """
    Check that the `queryperfect-mini` test file yields a single match
    against the `index/mini` rules, with the expected texts and spans.
    """
    rules = self.get_test_rules('index/mini')
    idx = MiniLicenseIndex(rules)
    query_loc = self.get_test_loc('index/queryperfect-mini')

    found = idx.match(location=query_loc)
    assert len(found) == 1
    match = found[0]

    expected_qtext = 'Redistribution and use in source and binary forms, with or without modification,\nare permitted.'
    expected_itext = 'redistribution and use in source and binary forms with or without modification\nare permitted'
    qtext, itext = get_texts(match)
    assert qtext == expected_qtext
    assert itext == expected_itext

    assert match.qspan == Span(0, 13)
    assert match.ispan == Span(0, 13)
def test_match_exact_with_junk_in_between_good_tokens(self):
    """
    Check that a query with extra words interleaved between the rule tokens
    yields a single match, with the extra words reported in brackets.
    """
    _stored_text = u'licensed under the GPL, licensed under the GPL'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = MiniLicenseIndex([rule])

    querys = u'Hi licensed that under is the that GPL, licensed or under not the GPL by yes.'

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match)
    assert qtext == u'licensed [that] under [is] the [that] GPL, licensed [or] under [not] the GPL'
    assert itext == u'licensed under the gpl licensed under the gpl'
def check_rule_or_license_can_be_self_detected_exactly(rule):
    """
    Check that matching the text file of `rule` against the cached index
    (skipping the hash match) returns exactly one match, for this same rule,
    with a coverage of 100.

    On a mismatch, a second assertion is made on a detailed trace so that the
    failure report shows each match with its files and matched texts.
    """
    idx = cache.get_index()
    matches = idx.match(
        location=rule.text_file,
        _skip_hash_match=True,
        deadline=10,
    )
    expected = [rule.identifier, '100']
    results = flatten(
        (m.rule.identifier, str(int(m.coverage()))) for m in matches)

    try:
        assert results == expected
    except AssertionError:
        # Only the assertion failure is intercepted; any other exception
        # propagates unchanged.
        from licensedcode.tracing import get_texts

        data_file = rule.data_file
        if not data_file:
            data_file = rule.text_file.replace('.LICENSE', '.yml')
        text_file = rule.text_file

        # On failure, we compare again to get additional failure details such as
        # a clickable text_file path
        failure_trace = ['======= TEST ====']
        failure_trace.extend(results)
        failure_trace.extend([
            '',
            f'file://{data_file}',
            f'file://{text_file}',
            '======================',
        ])

        for i, match in enumerate(matches):
            qtext, itext = get_texts(match)
            m_text_file = match.rule.text_file

            if match.rule.is_from_license:
                # replace the whole ".LICENSE" extension, as done above for
                # the tested rule: replacing only "LICENSE" left a doubled
                # dot ("..yml") in the data file path
                m_data_file = m_text_file.replace('.LICENSE', '.yml')
            else:
                m_data_file = match.rule.data_file

            failure_trace.extend([
                '',
                f'======= MATCH {i} ====',
                repr(match),
                f'file://{m_data_file}',
                f'file://{m_text_file}',
                '======= Matched Query Text:',
                '',
                qtext,
                '',
                '======= Matched Rule Text:',
                '',
                itext
            ])

        # this assert will always fail and provide a detailed failure trace
        assert '\n'.join(failure_trace) == '\n'.join(expected)
def test_spdx_match_contains_spdx_prefix(self):
    """
    Check that an SPDX license identifier line yields a single match whose
    matched query text includes the "SPDX-license-identifier:" prefix.
    """
    from licensedcode import index
    from licensedcode import tracing

    rule_dir = self.get_test_loc('spdx/rules-overlap/rules')
    lics_dir = self.get_test_loc('spdx/rules-overlap/licenses')
    rules = models.get_rules(lics_dir, rule_dir)
    idx = index.LicenseIndex(rules)

    querys = 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
    found = idx.match(query_string=querys)
    assert len(found) == 1

    qtext, itext = tracing.get_texts(found[0])

    expected_qtext = 'SPDX-license-identifier: BSD-3-Clause-No-Nuclear-Warranty'
    expected_itext = 'spdx license identifier bsd 3 clause no nuclear warranty'
    assert qtext == expected_qtext
    assert itext == expected_itext
def closure_test_function(*args, **kwargs):
    """
    Detect licenses in the enclosing scope's `test_file` and check the
    detected license expressions against `expected_expressions`.

    With `regen` set, the detected expressions are stored on `license_test`
    (unless `expected_failure` is set), the test is dumped back and nothing
    is asserted. On a mismatch, a second assertion is made on a detailed
    trace so that the failure report shows the matches and related files.
    """
    idx = cache.get_index()
    matches = idx.match(location=test_file, min_score=0)
    if not matches:
        matches = []

    detected_expressions = [
        match.rule.license_expression for match in matches
    ]

    # use detection as expected and dump test back
    if regen:
        if not expected_failure:
            license_test.license_expressions = detected_expressions
        # NOTE(review): dump() is assumed to run on every regen, expected
        # failure or not -- confirm against the original layout.
        license_test.dump()
        return

    try:
        assert expected_expressions == detected_expressions
    except AssertionError:
        # Only the assertion failure is intercepted; any other exception
        # propagates unchanged.
        # On failure, we compare against more result data to get additional
        # failure details, including the test_file and full match details
        failure_trace = detected_expressions[:]
        failure_trace.extend([test_name, 'test file: file://' + test_file])

        for match in matches:
            qtext, itext = get_texts(match, location=test_file, idx=idx)
            rule_text_file = match.rule.text_file
            rule_data_file = match.rule.data_file

            # the format() calls below look names up in locals(): keep the
            # local variable names in sync with the placeholders
            failure_trace.extend([
                '',
                '',
                '======= MATCH ====',
                match,
                '======= Matched Query Text for:',
                'file://{test_file}'.format(**locals())
            ])
            if test_data_file:
                failure_trace.append(
                    'file://{test_data_file}'.format(**locals()))

            failure_trace.append(qtext.splitlines())
            failure_trace.extend([
                '',
                # the comma after this heading was missing, which fused the
                # heading with the rule text file URL into a single entry
                '======= Matched Rule Text for:',
                'file://{rule_text_file}'.format(**locals()),
                'file://{rule_data_file}'.format(**locals()),
                itext.splitlines(),
            ])

        # this assert will always fail and provide a detailed failure trace
        assert expected_expressions == failure_trace
def test_match_return_one_match_with_correct_offsets(self):
    """
    Check that a query made of two extra leading words followed by the rule
    text yields a single match with the expected texts and spans.
    """
    rule = Rule(stored_text='A one. a license two. A three.', license_expression='abc')
    idx = index.LicenseIndex([rule])

    # the query holds nine words, numbered 0 to 8
    querys = u'some junk. A one. A license two. A three.'

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == 'one license two three'
    assert itext == 'one license two three'

    assert match.qspan == Span(0, 3)
    assert match.ispan == Span(0, 3)
def test_match_works_for_apache_rule(self):
    """
    Check that a short Apache license notice yields a single exact match
    for the `apache-2.0_212.RULE` rule, with the expected matched query text
    and lines.
    """
    idx = cache.get_index()
    querys = u'''I am not a license. The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt '''

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    assert match.rule.identifier == 'apache-2.0_212.RULE'
    assert match.matcher == match_aho.MATCH_AHO_EXACT

    qtext, _itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == u'license The Apache Software License Version 2 0 http www apache org licenses LICENSE 2 0 txt'
    assert match.lines() == (1, 4)
def test_match_template_with_few_tokens_around_gaps_is_wholly_seq_matched(
        self):
    """
    Check that a query differing from the templated rule by a few inserted
    and missing words yields a single sequence match with at least 90
    coverage and the expected matched query and rule tokens.
    """
    # was failing when a gapped token (from a template) starts at a
    # beginning of an index doc. We may still skip that, but capture a large match anyway.

    rule_text = u''' Copyright THIS IS FROM {{THE OLD CODEHAUS}} AND CONTRIBUTORS IN NO EVENT SHALL {{THE OLD CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE {{POSSIBILITY OF NEW SUCH}} DAMAGE '''

    rule = Rule(stored_text=rule_text, license_expression='test')
    idx = index.LicenseIndex([rule])

    querys = u''' Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. '''

    result = idx.match(query_string=querys)
    assert 1 == len(result)
    match = result[0]
    assert match_seq.MATCH_SEQ == match.matcher

    exp_qtext = u""" Copyright [2003] [C] [James] [All] [Rights] [Reserved] THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE """.split()

    exp_itext = u""" Copyright THIS IS FROM THE <OLD> CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE <OLD> CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF <NEW> SUCH DAMAGE """.lower().split()

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    # compare token by token so that whitespace differences are ignored
    assert exp_qtext == qtext.split()
    assert exp_itext == itext.split()
    assert 90 <= match.coverage()
def test_match_exact_from_string_once(self):
    """
    Check that a query containing the rule text between extra words yields a
    single match with the expected texts and spans.
    """
    rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
    rule = models.Rule(stored_text=rule_text, license_expression='bsd')
    idx = MiniLicenseIndex([rule])

    querys = ''' The Redistribution and use in source and binary forms, with or without modification, are permitted. Always'''

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match)
    assert qtext == 'Redistribution and use in source and binary forms, with or without modification,\nare permitted.'
    assert itext == 'redistribution and use in source and binary forms with or without modification\nare permitted'

    assert match.qspan == Span(0, 13)
    assert match.ispan == Span(0, 13)
def test_match_return_correct_offsets(self):
    """
    Check that a query made of two extra leading words followed by the rule
    text yields a single match with the expected texts and spans.
    """
    # the rule text holds six tokens, numbered 0 to 5
    _stored_text = u'A GPL. A MIT. A LGPL.'
    license_expression = 'tst'
    rule = models.Rule(license_expression=license_expression, stored_text=_stored_text)
    idx = index.LicenseIndex([rule])

    # the query holds eight tokens, numbered 0 to 7
    querys = u'some junk. A GPL. A MIT. A LGPL.'

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    assert qtext == 'A GPL A MIT A LGPL'
    assert itext == 'A GPL A MIT A LGPL'

    assert match.qspan == Span(0, 5)
    assert match.ispan == Span(0, 5)
def test_match_can_match_with_sax_rule_for_public_domain(self):
    """
    Check that the SAX public domain notice yields a single match against a
    shorter public domain rule, with the expected matched tokens, coverage,
    score and spans.
    """
    test_text = ''' I hereby abandon any property rights to , and release all of source code, compiled code, and documentation contained in this distribution into the Public Domain. '''
    rule = Rule(stored_text=test_text, license_expression='public-domain')
    idx = index.LicenseIndex([rule])

    querys = ''' SAX2 is Free! I hereby abandon any property rights to SAX 2.0 (the Simple API for XML), and release all of the SAX 2.0 source code, compiled code, and documentation contained in this distribution into the Public Domain. SAX comes with NO WARRANTY or guarantee of fitness for any purpose. SAX2 is Free! '''

    found = idx.match(query_string=querys)
    assert len(found) == 1
    match = found[0]

    qtext, itext = get_texts(match, query_string=querys, idx=idx)

    # both texts are compared token by token so that whitespace is ignored
    expected_qtext = u''' I hereby abandon any property rights to [SAX] [2] [0] <the> [Simple] [API] [for] [XML] <and> <release> <all> <of> <the> [SAX] [2] [0] source code compiled code and documentation contained in this distribution into the Public Domain '''.split()
    assert qtext.split() == expected_qtext

    expected_itext = u''' I hereby abandon any property rights to <and> <release> <all> <of> source code compiled code and documentation contained in this distribution into the Public Domain '''.lower().split()
    assert itext.split() == expected_itext

    assert match.coverage() == 84
    assert match.score() == 84
    assert match.qspan == Span(0, 6) | Span(13, 26)
    assert match.ispan == Span(0, 6) | Span(11, 24)