def test_Query_tokenize_from_string(self):
        # Tokenize a multi-line query string against a one-rule index and
        # check the known tokens, the tokens blended with None entries, and
        # the single query run that is built.
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        bsd_rule = Rule(_text=rule_text, licenses=['bsd'])
        idx = index.LicenseIndex([bsd_rule])
        querys = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        qry.tokenize_and_build_runs(qry.tokens_by_line())

        def tks_as_str(tks):
            # convert token ids to their token strings; None stays None
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

        # tokens of the query that map to a token id in the index
        known_words = [
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted',
            'and',
        ]
        assert known_words == tks_as_str(qry.tokens)

        # same tokens with None placeholders at the other positions
        blended_words = [
            None,
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted',
            None, None, None, None, None,
            'and',
            None, None,
        ]
        assert blended_words == tks_as_str(qry.tokens_with_unknowns())

        runs = qry.query_runs
        assert 1 == len(runs)
        qr1 = runs[0]
        assert 0 == qr1.start
        assert 9 == qr1.end
        assert 10 == len(qr1)
        assert known_words == tks_as_str(qr1.tokens)

        # the run-level blended view has no None after its last token
        run_blended_words = [
            None,
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted',
            None, None, None, None, None,
            'and',
        ]
        assert run_blended_words == tks_as_str(qr1.tokens_with_unknowns())
예제 #2
0
    def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
        # Check the index content, the query tokens and the single match
        # obtained for a templated rule whose text repeats the same tokens on
        # both sides of its {{}} marker.
        _text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
        #                 0        1  2   3       4               5        6   7  8       9
        rule = models.Rule(licenses=['tst'], _text=_text)
        idx = index.LicenseIndex([rule])
        indexed_positions = {
            '_tst_73_0': {
                u'copyright': [0, 5],
                u'license': [4, 9],
                u'mit': [2, 7],
            },
        }
        assert indexed_positions == idx.to_dict()

        querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
        #           0  1         2        3   4  5       6  7   8         9       10  11 12      13  14
        qry = Query(query_string=querys, idx=idx)

        def tks_as_str(tks):
            # convert token ids to their token strings; None stays None
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

        expected = [
            None, None,  # 0-1: Hi my
            u'copyright', u'reserved', u'mit', u'is', u'license',  # 2-6
            u'is', None,  # 7-8: is the
            u'copyright', u'reserved', u'mit', u'is', u'license',  # 9-13
            None,  # 14: yes
        ]
        assert expected == tks_as_str(qry.tokens_with_unknowns())

        result = idx.match(query_string=querys)
        assert 1 == len(result)

        match = result[0]
        expected_qspan = Span(0, 4) | Span(6, 10)
        assert expected_qspan == match.qspan
        assert Span(0, 9) == match.ispan
        assert 100 == match.coverage()

        qtext, itext = get_texts(match, query_string=querys, idx=idx)
        expected_qtext = 'copyright reserved mit is license <is> [the] copyright reserved mit is license'
        expected_itext = 'copyright reserved mit is license copyright reserved mit is license'
        assert expected_qtext == qtext
        assert expected_itext == itext
예제 #3
0
    def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
        # A templated rule repeating the same tokens around its {{}} marker:
        # verify what gets indexed, how the query is tokenized, and that
        # matching returns one match only.
        _text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
        #                 0        1  2   3       4               5        6   7  8       9
        tst_rule = models.Rule(licenses=['tst'], _text=_text)
        idx = index.LicenseIndex([tst_rule])
        assert {
            '_tst_73_0': {
                u'copyright': [0, 5],
                u'license': [4, 9],
                u'mit': [2, 7],
            },
        } == idx.to_dict()

        querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
        #           0  1         2        3   4  5       6  7   8         9       10  11 12      13  14
        qry = Query(query_string=querys, idx=idx)

        def tks_as_str(tks):
            # convert tid to actual token strings, leaving None untouched
            strings = []
            for tid in tks:
                strings.append(None if tid is None else idx.tokens_by_tid[tid])
            return strings

        license_words = [u'copyright', u'reserved', u'mit', u'is', u'license']
        # query positions: 0-1 None, 2-6 words, 7 'is', 8 None, 9-13 words, 14 None
        expected = [None, None] + license_words + [u'is', None] + license_words + [None]
        assert expected == tks_as_str(qry.tokens_with_unknowns())

        matches = idx.match(query_string=querys)
        assert 1 == len(matches)

        match = matches[0]
        assert Span(0, 4) | Span(6, 10) == match.qspan
        assert Span(0, 9) == match.ispan
        assert 100 == match.coverage()

        texts = get_texts(match, query_string=querys, idx=idx)
        qtext, itext = texts
        assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext
        assert 'copyright reserved mit is license copyright reserved mit is license' == itext
예제 #4
0
    def test_QueryRun_does_not_end_with_None(self):
        # Check the tokens and the start/end positions of the two query runs
        # built from a query that mixes rule words, other words and blank
        # lines.
        rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
        bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
        idx = index.LicenseIndex([bsd_rule])

        querys = '''
            The
            Redistribution and use in source and binary forms, with or without modification, are permitted.

            Always



            bar
             modification
             foo
            '''

        def tks_as_str(tks):
            # convert token ids to their token strings; None stays None
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

        qry = Query(query_string=querys, idx=idx)

        first_run_words = [
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'forms', 'with', 'or', 'without', 'modification', 'are',
            'permitted',
        ]
        second_run_words = ['modification']

        expected = [None] + first_run_words + [None, None] + second_run_words + [None]
        known_only = [x for x in expected if x]
        assert known_only == tks_as_str(qry.tokens)
        assert expected == tks_as_str(qry.tokens_with_unknowns())

        assert 2 == len(qry.query_runs)

        # first run: positions 0 to 13
        qrun = qry.query_runs[0]
        assert first_run_words == tks_as_str(qrun.tokens)
        assert 0 == qrun.start
        assert 13 == qrun.end

        # second run: the lone token at position 14
        qrun = qry.query_runs[1]
        assert second_run_words == tks_as_str(qrun.tokens)
        assert 14 == qrun.start
        assert 14 == qrun.end
예제 #5
0
    def test_QueryRun_does_not_end_with_None(self):
        # Build a query with rule words surrounded by other words and blank
        # lines, then verify the two resulting query runs: their tokens and
        # their start and end positions.
        rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
        rules = [Rule(_text=rule_text, licenses=['bsd'])]
        idx = index.LicenseIndex(rules)

        querys = '''
            The
            Redistribution and use in source and binary forms, with or without modification, are permitted.

            Always



            bar
             modification
             foo
            '''

        def tks_as_str(tks):
            # convert tid to actual token strings, leaving None untouched
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

        qry = Query(query_string=querys, idx=idx)
        expected = [
            None,
            'redistribution', 'and', 'use', 'in',
            'source', 'and', 'binary', 'forms',
            'with', 'or', 'without', 'modification',
            'are', 'permitted',
            None,
            None,
            'modification',
            None,
        ]
        without_nones = [x for x in expected if x]
        assert without_nones == tks_as_str(qry.tokens)
        assert expected == tks_as_str(qry.tokens_with_unknowns())

        assert 2 == len(qry.query_runs)

        run_one = qry.query_runs[0]
        expected = [
            'redistribution', 'and', 'use', 'in',
            'source', 'and', 'binary', 'forms',
            'with', 'or', 'without', 'modification',
            'are', 'permitted',
        ]
        assert expected == tks_as_str(run_one.tokens)
        assert 0 == run_one.start
        assert 13 == run_one.end

        run_two = qry.query_runs[1]
        expected = ['modification']
        assert expected == tks_as_str(run_two.tokens)
        assert 14 == run_two.start
        assert 14 == run_two.end
예제 #6
0
    def test_Query_known_and_unknown_positions(self):
        # Check how known and unknown query tokens are tracked by position
        # when the index is built with an explicit legalese set.
        rule_text = 'Redistribution and use in source and binary forms'
        rule = Rule(stored_text=rule_text, license_expression='bsd')
        legalese = {
            'redistribution',
            'form',
        }
        idx = index.LicenseIndex([rule], _legalese=legalese)

        querys = 'The new Redistribution and use in other form always'
        qry = Query(query_string=querys, idx=idx, _test_mode=False)
        # we have 5 known positions in this query, hence 5 entries there, all
        # on a single line: "Redistribution and use in" and "form".
        # NOTE(review): "form" is not in the rule text ("forms" is); it is
        # presumably known because it is in the legalese set -- confirm.
        assert [1, 1, 1, 1, 1] == qry.line_by_pos

        # this shows our 5 known tokens in this query in known position order:
        # "Redistribution and use in form"
        assert [1, 2, 3, 4, 0] == qry.tokens

        # the first two tokens ("The new") are unknown and are counted under
        # the -1 key; one unknown ("other") follows known position 3 ("in")
        # and one unknown ("always") follows known position 4 ("form").
        assert {3: 1, 4: 1, -1: 2} == qry.unknowns_by_pos

        # This shows how knowns and unknowns are blended
        result = list(qry.tokens_with_unknowns())
        expected = [
            # The  new
            None,
            None,
            # Redistribution
            1,
            # and
            2,
            # use
            3,
            # in
            4,
            # other
            None,
            # form
            0,
            # always
            None,
        ]
        assert expected == result
예제 #7
0
    def test_Query_known_and_unknown_positions(self):
        # Check how known and unknown query tokens are tracked by position
        # for a query that has unknown words before and after the known ones.
        rule_text = 'Redistribution and use in source and binary forms'
        bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
        idx = index.LicenseIndex([bsd_rule])

        querys = 'The new Redistribution and use in other form always'
        qry = Query(query_string=querys, idx=idx, _test_mode=False)

        # only "Redistribution and use in" are known: four positions, and
        # therefore four entries, all on one line
        expected_lines = [1, 1, 1, 1]
        assert expected_lines == qry.line_by_pos

        # token ids of the four known tokens, in known position order
        expected_tids = [6, 0, 3, 5]
        assert expected_tids == qry.tokens

        # two unknowns come before the first known token (the -1 key) and
        # three unknowns trail known position 3 ("in")
        expected_unknowns = {-1: 2, 3: 3}
        assert expected_unknowns == qry.unknowns_by_pos

        # knowns and unknowns blended together:
        #   The, new                     -> None, None
        #   Redistribution, and, use, in -> 6, 0, 3, 5
        #   other, form, always          -> None, None, None
        result = list(qry.tokens_with_unknowns())
        expected = [None, None, 6, 0, 3, 5, None, None, None]
        assert result == expected
    def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
        # Check the query tokens and the single match obtained for a templated
        # rule whose text repeats the same tokens on both sides of its {{}}
        # marker.
        _stored_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
        #                 0        1  2   3       4               5        6   7  8       9
        rule = models.Rule(license_expression='tst', stored_text=_stored_text)
        idx = MiniLicenseIndex([rule])

        querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
        #           0  1         2        3   4  5       6  7   8         9       10  11 12      13  14
        qry = Query(query_string=querys, idx=idx)

        def tks_as_str(tks):
            # convert token ids to their token strings; None stays None
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tks]

        expected = [
            None, None,  # 0-1: Hi my
            u'copyright', u'reserved', u'mit', u'is', u'license',  # 2-6
            u'is', None,  # 7-8: is the
            u'copyright', u'reserved', u'mit', u'is', u'license',  # 9-13
            None,  # 14: yes
        ]
        assert tks_as_str(qry.tokens_with_unknowns()) == expected

        result = idx.match(query_string=querys)
        assert len(result) == 1

        match = result[0]
        expected_qspan = Span(0, 4) | Span(6, 10)
        assert match.qspan == expected_qspan
        assert match.ispan == Span(0, 9)
        assert match.coverage() == 100

        qtext, itext = get_texts(match)
        expected_qtext = 'copyright reserved mit is license [is] [the] copyright reserved mit is license'
        expected_itext = 'copyright reserved mit is license copyright reserved mit is license'
        assert qtext == expected_qtext
        assert itext == expected_itext