Пример #1
0
def test_get_similarity():
    """Exercise ``MinHashIndex.get_similarity`` on small hand-built inputs.

    The first index is built with ``1`` as its third constructor argument
    and is compared using single-element lists; the second is built with
    ``2`` and is compared using two-element lists. The final two cases
    expect an ``AssertionError`` for ``range`` inputs of length 10/10 and
    1/10 (presumably because their lengths do not match the index
    configuration -- TODO confirm against ``MinHashIndex``).
    """
    index = MinHashIndex(None, 0xFFFF, 1, 1)

    # Half of the right-hand weight sits on the shared key 'a'.
    assert index.get_similarity(
        [{'a': 1}],
        [{'a': 0.5, 'b': 0.25, 'c': 0.25}],
    ) == 0.5

    # No keys in common.
    assert index.get_similarity(
        [{'a': 1}],
        [{'b': 0.5, 'c': 0.5}],
    ) == 0

    index = MinHashIndex(None, 0xFFFF, 2, 1)

    # Both entries identical.
    assert index.get_similarity(
        [
            {'a': 1},
            {'a': 1},
        ],
        [
            {'a': 1},
            {'a': 1},
        ],
    ) == 1.0

    # Both entries disjoint.
    assert index.get_similarity(
        [
            {'a': 1},
            {'a': 1},
        ],
        [
            {'b': 1},
            {'b': 1},
        ],
    ) == 0

    # One of the two entries matches.
    assert index.get_similarity(
        [
            {'a': 1},
            {'b': 1},
        ],
        [
            {'b': 1},
            {'b': 1},
        ],
    ) == 0.5

    # Call the method directly inside ``pytest.raises``: wrapping the call
    # in ``assert`` would let a falsy return value (such as 0) raise the
    # AssertionError from the test itself, so the check could pass even if
    # ``get_similarity`` never rejected the input.
    with pytest.raises(AssertionError):
        index.get_similarity(
            range(10),
            range(10),
        )

    with pytest.raises(AssertionError):
        index.get_similarity(
            range(1),
            range(10),
        )
Пример #2
0
    def test_index(self):
        """Record a handful of texts and check the ordering of a query.

        Keys '1' and '2' receive identical text and are expected first with
        a score of 1.0; keys '3', '4' and '5' are expected to follow in
        that order.
        """
        cluster = redis.clusters.get('default')
        index = MinHashIndex(cluster, 0xFFFF, 8, 2)

        # Key '4' is intentionally recorded twice with different text.
        recordings = (
            ('1', 'hello world'),
            ('2', 'hello world'),
            ('3', 'jello world'),
            ('4', 'yellow world'),
            ('4', 'mellow world'),
            ('5', 'pizza world'),
        )
        for key, text in recordings:
            index.record('example', key, text)

        results = index.query('example', '1')

        # The queried key and its exact duplicate both score 1.0.
        assert results[0] == ('1', 1.0)
        assert results[1] == ('2', 1.0)

        # Only the key (not the score) is checked for the remaining entries.
        for position, expected_key in enumerate(('3', '4', '5'), 2):
            assert results[position][0] == expected_key
Пример #3
0
    def test_export_import(self):
        """Export one key's data, import it into another key, and compare.

        After a single import the export of key 2 must equal the export of
        key 1; after importing the same payload a second time, the summed
        bucket frequencies per band for key 2 must be 2 instead of 1.
        """
        bands = 2
        retention = 12
        cluster = redis.clusters.get('default')
        index = MinHashIndex(cluster, 0xFFFF, bands, 2, 60 * 60, retention)

        def check_export(payloads, expected_total):
            # One payload per requested (index, key) pair.
            assert len(payloads) == 1

            unpacked = msgpack.unpackb(payloads[0])
            assert len(unpacked) == bands

            for band in unpacked:
                assert len(band) == (retention + 1)
                # Add up every bucket frequency across the band's entries.
                total = 0
                for _, bucket_frequencies in band:
                    total += sum(dict(bucket_frequencies).values())
                assert total == expected_total

        index.record('example', '1', [('index', 'hello world')])

        timestamp = int(time.time())
        exported = index.export('example', [('index', 1)], timestamp=timestamp)
        check_export(exported, 1)
        payload = exported[0]

        # Copy the data from key 1 to key 2.
        index.import_('example', [('index', 2, payload)], timestamp=timestamp)

        source_export = index.export(
            'example', [('index', 1)], timestamp=timestamp)
        copy_export = index.export(
            'example', [('index', 2)], timestamp=timestamp)
        assert source_export == copy_export

        # Copy the data again to key 2 (duplicating all of the data.)
        index.import_('example', [('index', 2, payload)], timestamp=timestamp)

        doubled = index.export('example', [('index', 2)], timestamp=timestamp)
        check_export(doubled, 2)
Пример #4
0
    def test_index(self):
        """Record texts under the 'index' label, query, delete, and re-query.

        Keys '1' and '2' share identical text and must come first with a
        score of 1.0. Keys '3' and '4' may appear in either order, followed
        by '5'. After deleting key '3', a fresh query must list the
        remaining keys in the order '1', '2', '4', '5'.
        """
        cluster = redis.clusters.get('default')
        index = MinHashIndex(cluster, 0xFFFF, 8, 2, 60 * 60, 12)

        # Key '4' is intentionally recorded twice with different text.
        recordings = (
            ('1', 'hello world'),
            ('2', 'hello world'),
            ('3', 'jello world'),
            ('4', 'yellow world'),
            ('4', 'mellow world'),
            ('5', 'pizza world'),
        )
        for key, text in recordings:
            index.record('example', key, [('index', text)])

        results = index.query('example', '1', ['index'])[0]
        assert results[0] == ('1', 1.0)
        assert results[1] == ('2', 1.0)  # identical contents

        # equidistant pairs, order doesn't really matter
        tied_keys = ('3', '4')
        assert results[2][0] in tied_keys
        assert results[3][0] in tied_keys
        assert results[4][0] == '5'

        index.delete('example', [('index', '3')])
        after_delete = index.query('example', '1', ['index'])[0]
        remaining_keys = [key for key, _ in after_delete]
        assert remaining_keys == ['1', '2', '4', '5']
Пример #5
0
def test_get_similarity():
    """Exercise ``MinHashIndex.get_similarity`` on small hand-built inputs.

    The first index is built with ``1`` as its third constructor argument
    and is compared using single-element lists; the second is built with
    ``2`` and is compared using two-element lists. The final two cases
    expect an ``AssertionError`` for ``range`` inputs of length 10/10 and
    1/10 (presumably because their lengths do not match the index
    configuration -- TODO confirm against ``MinHashIndex``).
    """
    index = MinHashIndex(None, 0xFFFF, 1, 1)

    # Half of the right-hand weight sits on the shared key 'a'.
    assert index.get_similarity(
        [{'a': 1}],
        [{'a': 0.5, 'b': 0.25, 'c': 0.25}],
    ) == 0.5

    # No keys in common.
    assert index.get_similarity(
        [{'a': 1}],
        [{'b': 0.5, 'c': 0.5}],
    ) == 0

    index = MinHashIndex(None, 0xFFFF, 2, 1)

    # Both entries identical.
    assert index.get_similarity(
        [{'a': 1}, {'a': 1}],
        [{'a': 1}, {'a': 1}],
    ) == 1.0

    # Both entries disjoint.
    assert index.get_similarity(
        [{'a': 1}, {'a': 1}],
        [{'b': 1}, {'b': 1}],
    ) == 0

    # One of the two entries matches.
    assert index.get_similarity(
        [{'a': 1}, {'b': 1}],
        [{'b': 1}, {'b': 1}],
    ) == 0.5

    # Call the method directly inside ``pytest.raises``: wrapping the call
    # in ``assert`` would let a falsy return value (such as 0) raise the
    # AssertionError from the test itself, so the check could pass even if
    # ``get_similarity`` never rejected the input.
    with pytest.raises(AssertionError):
        index.get_similarity(
            range(10),
            range(10),
        )

    with pytest.raises(AssertionError):
        index.get_similarity(
            range(1),
            range(10),
        )
Пример #6
0
    def test_index(self):
        """Record several texts and verify the result order of a query.

        The query for key '1' must return '1' and '2' (same text) first,
        each with a score of 1.0, then keys '3', '4' and '5' in that order.
        """
        index = MinHashIndex(redis.clusters.get('default'), 0xFFFF, 8, 2)

        # (key, text) pairs in recording order; key '4' appears twice.
        samples = [
            ('1', 'hello world'),
            ('2', 'hello world'),
            ('3', 'jello world'),
            ('4', 'yellow world'),
            ('4', 'mellow world'),
            ('5', 'pizza world'),
        ]
        for sample_key, sample_text in samples:
            index.record('example', sample_key, sample_text)

        results = index.query('example', '1')

        # identical contents => both entries carry a perfect score
        assert results[0] == ('1', 1.0)
        assert results[1] == ('2', 1.0)

        # For the tail only the key of each entry is compared.
        expected_tail = {2: '3', 3: '4', 4: '5'}
        for position in sorted(expected_tail):
            assert results[position][0] == expected_tail[position]
Пример #7
0
    def test_export_import(self):
        """Check that exported data can be imported under another key.

        A single import into key 2 must make its export equal to key 1's
        export. A second import of the same payload must double the summed
        bucket frequencies found in every band of key 2's export.
        """
        bands = 2
        retention = 12
        index = MinHashIndex(
            redis.clusters.get('default'), 0xFFFF, bands, 2, 60 * 60, retention
        )

        def verify(exported, expected_sum):
            # Exactly one payload is expected for the single requested pair.
            assert len(exported) == 1

            data = msgpack.unpackb(exported[0])
            assert len(data) == bands

            for band in data:
                assert len(band) == (retention + 1)
                # Each band entry is a pair; only its second element
                # (the bucket frequencies) contributes to the total.
                frequency_sum = 0
                for _, bucket_frequencies in band:
                    frequency_sum += sum(dict(bucket_frequencies).values())
                assert frequency_sum == expected_sum

        index.record('example', '1', [('index', 'hello world')])

        timestamp = int(time.time())
        first_export = index.export(
            'example', [('index', 1)], timestamp=timestamp
        )
        verify(first_export, 1)
        blob = first_export[0]

        # Copy the data from key 1 to key 2.
        index.import_('example', [('index', 2, blob)], timestamp=timestamp)

        from_key_1 = index.export(
            'example', [('index', 1)], timestamp=timestamp
        )
        from_key_2 = index.export(
            'example', [('index', 2)], timestamp=timestamp
        )
        assert from_key_1 == from_key_2

        # Copy the data again to key 2 (duplicating all of the data.)
        index.import_('example', [('index', 2, blob)], timestamp=timestamp)

        second_export = index.export(
            'example', [('index', 2)], timestamp=timestamp
        )
        verify(second_export, 2)
Пример #8
0
    def test_index(self):
        """Verify query ordering before and after deleting a recorded key.

        Texts are recorded under the 'index' label. Querying key '1' must
        return '1' and '2' (same text, score 1.0) first, '3' and '4' in
        either order next, and '5' last. Once key '3' is deleted, the query
        must return the keys '1', '2', '4', '5' in that order.
        """
        index = MinHashIndex(
            redis.clusters.get('default'), 0xFFFF, 8, 2, 60 * 60, 12
        )

        # (key, text) pairs in recording order; key '4' appears twice.
        samples = [
            ('1', 'hello world'),
            ('2', 'hello world'),
            ('3', 'jello world'),
            ('4', 'yellow world'),
            ('4', 'mellow world'),
            ('5', 'pizza world'),
        ]
        for sample_key, sample_text in samples:
            index.record('example', sample_key, [('index', sample_text)])

        results = index.query('example', '1', ['index'])[0]
        assert results[0] == ('1', 1.0)
        assert results[1] == ('2', 1.0)  # identical contents

        # equidistant pairs, order doesn't really matter
        either_order = ('3', '4')
        assert results[2][0] in either_order
        assert results[3][0] in either_order
        assert results[4][0] == '5'

        index.delete('example', [('index', '3')])
        surviving = index.query('example', '1', ['index'])[0]
        assert [key for key, _ in surviving] == ['1', '2', '4', '5']