示例#1
0
 def test_invalid_n(self):
     # Start from valid hashing properties, then force a negative n-gram
     # size; building or using the tokenizer is expected to raise ValueError.
     properties = FieldHashingProperties(ngram=2, k=20, positional=True)
     properties.ngram = -6
     failure_note = 'Expected raise ValueError on invalid n.'
     with self.assertRaises(ValueError, msg=failure_note):
         tokenize = get_tokenizer(properties)
         tokenize('prawn')
示例#2
0
def stream_bloom_filters(
        dataset,  # type: Iterable[Sequence[Text]]
        keys,  # type: Sequence[Sequence[bytes]]
        schema  # type: Schema
):
    # type: (...) -> Iterable[Tuple[bitarray, Text, int]]
    """ Lazily produce a composite Bloom filter (CLK) for each record
        of an iterable dataset.

        One tokenizer per schema field is built up front, before any
        record is consumed; the records themselves are only processed
        as the returned generator is iterated.

        :param dataset: An iterable of indexable records.
        :param keys: A tuple of two lists of secret keys used in the HMAC.
        :param schema: An instantiated Schema instance; its ``fields``
            supply the hashing properties for the tokenizers.
        :return: Generator yielding the result of ``crypto_bloom_filter``
            for each record (3-tuples, per the type comment above).
    """
    # Build the per-field tokenizers eagerly, in schema field order.
    field_tokenizers = []
    for field_spec in schema.fields:
        field_tokenizers.append(
            tokenizer.get_tokenizer(field_spec.hashing_properties))

    return (
        crypto_bloom_filter(record, field_tokenizers, schema, keys)
        for record in dataset
    )
示例#3
0
import unittest

from clkhash.field_formats import FieldHashingProperties
from clkhash.tokenizer import get_tokenizer

__author__ = 'shardy'

# Module-level tokenizers; each name encodes the n-gram size, k and
# (where present) the positional flag used to build it.

p1_20 = get_tokenizer(FieldHashingProperties(ngram=1, k=20))

p2_20 = get_tokenizer(FieldHashingProperties(ngram=2, k=20))

p1_20_true = get_tokenizer(
    FieldHashingProperties(ngram=1, positional=True, k=20))

# Tokenizer obtained when no hashing properties are supplied.
dummy = get_tokenizer(None)

class TestTokenizer(unittest.TestCase):

    def test_unigram_1(self):
        # '/' is passed as the character to ignore, so only the digits
        # are expected back as unigram tokens.
        tokens = list(p1_20("1/2/93", ignore='/'))
        self.assertEqual(tokens, ['1', '2', '9', '3'])

    def test_unigram_2(self):
        self.assertEqual(list(p1_20("1*2*93", ignore='*')),
示例#4
0
 def test_bigram_duplicate(self):
     # The bigram 'ab' occurs twice in "abab" and is expected twice in the
     # token list, i.e. duplicates are not collapsed.
     fhp = FieldHashingProperties(ngram=2, positional=False)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("abab"))
     self.assertEqual(tokens, [' a', 'ab', 'ba', 'ab', 'b '])
示例#5
0
 def test_unigram_1(self):
     # With '/' ignored, the non-positional unigrams of "1/2/93" are
     # expected to be just its digits, in order.
     fhp = FieldHashingProperties(ngram=1, positional=False)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("1/2/93", ignore='/'))
     self.assertEqual(tokens, ['1', '2', '9', '3'])
示例#6
0
 def test_bigram_2(self):
     # The expected tokens show 'e' being dropped from "steve" before the
     # bigrams are formed (hence 'tv').
     fhp = FieldHashingProperties(ngram=2, positional=False)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("steve", ignore='e'))
     self.assertEqual(tokens, [' s', 'st', 'tv', 'v '])
示例#7
0
 def test_bigram_1(self):
     # Non-positional bigrams of "steve"; the expected list includes a
     # space-padded token at each end of the word.
     fhp = FieldHashingProperties(ngram=2, positional=False)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("steve"))
     self.assertEqual(tokens, [' s', 'st', 'te', 'ev', 've', 'e '])
示例#8
0
 def test_positional_unigram_duplicate(self):
     # Positional unigrams are expected as "<index> <char>" with a 1-based
     # index, so the three identical characters yield distinct tokens.
     fhp = FieldHashingProperties(ngram=1, positional=True)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("111"))
     self.assertEqual(tokens, ['1 1', '2 1', '3 1'])
示例#9
0
 def test_positional_unigram_2(self):
     # No ignore character is given, so '*' is expected to be tokenized
     # like any other character, each token carrying its 1-based position.
     fhp = FieldHashingProperties(ngram=1, positional=True)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("1*2*"))
     self.assertEqual(tokens, ['1 1', '2 *', '3 2', '4 *'])
示例#10
0
 def test_unigram_duplicate(self):
     # Repeated characters are expected to appear once per occurrence in
     # the non-positional unigram list.
     fhp = FieldHashingProperties(ngram=1, positional=False)
     tokenize = get_tokenizer(fhp)
     tokens = list(tokenize("1212"))
     self.assertEqual(tokens, ['1', '2', '1', '2'])