Пример #1
0
    def test_word_shingler(self):
        """Check word-level shingles (span=5, skip=1), unique and non-unique."""
        sentence = "the quick brown fox jumps over a lazy dog"

        unique_shingler = Shingler(span=5, skip=1, unique=True,
                                   tokenizer=RegexTokenizer())
        unique_result = unique_shingler.get_shingles(sentence)
        self.assertIn(("jumps", "a", "dog"), unique_result)

        # With unique=False the result is indexable, so check the first item.
        plain_shingler = Shingler(span=5, skip=1, unique=False,
                                  tokenizer=RegexTokenizer())
        plain_result = plain_shingler.get_shingles(sentence)
        self.assertEqual(("the", "brown", "jumps"), plain_result[0])
Пример #2
0
    def test_shingler(self):
        """Check default-tokenizer shingles (span=5, skip=1) of a short word."""
        word = "abracadabra"

        unique_shingler = Shingler(span=5, skip=1, unique=True)
        unique_result = unique_shingler.get_shingles(word)
        self.assertIn(("d", "b", "a"), unique_result)

        # With unique=False the result is indexable, so check the first item.
        plain_shingler = Shingler(span=5, skip=1, unique=False)
        plain_result = plain_shingler.get_shingles(word)
        self.assertEqual(("a", "r", "c"), plain_result[0])
Пример #3
0
 def test_bills(self):
     """Cluster the bills fixture; 97 clusters are expected.

     Each fixture line is split on '|' into a (label, text) pair.
     """
     bills_path = get_resource_name('data/bills100.txt')
     with open(bills_path, 'r') as fhandle:
         records = [raw.rstrip().split('|') for raw in fhandle]

     cluster = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(span=3, tokenizer=RegexTokenizer())
     for label, text in records:
         cluster.add_item(shingler.get_shingles(text), label)

     self.assertEqual(len(cluster.get_clusters()), 97)
Пример #4
0
 def test_names(self):
     """Should return 209 clusters of names.

     The distinct, right-stripped lines of the fixture serve as both the
     shingled text and the item label.
     """
     with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
         data = set(line.rstrip() for line in fhandle)
     cluster = Cluster(width=20, bandwidth=5, seed=SEED)
     shingler = Shingler(3)
     for name in data:
         shingles = shingler.get_shingles(name)
         cluster.add_item(shingles, name)
     clusters = cluster.get_clusters()
     # The expected count below is the value the suite actually checks;
     # the docstring was aligned with it.
     self.assertEqual(len(clusters), 209)
Пример #5
0
 def test_names_kmin_scheme(self):
     """Should return 176 clusters of names.

     Same fixture as the plain names test, but the Cluster is configured
     with kmin=2 and lsh_scheme="a1".
     """
     with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
         data = set(line.rstrip() for line in fhandle)
     cluster = Cluster(width=20, bandwidth=5, kmin=2, lsh_scheme="a1",
                       seed=SEED)
     shingler = Shingler(3)
     for name in data:
         shingles = shingler.get_shingles(name)
         cluster.add_item(shingles, name)
     clusters = cluster.get_clusters()
     # The expected count below is the value the suite actually checks;
     # the docstring was aligned with it.
     self.assertEqual(len(clusters), 176)
Пример #6
0
    def run_simulated_manually(filepath, lines_to_read=sys.maxsize,
                               cluster_args=None):
        """Cluster a simulated data file and return cluster statistics.

        :param filepath: resource-relative path, resolved through
            ``get_resource_name``.
        :param lines_to_read: maximum number of lines to consume from the
            file. Defaults to ``sys.maxsize`` (effectively the whole file);
            ``sys.maxint`` was removed in Python 3.
        :param cluster_args: optional dict of keyword arguments forwarded to
            ``Cluster``; ``None`` means no extra arguments.
        :returns: ``dict`` with a single key ``'stats'`` holding the result
            of ``describe_clusters``.
        """
        with open(get_resource_name(filepath), 'r') as fhandle:
            data = [line.rstrip().split(' ')
                    for line in islice(fhandle, lines_to_read)]
        if cluster_args is None:
            cluster_args = dict()
        cluster = Cluster(**cluster_args)
        shingler = Shingler(span=3)
        for pair in data:
            if len(pair) > 1:
                # NOTE(review): a line with more than one space yields more
                # than two fields and fails to unpack here -- confirm the
                # data format guarantees at most one space per line.
                label, text = pair
            else:
                # A line holding only a label is clustered with empty text.
                label, text = pair[0], ''
            shingles = shingler.get_shingles(text)
            cluster.add_item(shingles, label)
        clusters = cluster.get_clusters()

        def is_label_positive(lbl):
            # Labels containing ':' are treated as positives.
            return ':' in lbl

        return dict(stats=describe_clusters(clusters, is_label_positive))
Пример #7
0
def get_default_shingler(**opts):
    """Build a ``Shingler`` from ``opts`` with normalizer and tokenizer cleared.

    All keyword arguments are forwarded to ``Shingler``; afterwards the
    instance's ``_normalizer`` and ``_tokenizer`` attributes are set to None.
    """
    instance = Shingler(**opts)
    for attr_name in ('_normalizer', '_tokenizer'):
        setattr(instance, attr_name, None)
    return instance
Пример #8
0
    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 min_support=None,
                 seed=0,
                 normalizer=None,
                 tokenizer=None):
        """Read configuration and build the signing/clustering components.

        :param cfg: configuration mapping. Keys read here: ``'sig_width'``,
            ``'lsh_options'``, ``'kmin'``, ``'shingler'``, ``'sketch'``,
            ``'min_support'`` (only when ``min_support`` is None) and the
            optional ``'preprocessor'`` -> ``'normalizer'`` options.
            ``cfg['sketch']`` must have ``'enabled'``; when enabled it must
            also have ``'algorithm'``, ``'size'``, ``'resemblance'`` and a
            ``'shingler'`` mapping with its own ``'enabled'`` flag.
            ``'operator'`` is optional and defaults to ``'and'``.
            ``cfg`` is not modified.
        :param content_filter: stored as ``self.content_filter``.
        :param trace_every: stored as ``self.trace_every``.
        :param get_body: stored as ``self._get_body``.
        :param get_label: stored as ``self._get_label``.
        :param get_prefix: stored as ``self._get_prefix``.
        :param min_support: overrides ``cfg['min_support']`` when not None.
        :param seed: seed passed to the sketch signer.
        :param normalizer: overrides the default normalizer when not None.
        :param tokenizer: overrides the default tokenizer when not None.
        :raises RuntimeError: if sketching is enabled and the configured
            algorithm is not an attribute of ``SketchModel``, or the sketch
            shingler is enabled for an algorithm other than simhash/minhash.
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = \
            cfg['min_support'] if min_support is None else min_support

        # normalizer and tokenizer: defaults are only built when needed
        if normalizer is None:
            normalizer = get_default_normalizer(
                **cfg.get('preprocessor', {}).get('normalizer', {}))
        self.normalizer = normalizer
        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        # Always defined, so later reads do not depend on the config path.
        self.sketch_shingler = None
        self.sketch_signer = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'" %
                                   algorithm_name)
            self.sketch_bits = cfg_sketch['size']
            cfg_sketch_shingler = cfg_sketch['shingler']
            # If the sketch shingler is disabled, the sketch signer stays
            # disabled as well (both remain None) and the default signer is
            # used instead.
            if cfg_sketch_shingler['enabled']:
                if sketch_algorithm == SketchModel.simhash:
                    sketch_signer_cls = SimHashSignature
                elif sketch_algorithm == SketchModel.minhash:
                    sketch_signer_cls = MinHashSketchSignature
                else:
                    # Previously this case surfaced later as an obscure
                    # AttributeError on self.sketch_shingler.
                    raise RuntimeError(
                        "Unsupported sketch model specified: '%s'" %
                        algorithm_name)

                # Work on a copy: deleting 'enabled' from the caller's
                # config would break a second construction from it.
                shingler_opts = dict(cfg_sketch_shingler)
                del shingler_opts['enabled']
                self.sketch_shingler = Shingler(**shingler_opts)
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None
                self.sketch_signer = sketch_signer_cls(self.sketch_bits,
                                                       seed=seed)

            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved outside the "enabled" branch: the Cluster call below
        # always needs it, and it used to be undefined with sketching off.
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get(
            'operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)