示例#1
0
    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 seed=None,
                 min_support=None,
                 normalizer=None,
                 tokenizer=None):
        """Assemble the clustering components described by ``cfg``.

        Explicit keyword arguments take precedence over the matching
        configuration entries.

        :param cfg: configuration mapping. Reads ``random_state`` (only when
                    ``seed`` is None), ``signer`` (keys ``min_support`` --
                    only when the argument is None -- ``width``, ``lsh``,
                    ``universe_size`` and ``kmin``), ``shingler`` and the
                    optional ``preprocessor`` -> ``normalizer`` options
        :param content_filter: optional filter object, stored unchanged
        :param trace_every: stored unchanged on the instance
        :param get_body: optional accessor, stored as ``_get_body``
        :param get_label: optional accessor, stored as ``_get_label``
        :param get_prefix: optional accessor, stored as ``_get_prefix``
        :param seed: overrides ``cfg['random_state']`` when not None
        :param min_support: overrides ``cfg['signer']['min_support']`` when
                            not None
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        """
        self.cfg = cfg

        # Optional accessors supplied by the caller
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every
        self.content_filter = content_filter

        # Resolve options: an explicit argument wins over the configuration
        if seed is None:
            seed = cfg['random_state']
        self.random_state = seed

        signer_cfg = cfg['signer']
        if min_support is None:
            min_support = signer_cfg['min_support']
        self.min_support = min_support

        # Fall back to the default normalizer / tokenizer when none is given
        if normalizer is None:
            normalizer_opts = cfg.get('preprocessor', {}).get('normalizer', {})
            normalizer = get_default_normalizer(**normalizer_opts)
        self.normalizer = normalizer

        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # MinHash signer backed by an LSH hasher of the same width and seed
        width = signer_cfg['width']
        hasher = LSHC(width=width,
                      seed=self.random_state,
                      **signer_cfg['lsh'])
        self.signer = MinHashSignature(
            width,
            lsh_hasher=hasher,
            universe_size=signer_cfg['universe_size'],
            kmin=signer_cfg['kmin'],
            seed=self.random_state)

        # Shingler is configured straight from the ``shingler`` section
        self.shingler = get_default_shingler(**cfg['shingler'])

        # Sketch comparison is not used by this variant
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)
示例#2
0
    def __init__(self,
                 width=12,
                 bandwidth=3,
                 lsh_scheme="a0",
                 universe_size=None,
                 kmin=1,
                 seed=0):
        """Create a cluster builder driven by a MinHash signer.

        :param width: Passed to ``MinHashSignature`` and ``LSHC`` as the
                      signature width (documented by the original author as
                      the number of bands)
        :type width: int
        :param bandwidth: Number of rows per band; an LSH hasher is only
                          created when this is greater than 1
        :type bandwidth: int
        :param lsh_scheme: Adjusts number of combinatorial bands
        :type lsh_scheme: str
        :param universe_size: A prime number of size close to token universe
                              cardinality
        :type universe_size: long
        :param kmin: forwarded unchanged to ``MinHashSignature``
        :param seed: forwarded unchanged to ``MinHashSignature``
        """
        # With a bandwidth of 1 or less no LSH hasher is attached
        if bandwidth > 1:
            hasher = LSHC(bandwidth, width=width, scheme=lsh_scheme)
        else:
            hasher = None

        super(MinHashCluster, self).__init__(
            signer=MinHashSignature(width,
                                    lsh_hasher=hasher,
                                    universe_size=universe_size,
                                    kmin=kmin,
                                    seed=seed))
示例#3
0
    def test_signature_similarity(self):
        """Signature agreement should approximate Jaccard similarity.

        The similarity computed from two sets' signatures is compared with
        the true Jaccard similarity of the sets; the absolute difference,
        averaged over many random pairs, must stay within the expected
        error bound.
        """
        n_tests = 100
        expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
        mh = MinHashSignature(10 * 10)

        total_err = 0.0
        for _ in xrange(n_tests):
            # Draw two random sets, then sign them in the same order
            first, second = randset(), randset()
            first_sig = mh.get_signature(first)
            second_sig = mh.get_signature(second)

            # True similarity versus the signature-based estimate
            jsim = jaccard_sim(first, second)
            ssim = sigsim(first_sig, second_sig, dim=100)

            total_err += abs(jsim - ssim)

        # Averaged over n_tests pairs, the error must not exceed the bound
        avg_err = total_err / n_tests
        failure_msg = "Accuracy test failed. (avg error: %f)" % avg_err
        self.assertGreaterEqual(expected_error, avg_err, msg=failure_msg)
示例#4
0
    def __init__(self, cfg, content_filter=None, opts=None):
        """Build signer, shinglers and cluster builder from ``cfg``.

        :param cfg: configuration mapping. Reads ``min_support``,
                    ``sig_width``, ``lsh_options``, ``kmin``, ``shingler``
                    and ``sketch`` (keys ``enabled`` and, when enabled,
                    ``algorithm``, ``size``, ``shingler``, ``resemblance``).
                    The mapping is not modified.
        :param content_filter: optional filter object, stored unchanged
        :param opts: optional overrides deep-merged over the default
                     ``normalizer`` / ``tokenizer`` shingler arguments
        :raises RuntimeError: if ``cfg['sketch']['algorithm']`` does not name
                              a supported sketch model
        """
        self.cfg = cfg

        common_kwargs = dict(
            normalizer=HTMLNormalizer(),
            tokenizer=RegexTokenizer()
        )
        deepupdate(common_kwargs, opts or {})

        # Set options
        self.content_filter = content_filter
        min_support = cfg['min_support']

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler.  Work on a copy: updating cfg['shingler'] in
        # place would inject normalizer/tokenizer objects into the caller's
        # configuration.
        shingler_kwargs = dict(cfg['shingler'])
        shingler_kwargs.update(common_kwargs)
        self.shingler = Shingler(**shingler_kwargs)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        sketch_dist_fn = None
        xor_threshold = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            # 'size' is multiplied by 8 -- presumably bytes to bits
            sketch_bits = cfg_sketch['size'] * 8

            # Same copy-before-update rule as for the key shingler
            sketch_shingler_kwargs = dict(cfg_sketch['shingler'])
            sketch_shingler_kwargs.update(common_kwargs)
            self.sketch_shingler = Shingler(**sketch_shingler_kwargs)

            if sketch_algorithm == SketchModel.simhash:
                self.sketch_signer = SimHashSignature(bit_depth=sketch_bits)
            elif sketch_algorithm == SketchModel.minhash:
                self.sketch_signer = MinHashSketchSignature(sketch_bits)
            else:
                # getattr() can resolve attributes that are not sketch
                # models; fail here instead of leaving sketch_signer unset.
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)

            # Largest Hamming distance still meeting the resemblance level
            xor_threshold = \
                int(floor(sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            sketch_dist_fn = hamming

        self.cluster_builder = Cluster(sketch_dist_fn=sketch_dist_fn,
                                       max_dist=xor_threshold,
                                       min_support=min_support)
示例#5
0
    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, seed=None,
                 min_support=None, normalizer=None, tokenizer=None):
        """Assemble the clustering components described by ``cfg``.

        Explicit keyword arguments take precedence over the matching
        configuration entries.

        :param cfg: configuration mapping. Reads ``random_state`` (only when
                    ``seed`` is None), ``signer`` (keys ``min_support`` --
                    only when the argument is None -- ``width``, ``lsh``,
                    ``universe_size`` and ``kmin``), ``shingler`` and the
                    optional ``preprocessor`` -> ``normalizer`` options
        :param content_filter: optional filter object, stored unchanged
        :param trace_every: stored unchanged on the instance
        :param get_body: optional accessor, stored as ``_get_body``
        :param get_label: optional accessor, stored as ``_get_label``
        :param get_prefix: optional accessor, stored as ``_get_prefix``
        :param seed: overrides ``cfg['random_state']`` when not None
        :param min_support: overrides ``cfg['signer']['min_support']`` when
                            not None
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        """
        self.cfg = cfg

        # Optional accessors supplied by the caller
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every
        self.content_filter = content_filter

        # An explicit argument wins over the configured value
        if seed is None:
            seed = cfg['random_state']
        self.random_state = seed

        signer_cfg = cfg['signer']
        if min_support is None:
            min_support = signer_cfg['min_support']
        self.min_support = min_support

        # Default normalizer / tokenizer are only built when none is given
        if normalizer is None:
            normalizer_opts = cfg.get('preprocessor', {}).get('normalizer', {})
            normalizer = get_default_normalizer(**normalizer_opts)
        self.normalizer = normalizer

        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # MinHash signer backed by an LSH hasher of the same width and seed
        width = signer_cfg['width']
        hasher = LSHC(width=width,
                      seed=self.random_state,
                      **signer_cfg['lsh'])
        self.signer = MinHashSignature(
            width,
            lsh_hasher=hasher,
            universe_size=signer_cfg['universe_size'],
            kmin=signer_cfg['kmin'],
            seed=self.random_state)

        # Shingler is configured straight from the ``shingler`` section
        self.shingler = get_default_shingler(**cfg['shingler'])

        # Sketch comparison is not used by this variant
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)
示例#6
0
class HDClustering(object):
    """Cluster objects by the MinHash signature keys of their content.

    Items can be processed in one pass (``clusters_from_iter``) or through
    the ``mapper`` / ``reducer`` pair.
    """

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, seed=None,
                 min_support=None, normalizer=None, tokenizer=None):
        """Assemble the clustering components described by ``cfg``.

        Explicit keyword arguments take precedence over the matching
        configuration entries.

        :param cfg: configuration mapping. Reads ``random_state`` (only when
                    ``seed`` is None), ``signer`` (keys ``min_support`` --
                    only when the argument is None -- ``width``, ``lsh``,
                    ``universe_size`` and ``kmin``), ``shingler`` and the
                    optional ``preprocessor`` -> ``normalizer`` options
        :param content_filter: optional object whose ``accept`` method
                               decides which items are left out
        :param trace_every: log progress every this many mapped items;
                            values <= 0 disable the logging
        :param get_body: optional callable extracting the body of an object
        :param get_label: optional callable extracting the label
        :param get_prefix: optional callable extracting the shingle prefix
        :param seed: overrides ``cfg['random_state']`` when not None
        :param min_support: overrides ``cfg['signer']['min_support']`` when
                            not None
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        """
        self.cfg = cfg

        # Optional accessors supplied by the caller
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every
        self.content_filter = content_filter

        # An explicit argument wins over the configured value
        if seed is None:
            seed = cfg['random_state']
        self.random_state = seed

        signer_cfg = cfg['signer']
        if min_support is None:
            min_support = signer_cfg['min_support']
        self.min_support = min_support

        # Default normalizer / tokenizer are only built when none is given
        if normalizer is None:
            normalizer_opts = cfg.get('preprocessor', {}).get('normalizer', {})
            normalizer = get_default_normalizer(**normalizer_opts)
        self.normalizer = normalizer

        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # MinHash signer backed by an LSH hasher of the same width and seed
        width = signer_cfg['width']
        hasher = LSHC(width=width,
                      seed=self.random_state,
                      **signer_cfg['lsh'])
        self.signer = MinHashSignature(
            width,
            lsh_hasher=hasher,
            universe_size=signer_cfg['universe_size'],
            kmin=signer_cfg['kmin'],
            seed=self.random_state)

        # Shingler is configured straight from the ``shingler`` section
        self.shingler = get_default_shingler(**cfg['shingler'])

        # Sketch comparison is not used by this variant
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)

    def _map_iter(self, data):
        """Yield the ``(keys, (label, sketch))`` features of every object.

        The label defaults to the position of the object in ``data`` when
        no ``get_label`` accessor was configured.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix

        for index, item in enumerate(data):
            body = item if body_of is None else body_of(item)
            label = index if label_of is None else label_of(item)
            prefix = None if prefix_of is None else prefix_of(item)

            for feature in self._map_item(item, body, label, prefix):
                yield feature

    def _map_item(self, obj, body, label, prefix=None):
        """Yield at most one ``(keys, (label, None))`` pair for ``obj``.

        Nothing is yielded when the content filter accepts the object.
        Features are derived from ``obj['content']``; ``body`` is not used
        here.
        """
        source = MessageSource.source(obj)
        normalized, meta = self.normalizer.normalize(obj['content'])
        tokens = self.tokenizer.tokenize(normalized)

        content_filter = self.content_filter
        if content_filter is None:
            accepted = False
        else:
            # The filter also returns a score, which is not needed here
            accepted, _score = content_filter.accept(
                obj, content_tokens=tokens,
                urls=meta.get('url_components', []), src=source)

        if accepted:
            return

        shingles = self.shingler.get_shingles(tokens, prefix=prefix)
        yield (self.signer.get_signature(shingles), (label, None))

    def clusters_from_iter(self, data):
        """Feed every object of ``data`` to the cluster builder.

        :returns: whatever ``self.cluster_builder.get_clusters()`` returns
        """
        builder = self.cluster_builder
        every = self.trace_every

        for count, pair in enumerate(self._map_iter(data)):
            if every > 0 and count % every == 0:
                LOG.info("Processing line " + str(count))

            keys, val = pair
            # Values are normally (label, sketch); a bare label is accepted
            if isinstance(val, tuple):
                label, sketch = val
            else:
                label, sketch = val, None
            builder.add_item(keys, label=label, sketch=sketch)

        return builder.get_clusters()

    def mapper(self, obj):
        """Yield one ``(key, value)`` pair per signature key of ``obj``.

        Without a ``get_label`` accessor the object itself is the label.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix

        body = obj if body_of is None else body_of(obj)
        label = obj if label_of is None else label_of(obj)
        prefix = None if prefix_of is None else prefix_of(obj)

        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Collapse the values gathered under ``key`` into unique entries.

        :param key: key the values were grouped under
        :param tuple_gen: iterable of hashable values
        :returns: ``(key, values)`` where ``values`` is a list without
                  duplicates, in no particular order
        """
        unique_values = set(tuple_gen)
        return key, list(unique_values)
示例#7
0
class HDClustering(object):
    """Cluster objects by the MinHash signature keys of their content.

    Items can be processed in one pass (``clusters_from_iter``) or through
    the ``mapper`` / ``reducer`` pair.
    """

    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 seed=None,
                 min_support=None,
                 normalizer=None,
                 tokenizer=None):
        """Assemble the clustering components described by ``cfg``.

        Explicit keyword arguments take precedence over the matching
        configuration entries.

        :param cfg: configuration mapping. Reads ``random_state`` (only when
                    ``seed`` is None), ``signer`` (keys ``min_support`` --
                    only when the argument is None -- ``width``, ``lsh``,
                    ``universe_size`` and ``kmin``), ``shingler`` and the
                    optional ``preprocessor`` -> ``normalizer`` options
        :param content_filter: optional object whose ``accept`` method
                               decides which items are left out
        :param trace_every: log progress every this many mapped items;
                            values <= 0 disable the logging
        :param get_body: optional callable extracting the body of an object
        :param get_label: optional callable extracting the label
        :param get_prefix: optional callable extracting the shingle prefix
        :param seed: overrides ``cfg['random_state']`` when not None
        :param min_support: overrides ``cfg['signer']['min_support']`` when
                            not None
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        """
        self.cfg = cfg

        # Optional accessors supplied by the caller
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every
        self.content_filter = content_filter

        # An explicit argument wins over the configured value
        if seed is None:
            seed = cfg['random_state']
        self.random_state = seed

        signer_cfg = cfg['signer']
        if min_support is None:
            min_support = signer_cfg['min_support']
        self.min_support = min_support

        # Default normalizer / tokenizer are only built when none is given
        if normalizer is None:
            normalizer_opts = cfg.get('preprocessor', {}).get('normalizer', {})
            normalizer = get_default_normalizer(**normalizer_opts)
        self.normalizer = normalizer

        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # MinHash signer backed by an LSH hasher of the same width and seed
        width = signer_cfg['width']
        hasher = LSHC(width=width, seed=self.random_state,
                      **signer_cfg['lsh'])
        self.signer = MinHashSignature(
            width, lsh_hasher=hasher,
            universe_size=signer_cfg['universe_size'],
            kmin=signer_cfg['kmin'], seed=self.random_state)

        # Shingler is configured straight from the ``shingler`` section
        self.shingler = get_default_shingler(**cfg['shingler'])

        # Sketch comparison is not used by this variant
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)

    def _map_iter(self, data):
        """Yield the ``(keys, (label, sketch))`` features of every object.

        The label defaults to the position of the object in ``data`` when
        no ``get_label`` accessor was configured.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix

        for index, item in enumerate(data):
            body = item if body_of is None else body_of(item)
            label = index if label_of is None else label_of(item)
            prefix = None if prefix_of is None else prefix_of(item)

            for feature in self._map_item(item, body, label, prefix):
                yield feature

    def _map_item(self, obj, body, label, prefix=None):
        """Yield at most one ``(keys, (label, None))`` pair for ``obj``.

        Nothing is yielded when the content filter accepts the object.
        Features are derived from ``obj['content']``; ``body`` is not used
        here.
        """
        source = MessageSource.source(obj)
        normalized, meta = self.normalizer.normalize(obj['content'])
        tokens = self.tokenizer.tokenize(normalized)

        content_filter = self.content_filter
        if content_filter is None:
            accepted = False
        else:
            # The filter also returns a score, which is not needed here
            accepted, _score = content_filter.accept(
                obj, content_tokens=tokens,
                urls=meta.get('url_components', []), src=source)

        if accepted:
            return

        shingles = self.shingler.get_shingles(tokens, prefix=prefix)
        yield (self.signer.get_signature(shingles), (label, None))

    def clusters_from_iter(self, data):
        """Feed every object of ``data`` to the cluster builder.

        :returns: whatever ``self.cluster_builder.get_clusters()`` returns
        """
        builder = self.cluster_builder
        every = self.trace_every

        for count, pair in enumerate(self._map_iter(data)):
            if every > 0 and count % every == 0:
                LOG.info("Processing line " + str(count))

            keys, val = pair
            # Values are normally (label, sketch); a bare label is accepted
            if isinstance(val, tuple):
                label, sketch = val
            else:
                label, sketch = val, None
            builder.add_item(keys, label=label, sketch=sketch)

        return builder.get_clusters()

    def mapper(self, obj):
        """Yield one ``(key, value)`` pair per signature key of ``obj``.

        Without a ``get_label`` accessor the object itself is the label.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix

        body = obj if body_of is None else body_of(obj)
        label = obj if label_of is None else label_of(obj)
        prefix = None if prefix_of is None else prefix_of(obj)

        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Collapse the values gathered under ``key`` into unique entries.

        :param key: key the values were grouped under
        :param tuple_gen: iterable of hashable values
        :returns: ``(key, values)`` where ``values`` is a list without
                  duplicates, in no particular order
        """
        unique_values = set(tuple_gen)
        return key, list(unique_values)
示例#8
0
class HDClustering(object):
    """Cluster objects by MinHash signature keys, optionally with sketches.

    When sketching is enabled in the configuration, a second signature
    (the sketch) is computed per object and handed to the cluster builder
    together with a Hamming-distance threshold.
    """

    def __init__(self, cfg, content_filter=None, opts=None):
        """Build signer, shinglers and cluster builder from ``cfg``.

        :param cfg: configuration mapping. Reads ``min_support``,
                    ``sig_width``, ``lsh_options``, ``kmin``, ``shingler``
                    and ``sketch`` (keys ``enabled`` and, when enabled,
                    ``algorithm``, ``size``, ``shingler``, ``resemblance``).
                    The mapping is not modified.
        :param content_filter: optional object with an ``accept(obj)``
                               method; accepted objects are not clustered
        :param opts: optional overrides deep-merged over the default
                     ``normalizer`` / ``tokenizer`` shingler arguments
        :raises RuntimeError: if ``cfg['sketch']['algorithm']`` does not name
                              a supported sketch model
        """
        self.cfg = cfg

        common_kwargs = dict(
            normalizer=HTMLNormalizer(),
            tokenizer=RegexTokenizer()
        )
        deepupdate(common_kwargs, opts or {})

        # Set options
        self.content_filter = content_filter
        min_support = cfg['min_support']

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler.  Work on a copy: updating cfg['shingler'] in
        # place would inject normalizer/tokenizer objects into the caller's
        # configuration.
        shingler_kwargs = dict(cfg['shingler'])
        shingler_kwargs.update(common_kwargs)
        self.shingler = Shingler(**shingler_kwargs)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        sketch_dist_fn = None
        xor_threshold = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            # 'size' is multiplied by 8 -- presumably bytes to bits
            sketch_bits = cfg_sketch['size'] * 8

            # Same copy-before-update rule as for the key shingler
            sketch_shingler_kwargs = dict(cfg_sketch['shingler'])
            sketch_shingler_kwargs.update(common_kwargs)
            self.sketch_shingler = Shingler(**sketch_shingler_kwargs)

            if sketch_algorithm == SketchModel.simhash:
                self.sketch_signer = SimHashSignature(bit_depth=sketch_bits)
            elif sketch_algorithm == SketchModel.minhash:
                self.sketch_signer = MinHashSketchSignature(sketch_bits)
            else:
                # getattr() can resolve attributes that are not sketch
                # models; fail here instead of leaving sketch_signer unset.
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)

            # Largest Hamming distance still meeting the resemblance level
            xor_threshold = \
                int(floor(sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            sketch_dist_fn = hamming

        self.cluster_builder = Cluster(sketch_dist_fn=sketch_dist_fn,
                                       max_dist=xor_threshold,
                                       min_support=min_support)

    def clusters_from_iter(self, data, get_body=None, get_label=None,
                           get_prefix=None):
        """Find clusters in an iterable.

        :param data: iterable of objects to cluster
        :param get_body: optional callable returning the text of an object;
                         the object itself is used when omitted
        :param get_label: optional callable returning the label of an
                          object; its position in ``data`` is used when
                          omitted
        :param get_prefix: optional callable returning a shingle prefix
        :returns: whatever ``self.cluster_builder.get_clusters()`` returns
        """
        cluster_builder = self.cluster_builder
        content_filter = self.content_filter

        for i, obj in enumerate(data):
            if not i % 1000:
                # Parenthesised single argument: prints the same text on
                # Python 2 and Python 3.
                print("Processing line " + str(i))
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)

            # Objects accepted by the content filter are not clustered.
            # Previously they fell through to add_set() with keys/sketch
            # either unbound or left over from the preceding object.
            if content_filter is not None and content_filter.accept(obj):
                continue

            # Step 1: Extract features
            features = self.shingler.get_shingles(body, prefix=prefix)
            keys = self.signer.get_signature(features)
            if self.sketch_enabled:
                sketch_features = self.sketch_shingler.get_shingles(body)
                sketch = self.sketch_signer.get_signature(sketch_features)
            else:
                sketch = None

            # Step 2: Cluster given keys, sketch
            cluster_builder.add_set(keys, label=label, sketch=sketch)

        return cluster_builder.get_clusters()
示例#9
0
    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 min_support=None,
                 seed=0,
                 normalizer=None,
                 tokenizer=None):
        """Build signer, shinglers and cluster builder from ``cfg``.

        :param cfg: configuration mapping. Reads ``min_support`` (only when
                    the argument is None), ``sig_width``, ``lsh_options``,
                    ``kmin``, ``shingler``, the optional ``preprocessor`` ->
                    ``normalizer`` options and ``sketch`` (keys ``enabled``,
                    optional ``operator`` and, when enabled, ``algorithm``,
                    ``size``, ``shingler``, ``resemblance``). The mapping is
                    not modified.
        :param content_filter: optional filter object, stored unchanged
        :param trace_every: stored unchanged on the instance
        :param get_body: optional accessor, stored as ``_get_body``
        :param get_label: optional accessor, stored as ``_get_label``
        :param get_prefix: optional accessor, stored as ``_get_prefix``
        :param min_support: overrides ``cfg['min_support']`` when not None
        :param seed: seed handed to the sketch signer
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        :raises RuntimeError: if sketching is enabled and
                              ``cfg['sketch']['algorithm']`` does not name a
                              supported sketch model
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = cfg[
            'min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = get_default_tokenizer(
        ) if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        # Defined up front so the attributes exist whatever the configuration
        self.sketch_shingler = None
        self.sketch_signer = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'" %
                                   algorithm_name)
            self.sketch_bits = cfg_sketch['size']

            # Work on a copy: deleting 'enabled' from the caller's mapping
            # made a second construction from the same cfg fail with KeyError.
            shingler_kwargs = dict(cfg_sketch['shingler'])
            shingler_enabled = shingler_kwargs.pop('enabled')

            # If the sketch shingler is disabled, shingler and signer both
            # stay None and the default signer is used instead.
            if shingler_enabled:
                if sketch_algorithm == SketchModel.simhash:
                    sketch_signer_cls = SimHashSignature
                elif sketch_algorithm == SketchModel.minhash:
                    sketch_signer_cls = MinHashSketchSignature
                else:
                    # getattr() can resolve attributes that are not sketch
                    # models; report them like any other unknown model.
                    raise RuntimeError(
                        "Unknown sketch model specified: '%s'" %
                        algorithm_name)
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = sketch_signer_cls(self.sketch_bits,
                                                       seed=seed)

                # Only touch the shingler when one exists; doing this
                # unconditionally raised AttributeError on None.
                # Presumably cleared because input is already tokenized.
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            # Largest Hamming distance still meeting the resemblance level
            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved even when sketching is disabled: the attribute is passed
        # to Cluster below and used to be undefined in that case.
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get(
            'operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)
示例#10
0
class HDClustering(object):
    """Cluster objects by MinHash signature keys, optionally with sketches.

    Items can be processed in one pass (``clusters_from_iter``) or through
    the ``mapper`` / ``reducer`` pair.
    """

    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 min_support=None,
                 seed=0,
                 normalizer=None,
                 tokenizer=None):
        """Build signer, shinglers and cluster builder from ``cfg``.

        :param cfg: configuration mapping. Reads ``min_support`` (only when
                    the argument is None), ``sig_width``, ``lsh_options``,
                    ``kmin``, ``shingler``, the optional ``preprocessor`` ->
                    ``normalizer`` options and ``sketch`` (keys ``enabled``,
                    optional ``operator`` and, when enabled, ``algorithm``,
                    ``size``, ``shingler``, ``resemblance``). The mapping is
                    not modified.
        :param content_filter: optional object whose ``accept`` method
                               decides which items are left out
        :param trace_every: log progress every this many mapped items;
                            values <= 0 disable the logging
        :param get_body: optional callable extracting the body of an object
        :param get_label: optional callable extracting the label
        :param get_prefix: optional callable extracting the shingle prefix
        :param min_support: overrides ``cfg['min_support']`` when not None
        :param seed: seed handed to the sketch signer
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        :raises RuntimeError: if sketching is enabled and
                              ``cfg['sketch']['algorithm']`` does not name a
                              supported sketch model
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = cfg[
            'min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = get_default_tokenizer(
        ) if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        # Defined up front so the attributes exist whatever the configuration
        self.sketch_shingler = None
        self.sketch_signer = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'" %
                                   algorithm_name)
            self.sketch_bits = cfg_sketch['size']

            # Work on a copy: deleting 'enabled' from the caller's mapping
            # made a second construction from the same cfg fail with KeyError.
            shingler_kwargs = dict(cfg_sketch['shingler'])
            shingler_enabled = shingler_kwargs.pop('enabled')

            # If the sketch shingler is disabled, shingler and signer both
            # stay None and _map_item() asks the default signer for a sketch.
            if shingler_enabled:
                if sketch_algorithm == SketchModel.simhash:
                    sketch_signer_cls = SimHashSignature
                elif sketch_algorithm == SketchModel.minhash:
                    sketch_signer_cls = MinHashSketchSignature
                else:
                    # getattr() can resolve attributes that are not sketch
                    # models; report them like any other unknown model.
                    raise RuntimeError(
                        "Unknown sketch model specified: '%s'" %
                        algorithm_name)
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = sketch_signer_cls(self.sketch_bits,
                                                       seed=seed)

                # Only touch the shingler when one exists; doing this
                # unconditionally raised AttributeError on None.
                # _map_item() feeds it tokens that are already normalized
                # and tokenized.
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            # Largest Hamming distance still meeting the resemblance level
            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved even when sketching is disabled: the attribute is passed
        # to Cluster below and used to be undefined in that case.
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get(
            'operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)

    def _map_iter(self, data):
        """Yield the ``(keys, (label, sketch))`` features of every object.

        The label defaults to the position of the object in ``data`` when
        no ``get_label`` accessor was configured.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix

        for i, obj in enumerate(data):
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)

            for feat in self._map_item(obj, body, label, prefix):
                yield feat

    def _map_item(self, obj, body, label, prefix=None):
        """Yield at most one ``(keys, (label, sketch))`` pair for ``obj``.

        Nothing is yielded when the content filter accepts the object.
        Features are derived from ``obj['content']``; ``body`` is not used
        here. ``sketch`` is None when sketching is disabled.
        """
        # Extract features
        src = MessageSource.source(obj)
        normalized_content, meta = self.normalizer.normalize(obj['content'])
        content_tokens = self.tokenizer.tokenize(normalized_content)

        if self.content_filter is not None:
            # The filter also returns a score, which is not needed here
            rule_accept, _ = self.content_filter.accept(
                obj,
                content_tokens=content_tokens,
                urls=meta.get('url_components', []),
                src=src)
        else:
            rule_accept = False

        if rule_accept:
            return

        features = self.shingler.get_shingles(content_tokens, prefix=prefix)
        if not self.sketch_enabled:
            keys = self.signer.get_signature(features)
            sketch = None
        elif self.sketch_shingler is None or self.sketch_signer is None:
            # No dedicated sketch pipeline: the key signer supplies both
            keys, sketch = self.signer.get_signature(features,
                                                     with_sketch=True)
        else:
            keys = self.signer.get_signature(features)
            sketch_features = self.sketch_shingler.get_shingles(
                content_tokens)
            sketch = self.sketch_signer.get_signature(sketch_features)
        yield (keys, (label, sketch))

    def clusters_from_iter(self, data):
        """Feed every object of ``data`` to the cluster builder.

        :returns: whatever ``self.cluster_builder.get_clusters()`` returns
        """
        cluster_builder = self.cluster_builder
        trace_every = self.trace_every
        for i, obj in enumerate(self._map_iter(data)):
            if trace_every > 0 and (not i % trace_every):
                LOG.info("Processing line " + str(i))

            keys, val = obj
            # Values are normally (label, sketch); a bare label is accepted
            label, sketch = val \
                if isinstance(val, tuple) \
                else (val, None)
            cluster_builder.add_item(keys, label=label, sketch=sketch)

        return cluster_builder.get_clusters()

    def mapper(self, obj):
        """Yield one ``(key, value)`` pair per signature key of ``obj``.

        Without a ``get_label`` accessor the object itself is the label.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix

        body = obj if get_body is None else get_body(obj)
        label = obj if get_label is None else get_label(obj)
        prefix = None if get_prefix is None else get_prefix(obj)

        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Perform a reducer task in MR.

        If sketches enabled, data consists of:
            (key, [(lbl, sk), (lbl, sk), (lbl, sk)])
        Otherwise:
            (key, [lbl, lbl, lbl])

        :returns: ``(key, values)``. Without a sketch distance function,
                  ``values`` is the de-duplicated input; otherwise it is a
                  list of ``(label, sketch)`` pairs with one entry per
                  label (the last sketch seen for a label wins).
        """
        # If not using sketches, we are done
        if self.sketch_dist_fn is None:
            return key, list(set(tuple_gen))

        # Create a dict mapping a label to a sketch; list() keeps the
        # return type a list on Python 3 as well.
        return key, list(dict(tuple_gen).items())
示例#11
0
    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, min_support=None,
                 seed=0, normalizer=None, tokenizer=None):
        """Build signer, shinglers and cluster builder from ``cfg``.

        :param cfg: configuration mapping. Reads ``min_support`` (only when
                    the argument is None), ``sig_width``, ``lsh_options``,
                    ``kmin``, ``shingler``, the optional ``preprocessor`` ->
                    ``normalizer`` options and ``sketch`` (keys ``enabled``,
                    optional ``operator`` and, when enabled, ``algorithm``,
                    ``size``, ``shingler``, ``resemblance``). The mapping is
                    not modified.
        :param content_filter: optional filter object, stored unchanged
        :param trace_every: stored unchanged on the instance
        :param get_body: optional accessor, stored as ``_get_body``
        :param get_label: optional accessor, stored as ``_get_label``
        :param get_prefix: optional accessor, stored as ``_get_prefix``
        :param min_support: overrides ``cfg['min_support']`` when not None
        :param seed: seed handed to the sketch signer
        :param normalizer: normalizer to use instead of the default one
        :param tokenizer: tokenizer to use instead of the default one
        :raises RuntimeError: if sketching is enabled and
                              ``cfg['sketch']['algorithm']`` does not name a
                              supported sketch model
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = cfg['min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = get_default_tokenizer() if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        # Defined up front so the attributes exist whatever the configuration
        self.sketch_shingler = None
        self.sketch_signer = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            self.sketch_bits = cfg_sketch['size']

            # Work on a copy: deleting 'enabled' from the caller's mapping
            # made a second construction from the same cfg fail with KeyError.
            shingler_kwargs = dict(cfg_sketch['shingler'])
            shingler_enabled = shingler_kwargs.pop('enabled')

            # If the sketch shingler is disabled, shingler and signer both
            # stay None and the default signer is used instead.
            if shingler_enabled:
                if sketch_algorithm == SketchModel.simhash:
                    sketch_signer_cls = SimHashSignature
                elif sketch_algorithm == SketchModel.minhash:
                    sketch_signer_cls = MinHashSketchSignature
                else:
                    # getattr() can resolve attributes that are not sketch
                    # models; report them like any other unknown model.
                    raise RuntimeError("Unknown sketch model specified: '%s'"
                                       % algorithm_name)
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = sketch_signer_cls(self.sketch_bits, seed=seed)

                # Only touch the shingler when one exists; doing this
                # unconditionally raised AttributeError on None.
                # Presumably cleared because input is already tokenized.
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            # Largest Hamming distance still meeting the resemblance level
            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved even when sketching is disabled: the attribute is passed
        # to Cluster below and used to be undefined in that case.
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)
示例#12
0
class HDClustering(object):
    """Cluster items by LSH-banded MinHash signatures, optionally with sketches.

    Each item is normalized, tokenized, shingled and signed.  The resulting
    signature keys (and, when enabled, a per-item sketch) are fed either to
    an in-process ``Cluster`` builder (``clusters_from_iter``) or emitted as
    key/value pairs for a map-reduce job (``mapper`` / ``reducer``).
    """

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, min_support=None,
                 seed=0, normalizer=None, tokenizer=None):
        """Read configuration and build the processing pipeline.

        :param cfg: mapping providing ``'min_support'``, ``'sig_width'``,
            ``'lsh_options'``, ``'kmin'``, ``'shingler'`` and ``'sketch'``;
            ``'preprocessor'`` is optional.  The mapping is not modified.
        :param content_filter: optional object whose ``accept`` method decides
            whether an item is excluded from clustering.
        :param trace_every: log progress every this many items (0 disables).
        :param get_body: optional callable extracting the body from an item.
        :param get_label: optional callable extracting the label from an item.
        :param get_prefix: optional callable extracting a shingle prefix.
        :param min_support: overrides ``cfg['min_support']`` when not None.
        :param seed: seed handed to the sketch signer.
        :param normalizer: overrides the default normalizer when not None.
        :param tokenizer: overrides the default tokenizer when not None.
        :raises RuntimeError: if sketches are enabled with an unknown or
            unsupported sketch algorithm.
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = cfg['min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        if normalizer is None:
            normalizer = get_default_normalizer(
                **cfg.get('preprocessor', {}).get('normalizer', {}))
        self.normalizer = normalizer
        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm.  Every sketch attribute gets
        # a default first so that the object is fully initialised even when
        # sketches are switched off.
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        self.sketch_operator = None
        self.sketch_shingler = None
        self.sketch_signer = None

        cluster_options = {}
        if self.sketch_enabled:
            self._configure_sketch(cfg_sketch, seed)
            # Only forward the operator when one was configured; otherwise
            # let Cluster fall back to its own default.
            cluster_options['sketch_operator'] = self.sketch_operator
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       **cluster_options)

    def _configure_sketch(self, cfg_sketch, seed):
        """Set the sketch shingler/signer, distance bound and operator."""
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        self.sketch_bits = cfg_sketch['size']

        cfg_sketch_shingler = cfg_sketch['shingler']
        # If the sketch shingler is disabled, the dedicated sketch signer
        # stays disabled too and the default signer produces the sketch.
        if cfg_sketch_shingler['enabled']:
            if sketch_algorithm == SketchModel.simhash:
                signer_cls = SimHashSignature
            elif sketch_algorithm == SketchModel.minhash:
                signer_cls = MinHashSketchSignature
            else:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            # Work on a filtered copy so the caller's configuration keeps its
            # 'enabled' flag and can be reused for another instance.
            shingler_options = {name: value
                                for name, value in cfg_sketch_shingler.items()
                                if name != 'enabled'}
            self.sketch_shingler = Shingler(**shingler_options)
            self.sketch_signer = signer_cls(self.sketch_bits, seed=seed)
            # _map_item hands this shingler already-tokenized content, so its
            # own preprocessing hooks are cleared (presumably to avoid a
            # second normalize/tokenize pass -- confirm against Shingler).
            self.sketch_shingler._tokenizer = None
            self.sketch_shingler._normalizer = None

        # Largest distance at which two sketches still count as resembling.
        self.max_dist = \
            int(floor(self.sketch_bits *
                      (1.0 - float(cfg_sketch['resemblance']))))
        self.sketch_dist_fn = hamming
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]

    def _map_iter(self, data):
        """Yield ``(keys, (label, sketch))`` for every clusterable item.

        When no ``get_label`` callable was supplied, the item's position in
        ``data`` is used as its label.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix

        for i, obj in enumerate(data):
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)

            for feat in self._map_item(obj, body, label, prefix):
                yield feat

    def _map_item(self, obj, body, label, prefix=None):
        """Yield at most one ``(keys, (label, sketch))`` pair for ``obj``.

        Nothing is yielded when the content filter accepts the item.  The
        text is read from ``obj['content']``; ``body`` is accepted for
        interface compatibility but not used here.
        """
        # Extract features
        src = MessageSource.source(obj)
        normalized_content, meta = self.normalizer.normalize(obj['content'])
        content_tokens = self.tokenizer.tokenize(normalized_content)

        if self.content_filter is not None:
            rule_accept, _ = self.content_filter.accept(
                obj, content_tokens=content_tokens,
                urls=meta.get('url_components', []), src=src)
            if rule_accept:
                # Items matched by the filter take no part in clustering.
                return

        features = self.shingler.get_shingles(content_tokens, prefix=prefix)
        if not self.sketch_enabled:
            keys = self.signer.get_signature(features)
            sketch = None
        elif self.sketch_shingler is None or self.sketch_signer is None:
            # No dedicated sketch pipeline: the main signer yields both.
            keys, sketch = self.signer.get_signature(features, with_sketch=True)
        else:
            keys = self.signer.get_signature(features)
            sketch_features = self.sketch_shingler.get_shingles(content_tokens)
            sketch = self.sketch_signer.get_signature(sketch_features)
        yield (keys, (label, sketch))

    def clusters_from_iter(self, data):
        """Find clusters in an iterable and return the builder's clusters."""
        cluster_builder = self.cluster_builder
        trace_every = self.trace_every
        for i, obj in enumerate(self._map_iter(data)):
            if trace_every > 0 and (not i % trace_every):
                LOG.info("Processing line %d", i)

            keys, val = obj
            label, sketch = val \
                if isinstance(val, tuple) \
                else (val, None)
            cluster_builder.add_item(keys, label=label, sketch=sketch)

        return cluster_builder.get_clusters()

    def mapper(self, obj):
        """Perform a mapper task in MR

        Yields one ``(key, (label, sketch))`` pair per signature key.  When
        no ``get_label`` callable was supplied, the object itself serves as
        its label.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix

        body = obj if get_body is None else get_body(obj)
        label = obj if get_label is None else get_label(obj)
        prefix = None if get_prefix is None else get_prefix(obj)

        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Perform a reducer task in MR

        If sketches enabled, data consists of:
            (key, [(lbl, sk), (lbl, sk), (lbl, sk)])
        Otherwise:
            (key, [lbl, lbl, lbl])

        Returns ``(key, values)`` where ``values`` is a list of distinct
        labels, or a list of ``(label, sketch)`` pairs with one pair per
        label (the last sketch seen for a label wins).
        """

        # If not using sketches, we are done
        if self.sketch_dist_fn is None:
            return key, list(set(tuple_gen))

        # create a dict mapping a label to a sketch; materialise the pairs so
        # the result is a real list on every Python version
        return key, list(dict(tuple_gen).items())
示例#13
0
 def test_consistent_signature(self):
     """Signing the same set twice must give the same signature"""
     signer = MinHashSignature(10 * 10)
     sample = randset()
     first = signer.get_signature(sample)
     second = signer.get_signature(sample)
     self.assertEqual(first, second)
示例#14
0
 def test_signature_length(self):
     """A signer built with width 10 * 10 must emit 100 values"""
     signer = MinHashSignature(10 * 10)
     signature = signer.get_signature(randset())
     self.assertEqual(100, len(signature))
示例#15
0
 def test_consistent_signature(self):
     """Repeated signing of one input set must be deterministic"""
     hasher = MinHashSignature(10 * 10)
     items = randset()
     initial_signature = hasher.get_signature(items)
     repeated_signature = hasher.get_signature(items)
     self.assertEqual(initial_signature, repeated_signature)
示例#16
0
 def test_signature_length(self):
     """The signature length must equal the configured 10 * 10 width"""
     hasher = MinHashSignature(10 * 10)
     produced = hasher.get_signature(randset())
     observed_length = len(produced)
     self.assertEqual(100, observed_length)