def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, seed=None, min_support=None,
             normalizer=None, tokenizer=None):
    """Build the clustering pipeline from a configuration mapping.

    Explicitly passed ``seed``, ``min_support``, ``normalizer`` and
    ``tokenizer`` take precedence over what ``cfg`` would supply.

    :param cfg: configuration mapping; the keys read here are
        'random_state', 'signer', 'shingler' and, optionally,
        'preprocessor'
    :param content_filter: stored unchanged on the instance
    :param trace_every: stored unchanged on the instance
    :param get_body: optional accessor callback, stored unchanged
    :param get_label: optional accessor callback, stored unchanged
    :param get_prefix: optional accessor callback, stored unchanged
    :param seed: overrides ``cfg['random_state']`` when not None
    :param min_support: overrides ``cfg['signer']['min_support']``
        when not None
    :param normalizer: replaces the default normalizer when not None
    :param tokenizer: replaces the default tokenizer when not None
    """
    self.cfg = cfg

    # Accessor callbacks and tracing interval are kept verbatim
    self._get_body = get_body
    self._get_label = get_label
    self._get_prefix = get_prefix
    self.trace_every = trace_every

    self.content_filter = content_filter

    # The config is only consulted when no explicit value was given
    if seed is None:
        seed = cfg['random_state']
    self.random_state = seed

    signer_cfg = cfg['signer']
    if min_support is None:
        min_support = signer_cfg['min_support']
    self.min_support = min_support

    # Text preprocessing: fall back to the package defaults
    if normalizer is None:
        normalizer_kwargs = cfg.get('preprocessor', {}).get('normalizer', {})
        normalizer = get_default_normalizer(**normalizer_kwargs)
    self.normalizer = normalizer

    if tokenizer is None:
        tokenizer = get_default_tokenizer()
    self.tokenizer = tokenizer

    # MinHash signer; the LSH hasher shares its width and seed
    width = signer_cfg['width']
    hasher = LSHC(width=width, seed=self.random_state, **signer_cfg['lsh'])
    self.signer = MinHashSignature(
        width,
        lsh_hasher=hasher,
        universe_size=signer_cfg['universe_size'],
        kmin=signer_cfg['kmin'],
        seed=self.random_state)

    # Shingler is configured straight from its config section
    self.shingler = get_default_shingler(**cfg['shingler'])

    # Sketch comparison is switched off in this revision
    self.sketch_enabled = False
    self.cluster_builder = Cluster(min_support=self.min_support)
def __init__(self, width=12, bandwidth=3, lsh_scheme="a0",
             universe_size=None, kmin=1, seed=0):
    """Create a MinHash signer and pass it to the parent initializer.

    An ``LSHC`` hasher is only attached when ``bandwidth`` is greater
    than one; otherwise the signer is built with ``lsh_hasher=None``.

    :param width: Number of bands
    :type width: int
    :param bandwidth: Number of rows per band
    :type bandwidth: int
    :param lsh_scheme: Adjusts number of combinatorial bands
    :type lsh_scheme: str
    :param universe_size: A prime number of size close to token
                          universe cardinality
    :type universe_size: long
    :param kmin: forwarded unchanged to ``MinHashSignature``
    :param seed: forwarded unchanged to ``MinHashSignature``
    """
    if bandwidth > 1:
        hasher = LSHC(bandwidth, width=width, scheme=lsh_scheme)
    else:
        hasher = None

    minhash_signer = MinHashSignature(
        width,
        lsh_hasher=hasher,
        universe_size=universe_size,
        kmin=kmin,
        seed=seed)
    super(MinHashCluster, self).__init__(signer=minhash_signer)
def test_signature_similarity(self):
    """The probability that two sets' signatures match at a given index
    is equal to the Jaccard similarity between the two sets.

    Averages the absolute difference between the true Jaccard similarity
    and the signature similarity over ``n_tests`` random pairs, and
    asserts that the average stays within the expected error bound.
    """
    n_tests = 100
    # Single source of truth for the signature width, so the signer and
    # the similarity computation cannot drift apart.
    dim = 10 * 10
    expected_error = 1.0 / 10  # Expected error is O(1/sqrt(dim))
    mh = MinHashSignature(dim)
    err = 0.0

    # ``range`` (not ``xrange``) keeps the test runnable on Python 2 and 3
    for _ in range(n_tests):
        # Create random sets and their signatures
        sets = (randset(), randset())
        sigs = [mh.get_signature(s) for s in sets]

        # Calculate true Jaccard similarity, and sim of signatures
        jsim = jaccard_sim(*sets)
        ssim = sigsim(*sigs, dim=dim)

        # Accumulate error
        err += abs(jsim - ssim)

    # Over n_tests large, we should be within upper bound of expected error
    avg_err = err / n_tests
    self.assertGreaterEqual(
        expected_error, avg_err,
        msg="Accuracy test failed. (avg error: %f)" % avg_err)
def __init__(self, cfg, content_filter=None, opts=None):
    """Read configuration and build signer, shinglers and cluster builder.

    The caller's ``cfg`` is not modified: shingler options are merged
    with the shared normalizer/tokenizer kwargs in private copies.

    :param cfg: configuration mapping; the keys read here are
        'min_support', 'sig_width', 'lsh_options', 'kmin', 'shingler'
        and 'sketch'
    :param content_filter: stored unchanged on the instance
    :param opts: optional mapping merged (via ``deepupdate``) over the
        default ``normalizer`` / ``tokenizer`` shingler kwargs
    :raises RuntimeError: if sketches are enabled and
        ``cfg['sketch']['algorithm']`` is not an attribute of
        ``SketchModel``
    """
    self.cfg = cfg

    # Keyword arguments shared by the key shingler and sketch shingler
    common_kwargs = dict(
        normalizer=HTMLNormalizer(),
        tokenizer=RegexTokenizer()
    )
    deepupdate(common_kwargs, opts or {})

    # Set options
    self.content_filter = content_filter
    min_support = cfg['min_support']

    # Configure minhash signer
    sig_width = cfg['sig_width']
    lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
    self.signer = MinHashSignature(sig_width,
                                   lsh_hasher=lsh_hasher,
                                   kmin=cfg['kmin'])

    # Configure shingler. Merge into a copy: updating cfg['shingler'] in
    # place would leak normalizer/tokenizer objects into the caller's
    # configuration.
    shingler_kwargs = dict(cfg['shingler'])
    shingler_kwargs.update(common_kwargs)
    self.shingler = Shingler(**shingler_kwargs)

    # Configure sketch comparison algorithm
    cfg_sketch = cfg['sketch']
    self.sketch_enabled = cfg_sketch['enabled']
    sketch_dist_fn = None
    xor_threshold = None
    if self.sketch_enabled:
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        # NOTE(review): 'size' is multiplied by 8, so it is presumably
        # expressed in bytes -- confirm against the config schema.
        sketch_bits = cfg_sketch['size'] * 8

        # Same copy-then-merge treatment as the key shingler above
        sketch_shingler_kwargs = dict(cfg_sketch['shingler'])
        sketch_shingler_kwargs.update(common_kwargs)
        self.sketch_shingler = Shingler(**sketch_shingler_kwargs)

        if sketch_algorithm == SketchModel.simhash:
            self.sketch_signer = SimHashSignature(bit_depth=sketch_bits)
        elif sketch_algorithm == SketchModel.minhash:
            self.sketch_signer = MinHashSketchSignature(sketch_bits)

        # Largest distance allowed between sketches, derived from the
        # configured resemblance fraction
        xor_threshold = \
            int(floor(sketch_bits *
                      (1.0 - float(cfg_sketch['resemblance']))))
        sketch_dist_fn = hamming

    self.cluster_builder = Cluster(sketch_dist_fn=sketch_dist_fn,
                                   max_dist=xor_threshold,
                                   min_support=min_support)
def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, seed=None, min_support=None,
             normalizer=None, tokenizer=None):
    """Assemble normalizer, tokenizer, signer, shingler and cluster builder.

    Values passed explicitly (``seed``, ``min_support``, ``normalizer``,
    ``tokenizer``) win over the corresponding entries of ``cfg``.

    :param cfg: configuration mapping; reads 'random_state', 'signer',
        'shingler' and, optionally, 'preprocessor'
    :param content_filter: stored unchanged on the instance
    :param trace_every: stored unchanged on the instance
    :param get_body: optional accessor callback, stored unchanged
    :param get_label: optional accessor callback, stored unchanged
    :param get_prefix: optional accessor callback, stored unchanged
    :param seed: used instead of ``cfg['random_state']`` when not None
    :param min_support: used instead of
        ``cfg['signer']['min_support']`` when not None
    :param normalizer: used instead of the default normalizer when given
    :param tokenizer: used instead of the default tokenizer when given
    """
    self.cfg = cfg
    self._get_body, self._get_label, self._get_prefix = \
        get_body, get_label, get_prefix
    self.trace_every = trace_every
    self.content_filter = content_filter

    # Seed and support threshold: explicit argument first, config second
    self.random_state = seed if seed is not None else cfg['random_state']
    signer_section = cfg['signer']
    self.min_support = (min_support if min_support is not None
                        else signer_section['min_support'])

    # Normalizer and tokenizer default to the package-provided ones
    if normalizer is not None:
        self.normalizer = normalizer
    else:
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {}))
    if tokenizer is not None:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = get_default_tokenizer()

    # MinHash signer backed by an LSH hasher of the same width and seed
    signature_width = signer_section['width']
    self.signer = MinHashSignature(
        signature_width,
        lsh_hasher=LSHC(width=signature_width,
                        seed=self.random_state,
                        **signer_section['lsh']),
        universe_size=signer_section['universe_size'],
        kmin=signer_section['kmin'],
        seed=self.random_state)

    # Shingler options come straight from the config
    shingler_section = cfg['shingler']
    self.shingler = get_default_shingler(**shingler_section)

    # No sketch comparison in this revision
    self.sketch_enabled = False
    self.cluster_builder = Cluster(min_support=self.min_support)
class HDClustering(object):
    """MinHash/LSH clustering driver usable in-process or as map/reduce."""

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, seed=None,
                 min_support=None, normalizer=None, tokenizer=None):
        """Build the clustering pipeline from a configuration mapping.

        Explicitly passed ``seed``, ``min_support``, ``normalizer`` and
        ``tokenizer`` take precedence over what ``cfg`` would supply.
        """
        self.cfg = cfg

        # Accessor callbacks and tracing interval are kept verbatim
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix
        self.trace_every = trace_every

        self.content_filter = content_filter

        # The config is only consulted when no explicit value was given
        if seed is None:
            seed = cfg['random_state']
        self.random_state = seed

        signer_cfg = cfg['signer']
        if min_support is None:
            min_support = signer_cfg['min_support']
        self.min_support = min_support

        # Text preprocessing: fall back to the package defaults
        if normalizer is None:
            normalizer_kwargs = \
                cfg.get('preprocessor', {}).get('normalizer', {})
            normalizer = get_default_normalizer(**normalizer_kwargs)
        self.normalizer = normalizer

        if tokenizer is None:
            tokenizer = get_default_tokenizer()
        self.tokenizer = tokenizer

        # MinHash signer; the LSH hasher shares its width and seed
        width = signer_cfg['width']
        hasher = LSHC(width=width, seed=self.random_state,
                      **signer_cfg['lsh'])
        self.signer = MinHashSignature(
            width,
            lsh_hasher=hasher,
            universe_size=signer_cfg['universe_size'],
            kmin=signer_cfg['kmin'],
            seed=self.random_state)

        # Shingler is configured straight from its config section
        self.shingler = get_default_shingler(**cfg['shingler'])

        # Sketch comparison is switched off in this revision
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)

    def _map_iter(self, data):
        """Yield the features produced by ``_map_item`` for each object.

        When no label accessor is configured, the object's position in
        ``data`` is used as its label.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix
        for position, item in enumerate(data):
            body = body_of(item) if body_of is not None else item
            label = label_of(item) if label_of is not None else position
            prefix = prefix_of(item) if prefix_of is not None else None
            for feature in self._map_item(item, body, label, prefix):
                yield feature

    def _map_item(self, obj, body, label, prefix=None):
        """Yield ``(keys, (label, None))`` for one object.

        Nothing is yielded when the content filter accepts the object.
        The text is taken from ``obj['content']``; ``body`` is not read.
        """
        source = MessageSource.source(obj)
        normalized, meta = self.normalizer.normalize(obj['content'])
        tokens = self.tokenizer.tokenize(normalized)

        if self.content_filter is None:
            accepted = False
        else:
            accepted, rule_score = self.content_filter.accept(
                obj,
                content_tokens=tokens,
                urls=meta.get('url_components', []),
                src=source)

        # Objects accepted by the filter are left out of clustering
        if accepted:
            return

        shingles = self.shingler.get_shingles(tokens, prefix=prefix)
        signature = self.signer.get_signature(shingles)
        yield (signature, (label, None))

    def clusters_from_iter(self, data):
        """Feed every object of ``data`` to the builder; return clusters."""
        builder = self.cluster_builder
        interval = self.trace_every
        for count, feature in enumerate(self._map_iter(data)):
            if interval > 0 and count % interval == 0:
                LOG.info("Processing line " + str(count))
            keys, val = feature
            if isinstance(val, tuple):
                label, sketch = val
            else:
                label, sketch = val, None
            builder.add_item(keys, label=label, sketch=sketch)
        return builder.get_clusters()

    def mapper(self, obj):
        """Map step: yield one ``(key, value)`` pair per signature key.

        Unlike ``_map_iter``, the object itself serves as the label when
        no label accessor is configured.
        """
        body_of = self._get_body
        label_of = self._get_label
        prefix_of = self._get_prefix
        body = body_of(obj) if body_of is not None else obj
        label = label_of(obj) if label_of is not None else obj
        prefix = prefix_of(obj) if prefix_of is not None else None
        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Reduce step: return ``key`` with its de-duplicated values.

        The values are collected through a set, so their order in the
        returned list is not defined.
        """
        distinct = set(tuple_gen)
        return key, list(distinct)
class HDClustering(object):
    """MinHash/LSH clustering driver usable in-process or as map/reduce."""

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None, seed=None,
                 min_support=None, normalizer=None, tokenizer=None):
        """Assemble normalizer, tokenizer, signer, shingler and builder.

        Values passed explicitly (``seed``, ``min_support``,
        ``normalizer``, ``tokenizer``) win over the entries of ``cfg``.
        """
        self.cfg = cfg
        self._get_body, self._get_label, self._get_prefix = \
            get_body, get_label, get_prefix
        self.trace_every = trace_every
        self.content_filter = content_filter

        # Seed and support threshold: explicit argument first
        self.random_state = \
            seed if seed is not None else cfg['random_state']
        signer_section = cfg['signer']
        self.min_support = (min_support if min_support is not None
                            else signer_section['min_support'])

        # Normalizer and tokenizer default to the package-provided ones
        if normalizer is not None:
            self.normalizer = normalizer
        else:
            self.normalizer = get_default_normalizer(
                **cfg.get('preprocessor', {}).get('normalizer', {}))
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = get_default_tokenizer()

        # MinHash signer backed by an LSH hasher of equal width and seed
        signature_width = signer_section['width']
        self.signer = MinHashSignature(
            signature_width,
            lsh_hasher=LSHC(width=signature_width,
                            seed=self.random_state,
                            **signer_section['lsh']),
            universe_size=signer_section['universe_size'],
            kmin=signer_section['kmin'],
            seed=self.random_state)

        # Shingler options come straight from the config
        shingler_section = cfg['shingler']
        self.shingler = get_default_shingler(**shingler_section)

        # No sketch comparison in this revision
        self.sketch_enabled = False
        self.cluster_builder = Cluster(min_support=self.min_support)

    def _map_iter(self, data):
        """Run ``_map_item`` over ``data`` and yield every feature.

        The enumeration index becomes the label when no label accessor
        was supplied to the constructor.
        """
        fetch_body = self._get_body
        fetch_label = self._get_label
        fetch_prefix = self._get_prefix
        for idx, record in enumerate(data):
            if fetch_body is None:
                body = record
            else:
                body = fetch_body(record)
            if fetch_label is None:
                label = idx
            else:
                label = fetch_label(record)
            if fetch_prefix is None:
                prefix = None
            else:
                prefix = fetch_prefix(record)
            for feature in self._map_item(record, body, label, prefix):
                yield feature

    def _map_item(self, obj, body, label, prefix=None):
        """Yield ``(keys, (label, None))`` unless the filter accepts obj.

        Content is read from ``obj['content']``; the ``body`` argument
        is not used here.
        """
        src = MessageSource.source(obj)
        raw_content = obj['content']
        normalized_content, meta = self.normalizer.normalize(raw_content)
        content_tokens = self.tokenizer.tokenize(normalized_content)

        rule_accept = False
        if self.content_filter is not None:
            rule_accept, rule_score = self.content_filter.accept(
                obj,
                content_tokens=content_tokens,
                urls=meta.get('url_components', []),
                src=src)

        # Only objects the filter did not accept are signed
        if not rule_accept:
            shingles = self.shingler.get_shingles(content_tokens,
                                                  prefix=prefix)
            yield (self.signer.get_signature(shingles), (label, None))

    def clusters_from_iter(self, data):
        """Add all features of ``data`` to the builder; return clusters."""
        builder = self.cluster_builder
        every = self.trace_every
        for n, feature in enumerate(self._map_iter(data)):
            if every > 0 and n % every == 0:
                LOG.info("Processing line " + str(n))
            keys, val = feature
            # Values may arrive as a bare label or a (label, sketch) pair
            if not isinstance(val, tuple):
                val = (val, None)
            label, sketch = val
            builder.add_item(keys, label=label, sketch=sketch)
        return builder.get_clusters()

    def mapper(self, obj):
        """Map step: emit ``(key, value)`` for each key of the signature.

        When no label accessor is configured the object is its own label.
        """
        fetch_body = self._get_body
        fetch_label = self._get_label
        fetch_prefix = self._get_prefix
        body = obj if fetch_body is None else fetch_body(obj)
        label = obj if fetch_label is None else fetch_label(obj)
        prefix = None if fetch_prefix is None else fetch_prefix(obj)
        for keys, val in self._map_item(obj, body, label, prefix):
            for single_key in keys:
                yield single_key, val

    def reducer(self, key, tuple_gen):
        """Reduce step: pair ``key`` with the unique values seen for it.

        De-duplication goes through a set, so the order of the returned
        list is unspecified.
        """
        unique_values = list(set(tuple_gen))
        return key, unique_values
class HDClustering(object):
    """MinHash/LSH clustering with an optional sketch comparison step."""

    def __init__(self, cfg, content_filter=None, opts=None):
        """Read configuration and build signer, shinglers and builder.

        The caller's ``cfg`` is not modified: shingler options are
        merged with the shared normalizer/tokenizer kwargs in copies.

        :param cfg: configuration mapping; the keys read here are
            'min_support', 'sig_width', 'lsh_options', 'kmin',
            'shingler' and 'sketch'
        :param content_filter: stored unchanged on the instance
        :param opts: optional mapping merged (via ``deepupdate``) over
            the default ``normalizer`` / ``tokenizer`` shingler kwargs
        :raises RuntimeError: if sketches are enabled and
            ``cfg['sketch']['algorithm']`` is not an attribute of
            ``SketchModel``
        """
        self.cfg = cfg

        # Keyword arguments shared by both shinglers
        common_kwargs = dict(
            normalizer=HTMLNormalizer(),
            tokenizer=RegexTokenizer()
        )
        deepupdate(common_kwargs, opts or {})

        # Set options
        self.content_filter = content_filter
        min_support = cfg['min_support']

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler. Merge into a copy: updating
        # cfg['shingler'] in place would leak normalizer/tokenizer
        # objects into the caller's configuration.
        shingler_kwargs = dict(cfg['shingler'])
        shingler_kwargs.update(common_kwargs)
        self.shingler = Shingler(**shingler_kwargs)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        sketch_dist_fn = None
        xor_threshold = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            # NOTE(review): 'size' is multiplied by 8, so it is
            # presumably expressed in bytes -- confirm against the schema.
            sketch_bits = cfg_sketch['size'] * 8

            # Same copy-then-merge treatment as the key shingler above
            sketch_shingler_kwargs = dict(cfg_sketch['shingler'])
            sketch_shingler_kwargs.update(common_kwargs)
            self.sketch_shingler = Shingler(**sketch_shingler_kwargs)

            if sketch_algorithm == SketchModel.simhash:
                self.sketch_signer = SimHashSignature(bit_depth=sketch_bits)
            elif sketch_algorithm == SketchModel.minhash:
                self.sketch_signer = MinHashSketchSignature(sketch_bits)

            # Largest distance allowed between sketches, derived from
            # the configured resemblance fraction
            xor_threshold = \
                int(floor(sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            sketch_dist_fn = hamming

        self.cluster_builder = Cluster(sketch_dist_fn=sketch_dist_fn,
                                       max_dist=xor_threshold,
                                       min_support=min_support)

    def clusters_from_iter(self, data, get_body=None, get_label=None,
                           get_prefix=None):
        """Find clusters in an iterable.

        :param data: iterable of objects to cluster
        :param get_body: optional callable extracting the body from an
            object; the object itself is used when None
        :param get_label: optional callable extracting the label; the
            enumeration index is used when None
        :param get_prefix: optional callable extracting the shingle
            prefix; no prefix is used when None
        :returns: whatever ``self.cluster_builder.get_clusters()`` returns
        """
        cluster_builder = self.cluster_builder
        for i, obj in enumerate(data):
            if not i % 1000:
                # Parenthesized single argument: valid, and prints the
                # same text, under both Python 2 and Python 3.
                print("Processing line " + str(i))
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)

            # Step 1: Extract features (objects accepted by the content
            # filter are skipped entirely)
            if self.content_filter is None or \
                    not self.content_filter.accept(obj):
                features = self.shingler.get_shingles(body, prefix=prefix)
                keys = self.signer.get_signature(features)
                if self.sketch_enabled:
                    sketch_features = self.sketch_shingler.get_shingles(body)
                    sketch = self.sketch_signer.get_signature(
                        sketch_features)
                else:
                    sketch = None

                # Step 2: Cluster given keys, sketch
                cluster_builder.add_set(keys, label=label, sketch=sketch)

        return cluster_builder.get_clusters()
def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, min_support=None, seed=0,
             normalizer=None, tokenizer=None):
    """Read configuration and build the clustering pipeline.

    The caller's ``cfg`` is left untouched, so the same mapping can be
    used to construct several instances.

    :param cfg: configuration mapping; the keys read here are
        'min_support', 'sig_width', 'lsh_options', 'kmin', 'shingler',
        'sketch' and, optionally, 'preprocessor'
    :param content_filter: stored unchanged on the instance
    :param trace_every: stored unchanged on the instance
    :param get_body: optional accessor callback, stored unchanged
    :param get_label: optional accessor callback, stored unchanged
    :param get_prefix: optional accessor callback, stored unchanged
    :param min_support: overrides ``cfg['min_support']`` when not None
    :param seed: forwarded to the sketch signer
    :param normalizer: replaces the default normalizer when not None
    :param tokenizer: replaces the default tokenizer when not None
    :raises RuntimeError: if sketches are enabled and
        ``cfg['sketch']['algorithm']`` is not an attribute of
        ``SketchModel``
    """
    self.cfg = cfg
    self._get_body = get_body
    self._get_label = get_label
    self._get_prefix = get_prefix
    self.trace_every = trace_every

    # Set options
    self.content_filter = content_filter
    self.min_support = \
        cfg['min_support'] if min_support is None else min_support

    # normalizer and tokenizer
    self.normalizer = get_default_normalizer(
        **cfg.get('preprocessor', {}).get('normalizer', {})) \
        if normalizer is None else normalizer
    self.tokenizer = \
        get_default_tokenizer() if tokenizer is None else tokenizer

    # Configure minhash signer
    sig_width = cfg['sig_width']
    lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
    self.signer = MinHashSignature(sig_width,
                                   lsh_hasher=lsh_hasher,
                                   kmin=cfg['kmin'])

    # Configure shingler
    cfg_key_shingle = cfg['shingler']
    self.shingler = get_default_shingler(**cfg_key_shingle)

    # Configure sketch comparison algorithm
    cfg_sketch = cfg['sketch']
    self.sketch_enabled = cfg_sketch['enabled']
    self.sketch_dist_fn = None
    self.max_dist = None
    if self.sketch_enabled:
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        self.sketch_bits = cfg_sketch['size']

        # Work on a copy: deleting 'enabled' from the caller's config
        # would make a second construction from the same cfg fail with
        # a KeyError on that key.
        shingler_kwargs = dict(cfg_sketch['shingler'])
        shingler_enabled = shingler_kwargs.pop('enabled')

        if not shingler_enabled:
            # if sketch shingler is disabled, we also disable signer
            # as we will use default signer
            self.sketch_shingler = None
            self.sketch_signer = None
        elif sketch_algorithm == SketchModel.simhash:
            self.sketch_shingler = Shingler(**shingler_kwargs)
            self.sketch_signer = SimHashSignature(self.sketch_bits,
                                                  seed=seed)
        elif sketch_algorithm == SketchModel.minhash:
            self.sketch_shingler = Shingler(**shingler_kwargs)
            self.sketch_signer = MinHashSketchSignature(self.sketch_bits,
                                                        seed=seed)

        # A disabled sketch shingler is None and has nothing to reset.
        # NOTE(review): the tokenizer/normalizer are cleared presumably
        # because the shingler receives pre-tokenized input -- confirm
        # against the call site.
        if self.sketch_shingler is not None:
            self.sketch_shingler._tokenizer = None
            self.sketch_shingler._normalizer = None

        self.max_dist = \
            int(floor(self.sketch_bits *
                      (1.0 - float(cfg_sketch['resemblance']))))
        self.sketch_dist_fn = hamming

    # Resolved whether or not sketches are enabled, because the Cluster
    # call below reads the attribute unconditionally.
    self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
    self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                   max_dist=self.max_dist,
                                   min_support=self.min_support,
                                   sketch_operator=self.sketch_operator)
class HDClustering(object):
    """MinHash/LSH clustering with optional sketch-based comparison."""

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None,
                 min_support=None, seed=0, normalizer=None, tokenizer=None):
        """Read configuration and build the clustering pipeline.

        The caller's ``cfg`` is left untouched, so the same mapping can
        be used to construct several instances.

        :param cfg: configuration mapping; the keys read here are
            'min_support', 'sig_width', 'lsh_options', 'kmin',
            'shingler', 'sketch' and, optionally, 'preprocessor'
        :param content_filter: optional filter; objects it accepts are
            excluded from clustering
        :param trace_every: log progress every this many features
            (no logging when not positive)
        :param get_body: optional callable extracting the body
        :param get_label: optional callable extracting the label
        :param get_prefix: optional callable extracting the prefix
        :param min_support: overrides ``cfg['min_support']`` when given
        :param seed: forwarded to the sketch signer
        :param normalizer: replaces the default normalizer when given
        :param tokenizer: replaces the default tokenizer when given
        :raises RuntimeError: if sketches are enabled and
            ``cfg['sketch']['algorithm']`` is not an attribute of
            ``SketchModel``
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix
        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = \
            cfg['min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = \
            get_default_tokenizer() if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            self.sketch_bits = cfg_sketch['size']

            # Work on a copy: deleting 'enabled' from the caller's
            # config would make a second construction from the same cfg
            # fail with a KeyError on that key.
            shingler_kwargs = dict(cfg_sketch['shingler'])
            shingler_enabled = shingler_kwargs.pop('enabled')

            if not shingler_enabled:
                # if sketch shingler is disabled, we also disable signer
                # as we will use default signer
                self.sketch_shingler = None
                self.sketch_signer = None
            elif sketch_algorithm == SketchModel.simhash:
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = SimHashSignature(self.sketch_bits,
                                                      seed=seed)
            elif sketch_algorithm == SketchModel.minhash:
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = MinHashSketchSignature(
                    self.sketch_bits, seed=seed)

            # _map_item passes already-tokenized content to the sketch
            # shingler, so its own tokenizer/normalizer are cleared.
            # A disabled shingler is None and has nothing to reset.
            if self.sketch_shingler is not None:
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved whether or not sketches are enabled, because the
        # Cluster call below reads the attribute unconditionally.
        self.sketch_operator = \
            OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)

    def _map_iter(self, data):
        """Yield the features of every object in ``data``.

        The enumeration index is used as the label when no label
        accessor was configured.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix
        for i, obj in enumerate(data):
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)
            for feat in self._map_item(obj, body, label, prefix):
                yield feat

    def _map_item(self, obj, body, label, prefix=None):
        """Yield ``(keys, (label, sketch))`` for one object.

        Nothing is yielded when the content filter accepts the object.
        The text is read from ``obj['content']``; ``body`` is not used.
        ``sketch`` is None when sketches are disabled.
        """
        # Extract features
        src = MessageSource.source(obj)
        obj_content = obj['content']
        normalized_content, meta = self.normalizer.normalize(obj_content)
        content_tokens = self.tokenizer.tokenize(normalized_content)

        if self.content_filter is not None:
            rule_accept, _ = self.content_filter.accept(
                obj,
                content_tokens=content_tokens,
                urls=meta.get('url_components', []),
                src=src)
        else:
            rule_accept = False

        if not rule_accept:
            features = self.shingler.get_shingles(content_tokens,
                                                  prefix=prefix)
            if not self.sketch_enabled:
                keys = self.signer.get_signature(features)
                sketch = None
            elif self.sketch_shingler is None or self.sketch_signer is None:
                # No dedicated sketch pipeline: the main signer
                # produces the sketch along with the keys
                keys, sketch = self.signer.get_signature(features,
                                                         with_sketch=True)
            else:
                keys = self.signer.get_signature(features)
                sketch_features = self.sketch_shingler.get_shingles(
                    content_tokens)
                sketch = self.sketch_signer.get_signature(sketch_features)
            yield (keys, (label, sketch))

    def clusters_from_iter(self, data):
        """Find clusters in an iterable"""
        cluster_builder = self.cluster_builder
        trace_every = self.trace_every
        for i, obj in enumerate(self._map_iter(data)):
            if trace_every > 0 and (not i % trace_every):
                LOG.info("Processing line " + str(i))
            keys, val = obj
            # Values may be a bare label or a (label, sketch) pair
            label, sketch = val \
                if isinstance(val, tuple) \
                else (val, None)
            cluster_builder.add_item(keys, label=label, sketch=sketch)
        return cluster_builder.get_clusters()

    def mapper(self, obj):
        """Perform a mapper task in MR

        Yields one ``(key, value)`` pair per signature key. The object
        itself is the label when no label accessor was configured.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix
        body = obj if get_body is None else get_body(obj)
        label = obj if get_label is None else get_label(obj)
        prefix = None if get_prefix is None else get_prefix(obj)
        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Perform a reducer task in MR

        If sketches enabled, data consists of:
            (key, [(lbl, sk), (lbl, sk), (lbl, sk)])
        Otherwise:
            (key, [lbl, lbl, lbl])
        """
        # If not using sketches, we are done
        if self.sketch_dist_fn is None:
            return key, list(set(tuple_gen))

        # create a dict mapping a label to a sketch; a later pair with
        # the same label replaces an earlier one
        return key, dict(tuple_gen).items()
def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, min_support=None, seed=0,
             normalizer=None, tokenizer=None):
    """Read configuration and build the clustering pipeline.

    The caller's ``cfg`` is left untouched, so the same mapping can be
    used to construct several instances.

    :param cfg: configuration mapping; the keys read here are
        'min_support', 'sig_width', 'lsh_options', 'kmin', 'shingler',
        'sketch' and, optionally, 'preprocessor'
    :param content_filter: stored unchanged on the instance
    :param trace_every: stored unchanged on the instance
    :param get_body: optional accessor callback, stored unchanged
    :param get_label: optional accessor callback, stored unchanged
    :param get_prefix: optional accessor callback, stored unchanged
    :param min_support: overrides ``cfg['min_support']`` when not None
    :param seed: forwarded to the sketch signer
    :param normalizer: replaces the default normalizer when not None
    :param tokenizer: replaces the default tokenizer when not None
    :raises RuntimeError: if sketches are enabled and
        ``cfg['sketch']['algorithm']`` is not an attribute of
        ``SketchModel``
    """
    self.cfg = cfg
    self._get_body = get_body
    self._get_label = get_label
    self._get_prefix = get_prefix
    self.trace_every = trace_every

    # Set options
    self.content_filter = content_filter
    self.min_support = \
        cfg['min_support'] if min_support is None else min_support

    # normalizer and tokenizer
    self.normalizer = get_default_normalizer(
        **cfg.get('preprocessor', {}).get('normalizer', {})) \
        if normalizer is None else normalizer
    self.tokenizer = \
        get_default_tokenizer() if tokenizer is None else tokenizer

    # Configure minhash signer
    sig_width = cfg['sig_width']
    lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
    self.signer = MinHashSignature(sig_width,
                                   lsh_hasher=lsh_hasher,
                                   kmin=cfg['kmin'])

    # Configure shingler
    cfg_key_shingle = cfg['shingler']
    self.shingler = get_default_shingler(**cfg_key_shingle)

    # Configure sketch comparison algorithm
    cfg_sketch = cfg['sketch']
    self.sketch_enabled = cfg_sketch['enabled']
    self.sketch_dist_fn = None
    self.max_dist = None
    if self.sketch_enabled:
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        self.sketch_bits = cfg_sketch['size']

        # Work on a copy: deleting 'enabled' from the caller's config
        # would make a second construction from the same cfg fail with
        # a KeyError on that key.
        shingler_kwargs = dict(cfg_sketch['shingler'])
        shingler_enabled = shingler_kwargs.pop('enabled')

        if not shingler_enabled:
            # if sketch shingler is disabled, we also disable signer
            # as we will use default signer
            self.sketch_shingler = None
            self.sketch_signer = None
        elif sketch_algorithm == SketchModel.simhash:
            self.sketch_shingler = Shingler(**shingler_kwargs)
            self.sketch_signer = SimHashSignature(self.sketch_bits,
                                                  seed=seed)
        elif sketch_algorithm == SketchModel.minhash:
            self.sketch_shingler = Shingler(**shingler_kwargs)
            self.sketch_signer = MinHashSketchSignature(self.sketch_bits,
                                                        seed=seed)

        # A disabled sketch shingler is None and has nothing to reset.
        # NOTE(review): the tokenizer/normalizer are cleared presumably
        # because the shingler receives pre-tokenized input -- confirm
        # against the call site.
        if self.sketch_shingler is not None:
            self.sketch_shingler._tokenizer = None
            self.sketch_shingler._normalizer = None

        self.max_dist = \
            int(floor(self.sketch_bits *
                      (1.0 - float(cfg_sketch['resemblance']))))
        self.sketch_dist_fn = hamming

    # Resolved whether or not sketches are enabled, because the Cluster
    # call below reads the attribute unconditionally.
    self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
    self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                   max_dist=self.max_dist,
                                   min_support=self.min_support,
                                   sketch_operator=self.sketch_operator)
class HDClustering(object):
    """MinHash/LSH clustering with optional sketch-based comparison."""

    def __init__(self, cfg, content_filter=None, trace_every=0,
                 get_body=None, get_label=None, get_prefix=None,
                 min_support=None, seed=0, normalizer=None, tokenizer=None):
        """Read configuration and build the clustering pipeline.

        The caller's ``cfg`` is left untouched, so the same mapping can
        be used to construct several instances.

        :param cfg: configuration mapping; the keys read here are
            'min_support', 'sig_width', 'lsh_options', 'kmin',
            'shingler', 'sketch' and, optionally, 'preprocessor'
        :param content_filter: optional filter; objects it accepts are
            excluded from clustering
        :param trace_every: log progress every this many features
            (no logging when not positive)
        :param get_body: optional callable extracting the body
        :param get_label: optional callable extracting the label
        :param get_prefix: optional callable extracting the prefix
        :param min_support: overrides ``cfg['min_support']`` when given
        :param seed: forwarded to the sketch signer
        :param normalizer: replaces the default normalizer when given
        :param tokenizer: replaces the default tokenizer when given
        :raises RuntimeError: if sketches are enabled and
            ``cfg['sketch']['algorithm']`` is not an attribute of
            ``SketchModel``
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix
        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = \
            cfg['min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = \
            get_default_tokenizer() if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'"
                                   % algorithm_name)
            self.sketch_bits = cfg_sketch['size']

            # Work on a copy: deleting 'enabled' from the caller's
            # config would make a second construction from the same cfg
            # fail with a KeyError on that key.
            shingler_kwargs = dict(cfg_sketch['shingler'])
            shingler_enabled = shingler_kwargs.pop('enabled')

            if not shingler_enabled:
                # if sketch shingler is disabled, we also disable signer
                # as we will use default signer
                self.sketch_shingler = None
                self.sketch_signer = None
            elif sketch_algorithm == SketchModel.simhash:
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = SimHashSignature(self.sketch_bits,
                                                      seed=seed)
            elif sketch_algorithm == SketchModel.minhash:
                self.sketch_shingler = Shingler(**shingler_kwargs)
                self.sketch_signer = MinHashSketchSignature(
                    self.sketch_bits, seed=seed)

            # _map_item passes already-tokenized content to the sketch
            # shingler, so its own tokenizer/normalizer are cleared.
            # A disabled shingler is None and has nothing to reset.
            if self.sketch_shingler is not None:
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming

        # Resolved whether or not sketches are enabled, because the
        # Cluster call below reads the attribute unconditionally.
        self.sketch_operator = \
            OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)

    def _map_iter(self, data):
        """Yield the features of every object in ``data``.

        The enumeration index is used as the label when no label
        accessor was configured.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix
        for i, obj in enumerate(data):
            body = obj if get_body is None else get_body(obj)
            label = i if get_label is None else get_label(obj)
            prefix = None if get_prefix is None else get_prefix(obj)
            for feat in self._map_item(obj, body, label, prefix):
                yield feat

    def _map_item(self, obj, body, label, prefix=None):
        """Yield ``(keys, (label, sketch))`` for one object.

        Nothing is yielded when the content filter accepts the object.
        The text is read from ``obj['content']``; ``body`` is not used.
        ``sketch`` is None when sketches are disabled.
        """
        # Extract features
        src = MessageSource.source(obj)
        obj_content = obj['content']
        normalized_content, meta = self.normalizer.normalize(obj_content)
        content_tokens = self.tokenizer.tokenize(normalized_content)

        if self.content_filter is not None:
            rule_accept, _ = self.content_filter.accept(
                obj,
                content_tokens=content_tokens,
                urls=meta.get('url_components', []),
                src=src)
        else:
            rule_accept = False

        if not rule_accept:
            features = self.shingler.get_shingles(content_tokens,
                                                  prefix=prefix)
            if not self.sketch_enabled:
                keys = self.signer.get_signature(features)
                sketch = None
            elif self.sketch_shingler is None or self.sketch_signer is None:
                # No dedicated sketch pipeline: the main signer
                # produces the sketch along with the keys
                keys, sketch = self.signer.get_signature(features,
                                                         with_sketch=True)
            else:
                keys = self.signer.get_signature(features)
                sketch_features = self.sketch_shingler.get_shingles(
                    content_tokens)
                sketch = self.sketch_signer.get_signature(sketch_features)
            yield (keys, (label, sketch))

    def clusters_from_iter(self, data):
        """Find clusters in an iterable"""
        cluster_builder = self.cluster_builder
        trace_every = self.trace_every
        for i, obj in enumerate(self._map_iter(data)):
            if trace_every > 0 and (not i % trace_every):
                LOG.info("Processing line " + str(i))
            keys, val = obj
            # Values may be a bare label or a (label, sketch) pair
            label, sketch = val \
                if isinstance(val, tuple) \
                else (val, None)
            cluster_builder.add_item(keys, label=label, sketch=sketch)
        return cluster_builder.get_clusters()

    def mapper(self, obj):
        """Perform a mapper task in MR

        Yields one ``(key, value)`` pair per signature key. The object
        itself is the label when no label accessor was configured.
        """
        get_body = self._get_body
        get_label = self._get_label
        get_prefix = self._get_prefix
        body = obj if get_body is None else get_body(obj)
        label = obj if get_label is None else get_label(obj)
        prefix = None if get_prefix is None else get_prefix(obj)
        for keys, val in self._map_item(obj, body, label, prefix):
            for key in keys:
                yield key, val

    def reducer(self, key, tuple_gen):
        """Perform a reducer task in MR

        If sketches enabled, data consists of:
            (key, [(lbl, sk), (lbl, sk), (lbl, sk)])
        Otherwise:
            (key, [lbl, lbl, lbl])
        """
        # If not using sketches, we are done
        if self.sketch_dist_fn is None:
            return key, list(set(tuple_gen))

        # create a dict mapping a label to a sketch; a later pair with
        # the same label replaces an earlier one
        return key, dict(tuple_gen).items()
def test_consistent_signature(self):
    """Signing the same set twice must give equal signatures"""
    signer = MinHashSignature(10 * 10)
    sample = randset()
    first = signer.get_signature(sample)
    second = signer.get_signature(sample)
    self.assertEqual(first, second)
def test_signature_length(self):
    """A signature has as many entries as the signer's width"""
    signer = MinHashSignature(10 * 10)
    signature = signer.get_signature(randset())
    self.assertEqual(100, len(signature))