def test_word_shingler(self):
    """Shingle a sentence through a RegexTokenizer and check the tuples.

    With ``span=5, skip=1`` the asserted shingles hold every other token
    of a five-token window (e.g. "the", "brown", "jumps").
    """
    sentence = "the quick brown fox jumps over a lazy dog"

    # unique=True: only membership of one shingle is checked.
    unique_shingler = Shingler(span=5, skip=1, unique=True,
                               tokenizer=RegexTokenizer())
    unique_result = unique_shingler.get_shingles(sentence)
    self.assertIn(("jumps", "a", "dog"), unique_result)

    # unique=False: the result is indexed, so the first shingle is checked.
    plain_shingler = Shingler(span=5, skip=1, unique=False,
                              tokenizer=RegexTokenizer())
    plain_result = plain_shingler.get_shingles(sentence)
    self.assertEqual(("the", "brown", "jumps"), plain_result[0])
def test_shingler(self):
    """Shingle a plain string (no tokenizer given) and check the tuples.

    With ``span=5, skip=1`` the asserted shingles hold every other
    character of a five-character window (e.g. "a", "r", "c").
    """
    word = "abracadabra"

    # unique=True: only membership of one shingle is checked.
    unique_shingler = Shingler(span=5, skip=1, unique=True)
    unique_result = unique_shingler.get_shingles(word)
    self.assertIn(("d", "b", "a"), unique_result)

    # unique=False: the result is indexed, so the first shingle is checked.
    plain_shingler = Shingler(span=5, skip=1, unique=False)
    plain_result = plain_shingler.get_shingles(word)
    self.assertEqual(("a", "r", "c"), plain_result[0])
def test_bills(self):
    """Cluster the bills fixture; 97 clusters are expected.

    Each fixture line is split on '|' into a label and a text.
    """
    resource = get_resource_name('data/bills100.txt')
    with open(resource, 'r') as infile:
        records = []
        for raw_line in infile:
            records.append(raw_line.rstrip().split('|'))

    cluster = Cluster(width=20, bandwidth=5, seed=SEED)
    shingler = Shingler(span=3, tokenizer=RegexTokenizer())
    for record in records:
        # Each record must hold exactly two fields: label and text.
        label, text = record
        cluster.add_item(shingler.get_shingles(text), label)

    found = cluster.get_clusters()
    self.assertEqual(len(found), 97)
def test_names(self):
    """Should return 209 clusters of names.

    The names fixture is de-duplicated (one stripped line per name)
    before clustering; each name is used as its own label.
    """
    with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
        data = set(line.rstrip() for line in fhandle)

    cluster = Cluster(width=20, bandwidth=5, seed=SEED)
    shingler = Shingler(3)
    for name in data:
        shingles = shingler.get_shingles(name)
        cluster.add_item(shingles, name)

    clusters = cluster.get_clusters()
    self.assertEqual(len(clusters), 209)
def test_names_kmin_scheme(self):
    """Should return 176 clusters of names.

    Same fixture as the plain names test, but the Cluster is built with
    ``kmin=2`` and ``lsh_scheme="a1"``.
    """
    with open(get_resource_name('data/perrys.csv'), 'r') as fhandle:
        data = set(line.rstrip() for line in fhandle)

    cluster = Cluster(width=20, bandwidth=5, kmin=2, lsh_scheme="a1",
                      seed=SEED)
    shingler = Shingler(3)
    for name in data:
        shingles = shingler.get_shingles(name)
        cluster.add_item(shingles, name)

    clusters = cluster.get_clusters()
    self.assertEqual(len(clusters), 176)
def run_simulated_manually(filepath, lines_to_read=sys.maxint,
                           cluster_args=None):
    """Cluster a space-delimited data file and describe the clusters.

    Each line is read as ``<label> <text>``: the label is everything up
    to the first space and the text is the remainder (empty when the
    line has no space).

    :param filepath: resource path, resolved through get_resource_name
    :param lines_to_read: maximum number of lines to consume
    :param cluster_args: keyword arguments for Cluster (None means none)
    :returns: dict with a single key 'stats' holding the result of
              describe_clusters
    """
    # NOTE(review): sys.maxint exists only on Python 2; the default is
    # kept as-is so the signature stays unchanged.
    with open(get_resource_name(filepath), 'r') as fhandle:
        # Split on the first space only, so a text that itself contains
        # spaces stays in one piece instead of breaking the unpacking.
        data = [line.rstrip().split(' ', 1)
                for line in islice(fhandle, lines_to_read)]

    if cluster_args is None:
        cluster_args = dict()
    cluster = Cluster(**cluster_args)
    shingler = Shingler(span=3)

    for pair in data:
        label = pair[0]
        text = pair[1] if len(pair) > 1 else ''
        shingles = shingler.get_shingles(text)
        cluster.add_item(shingles, label)

    clusters = cluster.get_clusters()

    def is_label_positive(lbl):
        # A label counts as positive when it contains a colon.
        return ':' in lbl

    return dict(stats=describe_clusters(clusters, is_label_positive))
def get_default_shingler(**opts):
    """Build a Shingler from ``opts`` with normalizer and tokenizer unset.

    :param opts: keyword arguments forwarded to Shingler
    :returns: the new Shingler, whose ``_normalizer`` and ``_tokenizer``
              attributes have been set to None
    """
    instance = Shingler(**opts)
    for attr_name in ('_normalizer', '_tokenizer'):
        setattr(instance, attr_name, None)
    return instance
def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, min_support=None, seed=0,
             normalizer=None, tokenizer=None):
    """Read configuration and build the signer, shinglers and clusterer.

    :param cfg: configuration mapping. Keys read here: 'sig_width',
        'lsh_options', 'kmin', 'shingler', 'sketch', 'min_support'
        (only when ``min_support`` is None) and, optionally,
        'preprocessor'. The mapping is not modified.
    :param content_filter: stored as ``self.content_filter``
    :param trace_every: stored as ``self.trace_every``
    :param get_body: stored as ``self._get_body``
    :param get_label: stored as ``self._get_label``
    :param get_prefix: stored as ``self._get_prefix``
    :param min_support: overrides cfg['min_support'] when not None
    :param seed: seed handed to the sketch signer
    :param normalizer: overrides the default normalizer when not None
    :param tokenizer: overrides the default tokenizer when not None
    :raises RuntimeError: if sketching is enabled and the configured
        algorithm name is not an attribute of SketchModel
    """
    self.cfg = cfg
    self._get_body = get_body
    self._get_label = get_label
    self._get_prefix = get_prefix
    self.trace_every = trace_every

    # Set options
    self.content_filter = content_filter
    self.min_support = \
        cfg['min_support'] if min_support is None else min_support

    # normalizer and tokenizer
    if normalizer is None:
        normalizer_opts = cfg.get('preprocessor', {}).get('normalizer', {})
        normalizer = get_default_normalizer(**normalizer_opts)
    self.normalizer = normalizer
    self.tokenizer = \
        get_default_tokenizer() if tokenizer is None else tokenizer

    # Configure minhash signer
    sig_width = cfg['sig_width']
    lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
    self.signer = MinHashSignature(sig_width,
                                   lsh_hasher=lsh_hasher,
                                   kmin=cfg['kmin'])

    # Configure shingler
    cfg_key_shingle = cfg['shingler']
    self.shingler = get_default_shingler(**cfg_key_shingle)

    # Configure sketch comparison algorithm
    cfg_sketch = cfg['sketch']
    self.sketch_enabled = cfg_sketch['enabled']
    self.sketch_dist_fn = None
    self.max_dist = None
    if self.sketch_enabled:
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        self.sketch_bits = cfg_sketch['size']

        # Work on a copy: removing 'enabled' from the caller's dict would
        # make a second construction from the same cfg fail with KeyError.
        sketch_shingler_opts = dict(cfg_sketch['shingler'])
        sketch_shingler_enabled = sketch_shingler_opts.pop('enabled')

        if not sketch_shingler_enabled:
            # if sketch shingler is disabled, we also disable signer
            # as we will use default signer
            self.sketch_shingler = None
            self.sketch_signer = None
        elif sketch_algorithm == SketchModel.simhash:
            self.sketch_shingler = Shingler(**sketch_shingler_opts)
            self.sketch_signer = SimHashSignature(self.sketch_bits,
                                                  seed=seed)
        elif sketch_algorithm == SketchModel.minhash:
            self.sketch_shingler = Shingler(**sketch_shingler_opts)
            self.sketch_signer = MinHashSketchSignature(self.sketch_bits,
                                                        seed=seed)

        # NOTE(review): assumes this reset applies to whichever sketch
        # shingler was built above (simhash or minhash) -- TODO confirm.
        # Guarded because the disabled branch leaves the shingler as None.
        if self.sketch_shingler is not None:
            self.sketch_shingler._tokenizer = None
            self.sketch_shingler._normalizer = None

        resemblance = float(cfg_sketch['resemblance'])
        self.max_dist = int(floor(self.sketch_bits * (1.0 - resemblance)))
        self.sketch_dist_fn = hamming

    # The operator is looked up whether or not sketching is enabled.
    self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
    self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                   max_dist=self.max_dist,
                                   min_support=self.min_support,
                                   sketch_operator=self.sketch_operator)