@classmethod
def create(cls, blob_key, **kwargs):
    filename = kwargs.get('filename')
    rows = kwargs.get('rows', 5)
    bands = kwargs.get('bands', 40)
    buckets_per_band = kwargs.get('buckets_per_band', 100)
    shingle_type = kwargs.get('shingle_type', 'c4')
    minhash_modulo = kwargs.get('minhash_modulo', 5000)
    max_hashes = calculate_max_hashes(rows, bands)

    # Reuse the dataset entity for this blob if it already exists;
    # otherwise build a fresh one, drawing one random seed per
    # minhash function.
    dataset = cls.get(blob_key)
    if not dataset:
        dataset = BlobDataset(filename=filename,
                              blob_key=blob_key,
                              random_seeds=get_random_bits(max_hashes),
                              rows=rows,
                              bands=bands,
                              buckets_per_band=buckets_per_band,
                              shingle_type=shingle_type,
                              minhash_modulo=minhash_modulo)
    else:
        dataset.filename = filename
    return dataset.put()
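# A minimal usage sketch; `my_blob_key` is a hypothetical App Engine
# blob key, and the keyword values simply restate the defaults above.
# With rows=5 and bands=40, calculate_max_hashes() presumably returns
# rows * bands = 200, so 200 random seeds are stored on the entity.
# Since create() returns dataset.put(), the result is the entity's
# datastore key.
dataset_key = BlobDataset.create(my_blob_key,
                                 filename='corpus.csv',
                                 rows=5,
                                 bands=40,
                                 buckets_per_band=100,
                                 shingle_type='c4',
                                 minhash_modulo=5000)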
import datetime
import logging

def map(dataset, text, id=None):
    logging.info("OpenLSH > map() called.")
    start = datetime.datetime.utcnow()

    # Make sure the dataset holds enough random seeds for every
    # minhash function; top it up and persist it if not.
    hashes = calculate_max_hashes(dataset.rows, dataset.bands)
    if len(dataset.random_seeds) < hashes:
        dataset.random_seeds = get_random_bits(hashes)
        dataset.put()

    sh_type = dataset.shingle_type
    modulo = dataset.minhash_modulo
    seeds = list(dataset.random_seeds)
    minhashes = calc_minhashes(text, sh_type, hashes, seeds, modulo)

    # A document lands in a band's bucket only when all of the band's
    # minhash rows agree; the bucket id is offset by the band index so
    # buckets from different bands never collide.
    buckets = []
    buckets_per_band = dataset.buckets_per_band
    for band in xrange(dataset.bands):
        minhashes_in_band = [minhashes[band * dataset.rows + row]
                             for row in xrange(dataset.rows)]
        if len(set(minhashes_in_band)) <= 1:
            buckets.append((band * buckets_per_band)
                           + hash(minhashes_in_band[0]) % buckets_per_band)

    end = datetime.datetime.utcnow()
    # Log timing only for calls whose start second is :00, :20, or :40,
    # sampling roughly 5% of calls.
    if 0 == (start.second % 20):
        logging.info('id %s, length %d, time %d',
                     id, len(text), int((end - start).total_seconds()))
    for bkt in buckets:
        yield (bkt, '/view/%s/%s' % (dataset.filename, id))
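# A minimal reduce-side sketch, assuming the surrounding MapReduce
# framework groups the mapper's (bucket, url) pairs by bucket key; the
# name and signature of candidate_pairs() are illustrative, not
# OpenLSH's actual reducer. Any two documents that share a bucket
# agreed on every minhash row of some band, so they are candidate
# near-duplicates to be verified by an exact similarity check.
import itertools

def candidate_pairs(bucket, urls):
    # Emit each unordered pair of distinct documents that collided in
    # this bucket.
    for a, b in itertools.combinations(sorted(set(urls)), 2):
        yield (a, b)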