Example #1
0
 def test_float(self):
     """Verify CUDA hashes of a single float32 weight vector against datasketch."""
     weights = [
         0,          1.0497366,  0.8494359,  0.66231006, 0.66231006, 0.8494359,
         0,          0.66231006, 0.33652836, 0,           0,         0.5359344,
         0.8494359,  0.66231006, 1.0497366,  0.33652836, 0.66231006, 0.8494359,
         0.6800841,  0.33652836]
     gen = libMHCUDA.minhash_cuda_init(
         len(weights), 128, devices=1, seed=7, verbosity=2)
     params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
     # Build a reference generator without running __init__ so that it
     # reuses exactly the same rs / ln_cs / betas as the CUDA generator.
     reference = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
     reference.dim = len(weights)
     reference.rs, reference.ln_cs, reference.betas = params
     reference.sample_size = 128
     reference.seed = None
     matrix = csr_matrix(numpy.array(weights, dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, matrix).astype(numpy.int32)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (1, 128, 2))
     true_hashes = numpy.array([reference.minhash(weights).hashvalues],
                               dtype=numpy.int32)
     self.assertEqual(true_hashes.shape, (1, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         # Dump both sides to make the mismatch diagnosable from the log.
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #2
0
 def _test_calc_big(self, devices):
     """Compare CUDA and datasketch hashes on a large sparse random matrix."""
     numpy.random.seed(0)
     dense = numpy.random.randint(0, 100, (6400, 130))
     # Zero out ~80% of the entries to obtain a realistically sparse input.
     keep = numpy.random.randint(0, 5, dense.shape) >= 4
     dense *= keep
     del keep
     reference = WeightedMinHashGenerator(dense.shape[-1])
     gen = libMHCUDA.minhash_cuda_init(
         dense.shape[-1], 128, devices=devices, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(
         gen, reference.rs, reference.ln_cs, reference.betas)
     matrix = csr_matrix(dense, dtype=numpy.float32)
     print(matrix.nnz / (matrix.shape[0] * matrix.shape[1]))
     started = time()
     hashes = libMHCUDA.minhash_cuda_calc(gen, matrix)
     print("libMHCUDA:", time() - started)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (len(dense), 128, 2))
     started = time()
     true_hashes = numpy.array(
         [reference.minhash(row).hashvalues for row in dense],
         dtype=numpy.uint32)
     print("datasketch:", time() - started)
     self.assertEqual(true_hashes.shape, (len(dense), 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         # Locate and print the first diverging row for easier debugging.
         for row in range(hashes.shape[0]):
             if (hashes[row] != true_hashes[row]).any():
                 print("first invalid row:", row)
                 print(hashes[row])
                 print(true_hashes[row])
                 break
         raise e from None
Example #3
0
 def test_deferred(self):
     """Deferred init followed by explicit var assignment must match datasketch."""
     row_a = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
     row_b = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
     # First generator only produces the random parameters to be reused.
     gen = libMHCUDA.minhash_cuda_init(len(row_a), 128, devices=1, verbosity=2)
     params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
     libMHCUDA.minhash_cuda_fini(gen)
     # Second generator is created deferred and fed the saved parameters.
     gen = libMHCUDA.minhash_cuda_init(
         len(row_a), 128, devices=1, deferred=True, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, *params)
     # Reference datasketch generator sharing the very same parameters;
     # __new__ skips __init__ so nothing is re-randomized.
     reference = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
     reference.dim = len(row_a)
     reference.rs, reference.ln_cs, reference.betas = params
     reference.sample_size = 128
     reference.seed = None
     matrix = csr_matrix(numpy.array([row_a, row_b], dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, matrix)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (2, 128, 2))
     true_hashes = numpy.array([reference.minhash(row_a).hashvalues,
                                reference.minhash(row_b).hashvalues],
                               dtype=numpy.uint32)
     self.assertEqual(true_hashes.shape, (2, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #4
0
 def test_calc_tiny(self):
     """CUDA hashes of two small sparse vectors must equal datasketch's."""
     row_a = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6,
              7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
     row_b = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4,
              7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
     reference = WeightedMinHashGenerator(len(row_a))
     gen = libMHCUDA.minhash_cuda_init(len(row_a), 128, devices=1, verbosity=2)
     # Push the reference generator's random parameters into the CUDA one
     # so that both sides hash with identical rs / ln_cs / betas.
     libMHCUDA.minhash_cuda_assign_vars(
         gen, reference.rs, reference.ln_cs, reference.betas)
     matrix = csr_matrix(numpy.array([row_a, row_b], dtype=numpy.float32))
     hashes = libMHCUDA.minhash_cuda_calc(gen, matrix)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (2, 128, 2))
     true_hashes = numpy.array([reference.minhash(row_a).hashvalues,
                                reference.minhash(row_b).hashvalues],
                               dtype=numpy.uint32)
     self.assertEqual(true_hashes.shape, (2, 128, 2))
     try:
         self.assertTrue((hashes == true_hashes).all())
     except AssertionError as e:
         print("---- TRUE ----")
         print(true_hashes)
         print("---- FALSE ----")
         print(hashes)
         raise e from None
Example #5
0
 def test_slice(self):
     """Hashing a row range must match the same slice of the full result."""
     numpy.random.seed(0)
     dense = numpy.random.randint(0, 100, (6400, 130))
     # Sparsify: keep roughly one entry in five.
     sparsifier = numpy.random.randint(0, 5, dense.shape)
     dense *= (sparsifier >= 4)
     del sparsifier
     gen = libMHCUDA.minhash_cuda_init(dense.shape[-1], 128, verbosity=2)
     matrix = csr_matrix(dense, dtype=numpy.float32)
     full = libMHCUDA.minhash_cuda_calc(gen, matrix)
     part = libMHCUDA.minhash_cuda_calc(
         gen, matrix, row_start=3200, row_finish=4800)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertTrue((full[3200:4800] == part).all())
Example #6
0
 def run_test(v):
     """Hash the concatenation of the parts in *v* and validate the output shape.

     :param v: sequence of equal-meaning vector parts; their total length is
               the vocabulary dimension.
     :raises AssertionError: if hashing failed or produced a wrong shape.

     Bug fixes over the original:
     * ``k`` is an int (the summed length), so ``len(k)`` raised TypeError —
       the dimension is ``k`` itself.
     * ``self`` was referenced inside a plain function (NameError); plain
       ``assert`` statements are used instead.
     * The assertions lived inside ``finally`` *before* the cleanup, so a
       failing assertion masked the original exception and leaked the
       generator; now ``fini`` always runs first.
     """
     dim = sum(len(part) for part in v)
     bgen = WeightedMinHashGenerator(dim)
     gen = libMHCUDA.minhash_cuda_init(dim, 128, devices=4, verbosity=2)
     libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
     m = csr_matrix(numpy.array(v, dtype=numpy.float32))
     hashes = None
     try:
         hashes = libMHCUDA.minhash_cuda_calc(gen, m)
     finally:
         # Always release the CUDA generator, even when calc raised.
         libMHCUDA.minhash_cuda_fini(gen)
     assert hashes is not None
     assert hashes.shape == (1, 128, 2)
Example #7
0
 def test_random_vars(self):
     """rs and cs must fit Gamma(2, 1); betas must fit Uniform(0, 1)."""
     gen = libMHCUDA.minhash_cuda_init(1000, 128, devices=1, verbosity=2)
     rs, ln_cs, betas = libMHCUDA.minhash_cuda_retrieve_vars(gen)
     libMHCUDA.minhash_cuda_fini(gen)
     # Both rs and exp(ln_cs) should follow the same Gamma(2, 1) law.
     for sample in (rs, numpy.exp(ln_cs)):
         shape, loc, scale = gamma.fit(sample)
         self.assertTrue(1.97 < shape < 2.03)
         self.assertTrue(-0.01 < loc < 0.01)
         self.assertTrue(0.98 < scale < 1.02)
     bmin, bmax = uniform.fit(betas)
     self.assertTrue(0 <= bmin < 0.001)
     self.assertTrue(0.999 <= bmax <= 1)
Example #8
0
 def test_integration(self):
     """MinHash-estimated Jaccard must closely track the exact weighted Jaccard."""
     numpy.random.seed(1)
     dense = numpy.random.randint(0, 100, (6400, 130))
     sparsifier = numpy.random.randint(0, 5, dense.shape)
     dense *= (sparsifier >= 4)
     del sparsifier
     gen = libMHCUDA.minhash_cuda_init(dense.shape[-1], 128, seed=1, verbosity=1)
     matrix = csr_matrix(dense, dtype=numpy.float32)
     print(matrix.nnz / (matrix.shape[0] * matrix.shape[1]))
     hashes = libMHCUDA.minhash_cuda_calc(gen, matrix)
     libMHCUDA.minhash_cuda_fini(gen)
     self.assertEqual(hashes.shape, (len(dense), 128, 2))
     first = WeightedMinHash(0, hashes[0])
     second = WeightedMinHash(0, hashes[1])
     estimated = first.jaccard(second)
     print(estimated)
     # Exact weighted Jaccard of the first two rows: sum(min) / sum(max).
     exact = (numpy.amin(dense[:2], axis=0).sum()
              / numpy.amax(dense[:2], axis=0).sum())
     print(exact)
     self.assertTrue(abs(exact - estimated) < 0.005)
Example #9
0
def hash_batches(args):
    """MinHash every BOW batch on the GPU and persist the results in Cassandra.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables, keyspace and the Spark/Cassandra settings.
    :raises ValueError: if a batch's vocabulary size differs from the first one.

    Bug fix over the original: ``gen`` stays ``None`` if an exception fires
    before the first batch initializes the generator, and the unconditional
    ``minhash_cuda_fini(gen)`` in ``finally`` would then be called with
    ``None``; the cleanup is now guarded.
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Nothing to do on an empty input directory.
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # Lazy initialization: the vocabulary size is only known
                # once the first batch matrix has been loaded.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity,
                    deferred=deferred)
                if deferred:
                    # Reuse previously saved parameters for reproducible hashes.
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # gen is None until the first batch is processed — guard the cleanup.
        if gen is not None:
            libMHCUDA.minhash_cuda_fini(gen)
Example #10
0
def hash_batches(args):
    """MinHash every BOW batch on the GPU and persist the results in Cassandra.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables, keyspace and the Spark/Cassandra settings.
    :raises ValueError: if a batch's vocabulary size differs from the first one.

    Bug fix over the original: ``gen`` stays ``None`` if an exception fires
    before the first batch initializes the generator, and the unconditional
    ``minhash_cuda_fini(gen)`` in ``finally`` would then be called with
    ``None``; the cleanup is now guarded.
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Nothing to do on an empty input directory.
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(args.threshold, args.size,
                                             args.false_positive_weight,
                                             args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # Lazy initialization: the vocabulary size is only known
                # once the first batch matrix has been loaded.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(voc_size,
                                                  args.size,
                                                  seed=args.seed,
                                                  devices=args.devices,
                                                  verbosity=args.mhc_verbosity,
                                                  deferred=deferred)
                if deferred:
                    # Reuse previously saved parameters for reproducible hashes.
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs,
                                                       model.ln_cs,
                                                       model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(
                        args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError(
                    "The vocabulary sizes do not match: %d != %d" %
                    (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(
                htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # gen is None until the first batch is processed — guard the cleanup.
        if gen is not None:
            libMHCUDA.minhash_cuda_fini(gen)