Exemplo n.º 1
0
def synthetic_data_test_noisy(M, N, R, block_size, n_samples, noise_level):
    """
    Measure the memory savings of delta encoding on a noisy synthetic dataset.

    Same as the synthetic dataset test, except some noise is added to the
    repeated row copies to make them not quite identical. This tests the
    robustness of the delta encoding scheme to rows that are similar but not
    identical to each other.

    Parameters:
    M: Number of rows in the dataset
    N: Number of columns in the dataset
    R: Number of chunks stacked to form the dataset; each chunk has M // R
       rows. The first chunk is the clean base, the remaining R - 1 chunks
       are the base plus a random sparse noise matrix.
    block_size, n_samples: Forwarded unchanged to delta_csr_matrix.
    noise_level: Density of each random sparse noise matrix.

    The CSR and delta CSR memory footprints are printed; nothing is returned.
    """
    print("[%s] Starting synthetic data test..." % datetime.now().isoformat())
    print(
        "Parameters: %d x %d matrix with repetition factor %d, noise level=%f"
        % (M, N, R, noise_level))
    base_seed = 6410
    rows_per_chunk = M // R
    dataset_chunk = sparse.random(rows_per_chunk, N, format='csr',
                                  random_state=base_seed)
    chunks = [dataset_chunk]
    for copy_index in range(1, R):
        # Seed every noise matrix so the benchmark is reproducible from run to
        # run, like the base chunk already is. Each copy gets its own seed so
        # the noisy copies still differ from one another.
        noise = sparse.random(rows_per_chunk, N, density=noise_level,
                              format='csr',
                              random_state=base_seed + copy_index)
        chunks.append(dataset_chunk + noise)
    dataset = sparse.vstack(chunks, format='csr')
    print("[%s] Memory usage of CSR matrix is %d bytes" %
          (datetime.now().isoformat(), csr_memory_usage(dataset)))
    print("[%s] Converting CSR matrix to delta CSR..." %
          datetime.now().isoformat())
    dataset_delta = delta_csr_matrix(dataset,
                                     block_size=block_size,
                                     n_samples=n_samples)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (datetime.now().isoformat(), delta_csr_memory_usage(dataset_delta)))
Exemplo n.º 2
0
def url_data_test(url_data_path, block_size, n_samples, n_history):
    """
    Report how much memory delta encoding saves on the malicious URLs dataset.

    The svmlight file at url_data_path is loaded, the memory footprint of the
    resulting matrix is printed, and the matrix is then converted to delta CSR
    (forwarding block_size, n_samples and n_history) so that its footprint can
    be printed for comparison. Nothing is returned.
    """
    print("[%s] Starting URL data test..." % datetime.now().isoformat())

    # The labels returned alongside the feature matrix are not needed here.
    features, _labels = load_svmlight_file(url_data_path)

    stamp = datetime.now().isoformat()
    csr_bytes = csr_memory_usage(features)
    print("[%s] Memory usage of CSR matrix is %d bytes" % (stamp, csr_bytes))

    stamp = datetime.now().isoformat()
    print("[%s] Converting CSR matrix to delta CSR..." % stamp)
    conversion_options = dict(block_size=block_size,
                              n_samples=n_samples,
                              n_history=n_history)
    features_delta = delta_csr_matrix(features, **conversion_options)

    stamp = datetime.now().isoformat()
    delta_bytes = delta_csr_memory_usage(features_delta)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (stamp, delta_bytes))
Exemplo n.º 3
0
def msnbc_data_test(msnbc_path, block_size, n_samples):
    """
    Report how much memory delta encoding saves on the MSNBC.com Anonymous Web
    Data dataset.

    The rows produced by msnbc_data_loader(msnbc_path) are stacked into a dense
    array and wrapped in a CSR matrix to obtain the baseline footprint. The
    delta CSR matrix is then built from a second, fresh loader call (with
    dtype np.int64, block_size and n_samples) and checked element-by-element
    against the dense array before its footprint is printed.
    """
    print("[%s] Starting MSNBC data test..." % datetime.now().isoformat())

    dense_msnbc = np.vstack(list(msnbc_data_loader(msnbc_path)))
    baseline = sparse.csr_matrix(dense_msnbc)

    stamp = datetime.now().isoformat()
    csr_bytes = csr_memory_usage(baseline)
    print("[%s] Memory usage of CSR matrix is %d bytes" % (stamp, csr_bytes))

    stamp = datetime.now().isoformat()
    print("[%s] Converting CSR matrix to delta CSR..." % stamp)
    # The loader is invoked again so the delta matrix is built from the
    # loader's output rather than from the matrix already in memory.
    row_source = msnbc_data_loader(msnbc_path)
    encoded = delta_csr_matrix(row_source,
                               dtype=np.int64,
                               block_size=block_size,
                               n_samples=n_samples)

    # The round trip through delta encoding must reproduce every entry.
    entries_match = encoded.toarray() == dense_msnbc
    assert entries_match.all()

    stamp = datetime.now().isoformat()
    delta_bytes = delta_csr_memory_usage(encoded)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (stamp, delta_bytes))
Exemplo n.º 4
0
def synthetic_data_test(M, N, R, block_size, n_samples):
    """
    Report how much memory delta encoding saves on a synthetic dataset that is
    built to contain repeated rows.

    Parameters:
    M: Number of rows in the dataset
    N: Number of columns in the dataset
    R: Number of times each row is repeated. Must cleanly divide M.
    block_size, n_samples: Forwarded unchanged to delta_csr_matrix.

    The CSR and delta CSR memory footprints are printed; nothing is returned.
    """
    stamp = datetime.now().isoformat()
    print("[%s] Starting basic synthetic data test..." % stamp)
    print("Parameters: %d x %d matrix with repetition factor %d" % (M, N, R))

    rows_per_copy = M // R
    base_block = sparse.random(rows_per_copy, N, format='csr',
                               random_state=6410)
    # Stacking the same block R times yields the repeated rows.
    repeated = sparse.vstack([base_block] * R, format='csr')

    stamp = datetime.now().isoformat()
    csr_bytes = csr_memory_usage(repeated)
    print("[%s] Memory usage of CSR matrix is %d bytes" % (stamp, csr_bytes))

    stamp = datetime.now().isoformat()
    print("[%s] Converting CSR matrix to delta CSR..." % stamp)
    repeated_delta = delta_csr_matrix(repeated,
                                      block_size=block_size,
                                      n_samples=n_samples)

    stamp = datetime.now().isoformat()
    delta_bytes = delta_csr_memory_usage(repeated_delta)
    print("[%s] Memory usage of delta CSR matrix is %d bytes" %
          (stamp, delta_bytes))
Exemplo n.º 5
0
 def test_csr_to_delta_csr(self):
     """Converting the CSR fixture to delta CSR must preserve every entry."""
     converted = delta_csr_matrix(self.csr)
     # Compare the densified result element-wise with the dense fixture.
     entries_match = converted.toarray() == self.dense
     self.assertTrue(entries_match.all(),
                     msg="Arrays differ after conversion from CSR")
Exemplo n.º 6
0
 def setUp(self):
     """
     Build the fixtures: a CSR matrix made of one random block stacked on
     itself, its dense form, and a delta CSR matrix built from the dense form.
     """
     half = sparse.random(M // 2, N, format='csr')
     # Stacking the block on itself gives every row an exact duplicate.
     stacked = sparse.vstack([half] * 2, format='csr')
     self.csr = stacked
     self.dense = stacked.toarray()
     self.delta_csr = delta_csr_matrix(self.dense)