def test_split_already_persistent(self):
    """Split a persistent StorageNumpy into blocks, reload every block by its
    storage id, and verify contents both before and after reloading the array."""
    block_rows, block_cols = 2, 1
    source = np.arange(100).reshape(10, -1)
    # Expected content: one list per block-row, each holding its column blocks.
    expected = [
        [source[r:r + block_rows, c:c + block_cols]
         for c in range(0, source.shape[1], block_cols)]
        for r in range(0, source.shape[0], block_rows)
    ]
    data = StorageNumpy(input_array=source, name="test_split_already_persistent")
    data.sync()  # Flush values to cassandra
    for idx, piece in enumerate(data.np_split(block_size=(block_rows, block_cols))):
        piece_id = piece.storage_id
        piece.sync()  # Flush data
        del piece
        piece = getByID(piece_id)
        self.assertTrue(np.array_equal(list(piece), expected[idx]))
    del data
    gc.collect()
    # Reload the whole array and repeat the block-by-block verification.
    data = StorageNumpy(name="test_split_already_persistent")
    self.assertTrue(np.array_equal(list(data), source))
    for idx, piece in enumerate(data.np_split(block_size=(block_rows, block_cols))):
        piece_id = piece.storage_id
        piece.sync()  # Flush data
        del piece
        piece = getByID(piece_id)
        self.assertTrue(np.array_equal(list(piece), expected[idx]))
    self.assertEqual(idx + 1, len(expected))
def test_load_2_dif_clusters_same_instance(self):
    """Read two distant regions (different cassandra clusters) through the
    same reloaded StorageNumpy instance."""
    reference = np.arange(50 * 50).reshape((50, 50))
    persisted = StorageNumpy(input_array=reference, name='load_2_clustrs_same_inst')
    persisted.sync()  # Flush values to cassandra
    reloaded = StorageNumpy(name="load_2_clustrs_same_inst")
    reloaded[0:1, 0:1]  # Touch the first region to force its load...
    # ...then verify a second, far-away region through the same instance.
    self.assertTrue(np.array_equal(reloaded[40:50, 40:50], reference[40:50, 40:50]))
def test_split_content(self):
    """Check count, shape and content of row, column and block splits."""
    reference = np.arange(88 * 66).reshape(88, 66)
    s = StorageNumpy(reference, "test_split_content")
    s.sync()  # Flush values to cassandra
    del s
    s = StorageNumpy(None, "test_split_content")
    rows = list(s.split(cols=False))
    self.assertTrue(len(rows) == 4)
    columns = list(s.split(cols=True))
    self.assertTrue(len(columns) == 3)
    blocks = list(s.split())
    self.assertTrue(len(blocks) == 12)
    for piece in rows:
        self.assertTrue(piece.shape == (22, 66))
    for piece in columns:
        self.assertTrue(piece.shape == (88, 22))
    for piece in blocks:
        self.assertTrue(piece.shape == (22, 22))
    # Row pieces cover consecutive 22-row stripes (last one is the remainder).
    self.assertTrue(np.array_equal(rows[0], reference[0:22, :]))
    self.assertTrue(np.array_equal(rows[1], reference[22:44, :]))
    self.assertTrue(np.array_equal(rows[2], reference[44:66, :]))
    self.assertTrue(np.array_equal(rows[3], reference[66:, :]))
    # Column pieces cover consecutive 22-column stripes.
    self.assertTrue(np.array_equal(columns[0], reference[:, 0:22]))
    self.assertTrue(np.array_equal(columns[1], reference[:, 22:44]))
    self.assertTrue(np.array_equal(columns[2], reference[:, 44:]))
def test_load_StorageNumpy(self):
    """Reloading by name yields a persistent object with the same storage id."""
    src = np.arange(2 * 128).reshape(2, 128)  # A matrix with "some" columns
    first = StorageNumpy(src, "test_load_StorageNumpy")
    first.sync()  # Flush values to cassandra
    second = StorageNumpy(None, "test_load_StorageNumpy")
    self.assertTrue(second._is_persistent)
    self.assertEqual(first.storage_id, second.storage_id)
def test_types_persistence(self):
    """Persist and reload arrays of every (unsigned) integer typecode and
    check the values survive the cassandra round trip.

    Fix: the typecode-skipping branches used ``pass``, which is a no-op and
    did NOT skip the iteration as the comment claimed; ``continue`` does.
    """
    base_array = np.arange(256)
    tablename = self.ksp + '.' + "test_types_persistence"
    for typecode in np.typecodes['Integer']:
        if typecode == 'p':
            # TODO For now skip arrays made of pointers
            continue  # was 'pass', which never actually skipped 'p'
        typed_array = StorageNumpy(base_array.astype(typecode), tablename)
        self.assertTrue(np.array_equal(typed_array, base_array.astype(typecode)))
        typed_array.sync()  # Flush values to cassandra
        typed_array = StorageNumpy(None, tablename)
        self.assertTrue(np.allclose(typed_array, base_array.astype(typecode)))
        typed_array.delete_persistent()
    for typecode in np.typecodes['UnsignedInteger']:
        if typecode == 'P':
            # TODO For now skip arrays made of pointers
            continue  # was 'pass', which never actually skipped 'P'
        typed_array = StorageNumpy(base_array.astype(typecode), tablename)
        self.assertTrue(np.allclose(typed_array, base_array.astype(typecode)))
        typed_array.sync()  # Flush values to cassandra
        typed_array = StorageNumpy(None, tablename)
        self.assertTrue(np.allclose(typed_array, base_array.astype(typecode)))
        typed_array.delete_persistent()
def test_slice_after_load(self):
    """Slicing an unloaded (lazy) StorageNumpy returns the right values."""
    src = np.arange(2 * 128).reshape(2, 128)  # A matrix with "some" columns
    s = StorageNumpy(src, "test_slice_after_load")
    s.sync()  # Flush values to cassandra
    del s
    s = StorageNumpy(None, "test_slice_after_load")
    # Slice on an unloaded numpy; the stop index runs past the edge on purpose.
    view = s[0, 110:150]
    self.assertTrue(np.array_equal(view, src[0, 110:150]))
def test_pv_one_dim(self):
    """Integer indexing on a reloaded one-dimensional array."""
    src = np.arange(66 * 66)
    stored = StorageNumpy(src, "test_pv_one_dim")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_one_dim")
    idx = 30
    self.assertTrue(np.array_equal(stored[idx], src[idx]))
def test_pv_big_np(self):
    """Element access by (row, col) tuple on a reloaded large matrix."""
    src = np.arange(1000 * 1000).reshape(1000, 1000)
    stored = StorageNumpy(src, "test_pv_big_np")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_big_np")
    coord = (22, 22)
    self.assertTrue(np.array_equal(stored[coord], src[coord]))
def test_row_access(self):
    """Every whole-row read on a reloaded matrix matches the original data."""
    src = np.arange(64 * 128).reshape(64, 128)  # A matrix with "some" columns
    s = StorageNumpy(src, "test_row_access")
    s.sync()  # Flush values to cassandra
    del s
    s = StorageNumpy(None, "test_row_access")
    for row in range(64):
        # Access a whole row at a time.
        self.assertTrue(np.array_equal(s[row, :], src[row, :]))
def test_pv_load_correct_blocks(self):
    """Accessing one row must load only the blocks covering that row."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_load_correct_blocks")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_load_correct_blocks")
    selector = (0, slice(None, None, None))
    _ = stored[selector]
    # A full 66-wide row is expected to span exactly 3 loaded blocks.
    self.assertTrue(len(stored._loaded_coordinates) == 3)
def test_loaded(self):
    """Track the `_numpy_full_loaded` flag across accesses and splits."""
    src = np.arange(88 * 66).reshape(88, 66)
    s = StorageNumpy(src, "test_loaded")
    self.assertTrue(s._numpy_full_loaded is True)
    s.sync()  # Flush values to cassandra
    del s
    s = StorageNumpy(None, "test_loaded")
    self.assertTrue(s._numpy_full_loaded is False)
    # The accessed element must be FULL loaded
    row = s[0, :]
    self.assertTrue(s._numpy_full_loaded is False)
    self.assertTrue(row._numpy_full_loaded is True)
    del s
    s = StorageNumpy(None, "test_loaded")
    col = s[:, 0]
    self.assertTrue(s._numpy_full_loaded is False)
    self.assertTrue(col._numpy_full_loaded is True)
    del s
    s = StorageNumpy(None, "test_loaded")
    block = s[22:44, 22:44]
    self.assertTrue(s._numpy_full_loaded is False)
    self.assertTrue(block._numpy_full_loaded is True)
    # Loading ALL elements must make the object full loaded
    del s
    s = StorageNumpy(None, "test_loaded")
    for r in range(s.shape[0]):
        _ = s[r, :]
    self.assertTrue(s._numpy_full_loaded is True)
    del s
    s = StorageNumpy(None, "test_loaded")
    for c in range(s.shape[1]):
        _ = s[:, c]
    self.assertTrue(s._numpy_full_loaded is True)
    # Split MUST NOT load the object
    del s
    s = StorageNumpy(None, "test_loaded")
    for piece in s.split(cols=False):
        self.assertTrue(piece._numpy_full_loaded is False)
    del s
    s = StorageNumpy(None, "test_loaded")
    for piece in s.split(cols=True):
        self.assertTrue(piece._numpy_full_loaded is False)
    del s
    s = StorageNumpy(None, "test_loaded")
    for piece in s.split():
        self.assertTrue(piece._numpy_full_loaded is False)
def test_pv_only_int(self):
    """Plain integer indexing on a reloaded two-dimensional array."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_only_int")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_only_int")
    row = stored[1]
    self.assertTrue(np.array_equal(src[1], row))
def test_pv_three_dimensions(self):
    """Mixed int/slice indexing on a reloaded three-dimensional array."""
    src = np.arange(3 * 66 * 66).reshape(3, 66, 66)
    stored = StorageNumpy(src, "test_pv_three_dimensions")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_three_dimensions")
    first = (0, 1, slice(None, None, None))
    self.assertTrue(np.array_equal(stored[first], src[first]))
    second = slice(1, 10, 1)
    self.assertTrue(np.array_equal(stored[first][second], src[first][second]))
def test_pv_special_case(self):
    """Index into a slice-of-a-slice after the slice was already loaded once.

    Fix: removed the dead locals ``s2`` and ``ssf`` — the first access is kept
    for its side effect (it loads the sliced region), but its result was never
    used, and ``ssf`` was never read at all.
    """
    src = np.arange(66 * 66).reshape(66, 66)
    sn = StorageNumpy(src, "test_pv_special_case")
    sn.sync()
    del sn
    sn = StorageNumpy(None, "test_pv_special_case")
    s1 = slice(1, 65)
    # Trigger the load of the sliced region; the value is re-read below.
    sn[s1, s1]
    self.assertTrue(np.array_equal(sn[s1, s1][1], src[s1, s1][1]))
def test_pv_slice_from_from_slice_step(self):
    """Stepped slice taken from another stepped slice after reload."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_from_slice_step")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_slice_from_slice_step")
    outer = slice(1, 65, 2)
    inner = slice(1, 20, 2)
    result = stored[outer][inner]
    self.assertTrue(np.array_equal(result, src[outer][inner]))
def test_pv_slice_step(self):
    """Element access inside a 2D stepped-slice view of a reloaded array."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_step")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_slice_step")
    stepped = slice(1, 65, 2)
    view = stored[stepped, stepped]
    r, c = 2, 30
    self.assertTrue(np.array_equal(src[stepped, stepped][r, c], view[r, c]))
def test_get_subarray(self):
    """Reductions (sum/mean) over a reloaded subarray yield positive values.

    Fix: the local previously named ``sum`` shadowed the builtin of the same
    name; renamed to ``total``.
    """
    base = np.arange(8 * 8 * 4).reshape((8, 8, 4))
    hecu_p = StorageNumpy(input_array=base, name='test_get_subarray')
    hecu_p.sync()  # Flush values to cassandra
    hecu_r2 = StorageNumpy(name="test_get_subarray")
    res = hecu_r2[:3, :2]
    total = res.sum()
    # Fetch the same region a second time (already-loaded path) and reduce it.
    res = hecu_r2[:3, :2]
    avg = res.mean()
    self.assertGreater(total, 0)
    self.assertGreater(avg, 0)
def test_pv_slice_single_row(self):
    """Chained slicing starting from a plain row-range slice."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_single_row")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_slice_single_row")
    outer = slice(1, 65)
    inner = slice(1, None, None)
    self.assertTrue(np.array_equal(stored[outer], src[outer]))
    self.assertTrue(np.array_equal(stored[outer][inner], src[outer][inner]))
    self.assertTrue(np.array_equal(stored[outer][inner][inner], src[outer][inner][inner]))
def test_pv_int_slice(self):
    """Case: (int, slice) indexing followed by an element access."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_int_slice")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_int_slice")
    cols = slice(1, 65)
    row = 30
    pos = 1
    view = stored[row, cols]
    self.assertTrue(np.array_equal(view[pos], src[row, cols][pos]))
def test_pv_slice_int(self):
    """Case: (slice, int) indexing followed by an element access."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_int")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_slice_int")
    rows = slice(1, 65)
    col = 30
    pos = 1
    view = stored[rows, col]
    self.assertTrue(np.array_equal(view[pos], src[rows, col][pos]))
def test_pv_slice_slice(self):
    """Case: (slice, slice) indexing followed by an element access."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_slice")
    stored.sync()  # Flush values to cassandra
    del stored
    stored = StorageNumpy(None, "test_pv_slice_slice")
    window = slice(1, 65)
    view = stored[window, window]
    self.assertTrue(np.array_equal(view[1, 1], src[window, window][1, 1]))
def test_pv_slice_single_column(self):
    """Single-column selection, then further slicing of the resulting vector."""
    src = np.arange(66 * 66).reshape(66, 66)
    stored = StorageNumpy(src, "test_pv_slice_single_column")
    stored.sync()
    del stored
    stored = StorageNumpy(None, "test_pv_slice_single_column")
    col_sel = (slice(None, None, None), 30)
    tail = slice(1, None, None)
    self.assertTrue(stored[col_sel].shape == src[col_sel].shape)
    self.assertTrue(np.array_equal(stored[col_sel], src[col_sel]))
    self.assertTrue(np.array_equal(stored[col_sel][tail], src[col_sel][tail]))
    self.assertTrue(np.array_equal(stored[col_sel][tail][tail], src[col_sel][tail][tail]))
def test_pv_three_dimensions_easy(self):
    """Mixed int/slice indexing and chained slicing on a small 3D array.

    Fix: removed the unused local ``orig3`` (a full-view tuple that was never
    referenced).
    """
    src = np.arange(4 * 4 * 4).reshape(4, 4, 4)
    sn = StorageNumpy(src, "test_pv_three_dimensions_easy")
    sn.sync()
    del sn
    sn = StorageNumpy(None, "test_pv_three_dimensions_easy")
    s1 = (0, 1, slice(None, None, None))
    self.assertTrue(np.array_equal(sn[s1], src[s1]))
    s2 = slice(1, None, None)
    self.assertTrue(np.array_equal(sn[s1][s2], src[s1][s2]))
    self.assertTrue(np.array_equal(sn[s1][s2][s2], src[s1][s2][s2]))
def test_arrow_access(self):
    """Single-column reads on a reloaded matrix (columnar access path)."""
    src = np.arange(50 * 50).reshape(50, 50)
    s = StorageNumpy(src, "test_arrow_access")
    s.sync()
    del s
    s = StorageNumpy(None, "test_arrow_access")
    # Read three separate columns, including the last one.
    for col in (20, 30, 49):
        self.assertTrue(np.array_equal(s[:, col], src[:, col]))
def test_slicing_3d(self):
    """Slice a 3D array both before and after persisting/reloading."""
    base = np.arange(8 * 8 * 4).reshape((8, 8, 4))
    hecu = StorageNumpy(input_array=base, name='test_slicing_3d')
    expected = base[6:7, 4:]
    # In-memory slice, before any flush.
    self.assertTrue(np.array_equal(expected, hecu[6:7, 4:]))
    hecu.sync()  # Flush values to cassandra
    # Same slice after reloading from cassandra.
    hecu = StorageNumpy(name="test_slicing_3d")
    self.assertTrue(np.array_equal(expected, hecu[6:7, 4:]))
    hecu.delete_persistent()
def test_pv_negative_indexes(self):
    """Negative integer indexing on reloaded 2D and 1D arrays.

    Fix: the 1D array was deleted and reloaded WITHOUT a prior ``sync()``,
    unlike every other test in this file — the reload could observe unflushed
    data. Added the missing ``snn.sync()``.
    """
    n = np.arange(66 * 66).reshape(66, 66)
    sn = StorageNumpy(n, "test_pv_negative_indexes")
    sn.sync()
    del sn
    sn = StorageNumpy(None, "test_pv_negative_indexes")
    s1 = -1
    self.assertTrue(np.array_equal(sn[s1], n[s1]))
    nn = np.arange(66 * 66)
    snn = StorageNumpy(nn, "test_pv_negative_indexes_small")
    snn.sync()  # Flush values to cassandra before dropping the instance
    del snn
    snn = StorageNumpy(None, "test_pv_negative_indexes_small")
    self.assertTrue(np.array_equal(snn[s1], nn[s1]))
def test_pv_three_dimensions_all_coords(self):
    """calculate_block_coords over a full 3D view yields every block coord.

    Fix: ``self.assertTrue(result, True)`` passed ``True`` as the failure
    *message* (assertTrue takes (expr, msg)), and ``all(map(...))`` over two
    iterables stops at the shorter one, ignoring length mismatches.
    ``assertEqual`` performs the intended, stricter comparison.
    """
    n = np.arange(8 * 8 * 8).reshape(8, 8, 8)
    sn = StorageNumpy(n, "test_pv_three_dimensions_all_coords")
    sn.sync()
    del sn
    sn = StorageNumpy(None, "test_pv_three_dimensions_all_coords")
    full_view = (slice(None, None, None), slice(None, None, None), slice(None, None, None))
    coords = list(sn.calculate_block_coords(full_view))
    expected = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1),
                (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)]
    self.assertEqual(coords, expected)
def test_pv_three_dimensions_slice_twodim(self):
    """calculate_block_coords of a 2D view (last axis fixed) of a 3D array.

    Fix: ``self.assertTrue(result, True)`` passed ``True`` as the failure
    *message* and the ``all(map(...))`` comparison ignored length mismatches;
    ``assertEqual`` performs the intended comparison.
    """
    n = np.arange(8 * 8 * 8).reshape(8, 8, 8)
    sn = StorageNumpy(n, "test_pv_three_dimensions_slice_twodim")
    sn.sync()
    del sn
    sn = StorageNumpy(None, "test_pv_three_dimensions_slice_twodim")
    ss = sn[(slice(None, None, None), slice(None, None, None), 0)]
    coords = list(ss.calculate_block_coords(ss._build_args.view_serialization))
    expected = [(0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0)]
    self.assertEqual(coords, expected)
def test_read_all(self):
    """Read back a full large 3D array with a single all-covering slice."""
    nelem = 2 ** 21
    side = 2 ** 7
    base_array = np.arange(nelem).reshape((side, side, side))
    casted = StorageNumpy(input_array=base_array, name="test_read_all")
    casted.sync()  # Flush values to cassandra
    reference = np.arange(nelem).reshape((side, side, side))
    casted = StorageNumpy(name="test_read_all")
    chunk = casted[slice(None, None, None)]
    self.assertTrue(np.allclose(chunk.view(np.ndarray), reference))
    casted.delete_persistent()
def test_performance_storage_numpy_arrow(self):
    """Measure the time to retrieve a single column from cassandra for
    matrices of increasing height, repeating each measurement TIMES times.

    Fix: ``self.assertTrue(o.data.hex()[:40], '0' * 40)`` treated the second
    argument as the failure *message* and always passed (a non-empty hex
    string is truthy); ``assertEqual`` performs the intended comparison that
    the in-memory buffer is still zeroed before the timed access.
    """
    TIMES = 10  # Times to repeat the test
    # Matrix sizes to test (thousands of rows)
    matrix_size = (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)
    n_cols = 3
    times = {}
    # Test 1 column
    for s in matrix_size:
        times[s] = []  # empty list for size 's'
        # Create a numpy
        n = np.arange(1000 * s * n_cols).reshape(1000 * s, n_cols)
        matrix_name = "matrix{}x{}".format(1000 * s, n_cols)
        # Make it persistent
        o = StorageNumpy(n, matrix_name)
        o.sync()  # Flush values to cassandra
        # Clean memory
        del o
        for i in range(TIMES):
            # Retrieve numpy from cassandra (NO data in memory)
            o = StorageNumpy(None, matrix_name)
            # LOAD_ON_DEMAND must be DISABLED!
            # NOTE(review): this assumes the buffer stays zeroed until the
            # first access — confirm against the LOAD_ON_DEMAND setting.
            self.assertEqual(o.data.hex()[:40], '0' * 40)
            start = timer()
            # Load column
            column = random.randint(0, (n_cols - 1))
            o[:, column]
            end = timer()
            # Store time
            times[s].append(end - start)
            del o
    # All tests done, print results
    print("\nRESULTS:")
    for s in matrix_size:
        print("Matrix size{}x{} = ".format(1000 * s, n_cols), times[s])
    print("\n")