def test_generators(self):
    """Check the reported size of iterators and generator objects.

    For a list iterator and for a generator object, the test asserts that
    ``util.get_size_of_deep`` returns the same number as ``sys.getsizeof``
    does for a freshly created object of the same kind.
    """
    values = [1, 2]
    iterator_size = util.get_size_of_deep(iter(values))
    assert iterator_size == sys.getsizeof(iter(values))

    def ima_gen():
        for n in range(10):
            yield n

    generator_size = util.get_size_of_deep(ima_gen())
    assert generator_size == sys.getsizeof(ima_gen())
def test_sequences(self):
    """Check the reported size of lists and dicts that hold only ``0``.

    Each case pairs a container with the number of ``0`` leaves it holds
    (dict keys count as leaves); the expected size is that count times
    ``sys.getsizeof(0)``.
    """
    pytest.importorskip('six', reason='Uses six for compatibility')

    zero_size = sys.getsizeof(0)
    cases = (
        ([0], 1),
        ([0, 0], 2),
        ([[0, 0]], 2),
        ({0: 0}, 2),
        ({0: [0]}, 2),
    )
    for container, leaf_count in cases:
        assert util.get_size_of_deep(container) == leaf_count * zero_size
def test_basic(self):
    """Check that simple values are sized exactly like ``sys.getsizeof``.

    Covers the empty string, the integer ``0``, and a ``bytes`` and a
    ``bytearray`` built from the same three-byte literal.
    """
    pytest.importorskip('six', reason='Uses six for compatibility')

    for value in ("", 0):
        assert util.get_size_of_deep(value) == sys.getsizeof(value)

    raw = b"abc"
    # A fresh object is built for each side of the comparison, as the
    # constructors are called once per side.
    for make in (bytes, bytearray):
        assert util.get_size_of_deep(make(raw)) == sys.getsizeof(make(raw))
def test_numpy(self):
    """Check that numpy arrays are sized by their ``nbytes``.

    Skipped when numpy is not importable. The array is tested on its own,
    inside lists (once and twice), and as a dict value keyed by ``0``.
    """
    pytest.importorskip('six', reason='Uses six for compatibility')
    np = pytest.importorskip("numpy")

    array = np.array([0])
    payload = array.nbytes

    assert util.get_size_of_deep(array) == payload
    assert util.get_size_of_deep([array]) == payload
    # The same array object listed twice is expected to be counted twice.
    assert util.get_size_of_deep([array, array]) == 2 * payload
    assert util.get_size_of_deep({0: array}) == (sys.getsizeof(0) + payload)
def test_big_lists(self):
    """Check the reported size of a large int list and a list of strings.

    Only the returned sizes are asserted; nothing here measures how long
    ``util.get_size_of_deep`` takes.
    """
    pytest.importorskip('six', reason='Uses six for compatibility')

    # Case: a large array (think of a Tensor) held as a plain list.
    # get_size_of_deep() is meant to be fast for this kind of input.
    big = list(range(int(1e6)))
    big_size = util.get_size_of_deep(big)
    assert big_size == (len(big) * sys.getsizeof(0))

    # A big list of strings will still require a slower reduce, since the
    # expected total is the sum of each string's own size.
    strings = [str(i) for i in range(100)]
    strings_size = util.get_size_of_deep(strings)
    assert strings_size == sum(sys.getsizeof(s) for s in strings)
def __call__(self, pid):
    """Yield the converted rows of the shard identified by `pid`.

    Generator. Filters `df` down to the rows whose `shard_col` equals
    `pid`, repartitions them, maps each row through
    `spark_row_to_tf_element`, and yields the results one at a time.
    (`df`, `shard_col` and `spark_row_to_tf_element` are not defined in
    this block; they are resolved from an enclosing scope.)

    A per-partition `util.ThruputObserver` tallies one item and its
    `util.get_size_of_deep` byte size per yielded row. After the last
    row, those totals are folded into `self.overall_thruput` while
    holding `self.lock`.

    Args:
        pid: shard id to read; a numpy scalar is unwrapped to a plain
            Python value first.
    """
    import numpy as np

    # Convert pesky numpy boxed numeric types if needed
    if isinstance(pid, np.generic):
        pid = pid.item()

    shard_rdd = df.filter(df[shard_col] == pid).rdd.repartition(100)
    elements = shard_rdd.map(spark_row_to_tf_element).toLocalIterator()

    util.log.info("Reading partition %s " % pid)
    observer = util.ThruputObserver(
        name='Partition %s' % pid,
        log_on_del=True)
    observer.start_block()
    for element in elements:
        yield element
        # The tally runs only once the consumer asks for the next row.
        observer.update_tallies(
            n=1,
            num_bytes=util.get_size_of_deep(element))
    observer.stop_block()
    util.log.info(
        "Done reading partition %s, stats:\n %s" % (pid, observer))

    with self.lock:
        # Since partitions are read in parallel, we need to maintain
        # independent timing stats for the main thread
        self.overall_thruput.stop_block(
            n=observer.n,
            num_bytes=observer.num_bytes)
        self.overall_thruput.maybe_log_progress(every_n=1)
        self.overall_thruput.start_block()
def test_obj(self):
    """Check the reported size of plain and ``__slots__``-based instances.

    A ``__dict__``-backed instance with a single attribute ``x = 0`` is
    expected to size as the attribute name plus its value; a slotted
    instance with the same attribute is expected to size as the value
    alone. Both are also tested inside one- and two-element lists.
    """
    pytest.importorskip('six', reason='Uses six for compatibility')

    class Obj(object):
        # Has a __dict__ attribute
        def __init__(self):
            self.x = 0

    dict_backed_size = sys.getsizeof('x') + sys.getsizeof(0)
    assert util.get_size_of_deep(Obj()) == dict_backed_size
    assert util.get_size_of_deep([Obj()]) == dict_backed_size
    assert util.get_size_of_deep([Obj(), Obj()]) == 2 * dict_backed_size

    class Slotted(object):
        __slots__ = ['x']

        def __init__(self):
            self.x = 0

    slot_value_size = sys.getsizeof(0)
    assert util.get_size_of_deep(Slotted()) == slot_value_size
    assert util.get_size_of_deep([Slotted()]) == slot_value_size
    assert util.get_size_of_deep([Slotted(), Slotted()]) == 2 * slot_value_size
def test_basic(self):
    """Check that an empty string and ``0`` size like ``sys.getsizeof``."""
    pytest.importorskip('six', reason='Uses six for compatibility')

    for value in ("", 0):
        reported = util.get_size_of_deep(value)
        assert reported == sys.getsizeof(value)