@time_this(lambda x: len(x))
def numba_fast_cusum(values: np.ndarray) -> np.ndarray:
    return _numba_fast_cusum(values)


@time_this(lambda x: len(x))
def np_fast_cusum(values: np.ndarray) -> np.ndarray:
    """
    This is O(n) and optimized with C code
    """
    return values.cumsum()


if __name__ == '__main__':

    exp_range = ExponentialRange(0, 8, 1 / 4)
    values = random_numeric_list(exp_range.max)

    with timed_report():
        for i in exp_range.iterator(4):
            slow_cusum(values[:i])

        for i in exp_range.iterator(4):
            slow_cusum_expanded(values[:i])

        for i in exp_range.iterator():
            python_fast_cusum(values[:i])

        for i in exp_range.iterator():
            pandas_fast_cusum(pd.Series(values[:i]))
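
# The benchmark above calls cumulative-sum implementations defined earlier in
# the source file. Minimal sketches of what those helpers might look like are
# given below; the names slow_cusum, python_fast_cusum, and _numba_fast_cusum
# come from the calls above, but these bodies are assumptions, not the
# original implementations.
import itertools
from typing import List

import numba
import numpy as np


def slow_cusum(values: List[float]) -> List[float]:
    # O(n^2): re-sums the entire prefix for every element
    return [sum(values[:i + 1]) for i in range(len(values))]


def python_fast_cusum(values: List[float]) -> List[float]:
    # O(n): single pass with a running total
    return list(itertools.accumulate(values))


@numba.jit(nopython=True)
def _numba_fast_cusum(values: np.ndarray) -> np.ndarray:
    # O(n): JIT-compiled running total over a preallocated array
    result = np.empty_like(values)
    total = 0.0
    for i in range(values.shape[0]):
        total += values[i]
        result[i] = total
    return result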
    return result_count


if __name__ == '__main__':

    # the_words = ['A', 'A', 'A', 'B', 'B', 'B', 'C'] * 100
    # print(slow_count_occurrences(the_words))
    # print(fast_count_occurrences(the_words))
    # print(defaultdict_fast_count(the_words))
    # print(counter_fast_count(the_words))
    # print(np_fast_count(np.array(the_words)))
    # print(pd_fast_count(pd.Series(the_words)))
    # print(parallel_fast_count(the_words))

    exp_range = ExponentialRange(0, 7, 1 / 4)
    the_words = random_words(exp_range.max)
    the_array = np.array(the_words)
    the_series = pd.Series(the_words)

    with timed_report():
        for i in exp_range.iterator(4):
            slow_count_occurrences(the_words[:i])

        for i in exp_range.iterator():
            fast_count_occurrences(the_words[:i])

        for i in exp_range.iterator():
            defaultdict_fast_count(the_words[:i])

        for i in exp_range.iterator():
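
# The listing above is truncated at the last loop header. For context, hedged
# sketches of two of the counters it exercises follow; the function names are
# taken from the calls above, but the bodies (and the omitted @time_this
# decorators) are assumptions, not the original implementations.
from collections import Counter, defaultdict
from typing import Dict, List


def defaultdict_fast_count(words: List[str]) -> Dict[str, int]:
    # O(n): defaultdict removes the per-key membership test
    result_count: Dict[str, int] = defaultdict(int)
    for word in words:
        result_count[word] += 1
    return dict(result_count)


def counter_fast_count(words: List[str]) -> Dict[str, int]:
    # O(n): Counter performs the same loop in optimized C code
    return dict(Counter(words))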
""" Sizing information for dictionaries """ import sys from utils.profiler import ExponentialRange exp_range = ExponentialRange(0, 7, 1 / 8) for i in exp_range.iterator(): _dict = {j: j**2 for j in range(i)} _dict_size = sys.getsizeof(_dict) print(f'{len(_dict):<8} keys {_dict_size:>12} bytes') # Returns ... # 1 keys 248 bytes # 2 keys 248 bytes # 3 keys 248 bytes # 4 keys 248 bytes # 5 keys 248 bytes # 7 keys 376 bytes # 10 keys 376 bytes # 13 keys 656 bytes # 17 keys 656 bytes # 23 keys 1192 bytes # 31 keys 1192 bytes # 42 keys 1192 bytes # 56 keys 2288 bytes # 74 keys 2288 bytes # 100 keys 4712 bytes # 133 keys 4712 bytes # 177 keys 9328 bytes
    n = len(values)
    # check for descending order
    is_sorted = all(values[i] >= values[i + 1] for i in range(n - 1))
    assert is_sorted, 'values are not sorted.'


def assert_top_k(top_k_values, values):
    assert_sorted(top_k_values)
    kth_value = top_k_values[-1]
    k = len(top_k_values)
    assert sum(v >= kth_value for v in values) == k, \
        'top_k_values are not the k largest values'


if __name__ == '__main__':

    exp_range = ExponentialRange(2, 7, 1 / 4)
    values = random_numeric_list(exp_range.max)

    with timed_report():
        for i in exp_range.iterator():
            _values = values[:i].copy()
            _top_k = naive_find_top_k(_values)
            assert_top_k(_top_k, _values)

        for i in exp_range.iterator():
            _values = values[:i].copy()
            _top_k = heap_find_top_k(_values)
            assert_top_k(_top_k, _values)

        for i in exp_range.iterator():
            _values = values[:i].copy()
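
# A hedged sketch of the heap-based top-k the benchmark calls; the name
# heap_find_top_k comes from the driver above, but this body (and the k=10
# default) is an assumption, not the original implementation.
import heapq
from typing import List


def heap_find_top_k(values: List[float], k: int = 10) -> List[float]:
    # O(n log k): heapq.nlargest maintains a bounded min-heap of size k and
    # returns the k largest values in descending order, which is what
    # assert_top_k expects
    return heapq.nlargest(k, values)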
"""
Generate sample CSV files for file-reading tests
"""
import numpy as np
import pandas as pd
import os
import string
import itertools

from utils.profiler import ExponentialRange

src_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(src_dir, '..', 'data')
target_dir = os.path.join(data_dir, 'big_numeric_csv_files')

# Max size in rows as a power of ten
exp_range = ExponentialRange(0, 7, 1 / 4)
num_cols = 10

col_names = list(string.ascii_uppercase[:num_cols])
_data = np.random.random((exp_range.max, num_cols))
data = pd.DataFrame(_data, columns=col_names)

_letters = string.ascii_uppercase
_file_codes = itertools.product(_letters, repeat=2)
_file_codes = list(_file_codes)[:exp_range.max]
file_codes = [''.join(code) for code in _file_codes]

for j, i in enumerate(exp_range.iterator()):
    code = file_codes[j]
    filename = f'file_{code}_rows_{i}.csv'
    filepath = os.path.join(target_dir, filename)
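
# The listing is truncated inside the write loop. A hedged guess at the
# remaining loop body, assuming pandas' DataFrame.to_csv and that target_dir
# already exists (os.makedirs(target_dir, exist_ok=True) would ensure it):
#
#     data.iloc[:i].to_csv(filepath, index=False)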