def do_calc(df, result_col='ratio', ndigits=2, print_n=100000):
    """Add a rounded ``(z - y) / (y - x)`` column to ``df``.

    The ratio is computed row by row from the ``x``, ``y`` and ``z``
    columns, rounded to ``ndigits`` places and stored under
    ``result_col``.  Progress and timing messages are emitted through
    ``print_msg``.

    Args:
        df: DataFrame providing the ``x``, ``y`` and ``z`` columns.
        result_col: Name of the column that receives the ratios.
        ndigits: Number of digits passed to ``round``.
        print_n: Forwarded to the ``profiled`` decorator (presumably the
            progress-report interval -- confirm against ``profiled``).

    Returns:
        The same ``df`` object, modified in place with the new column.
    """
    row_total = len(df)

    # Keep the per-row function wrapped by ``profiled`` so that the
    # counter named 'calc_counter' sees every row.
    @profiled(total=row_total, print_n=print_n, name='calc_counter')
    def calc(row):
        x, y, z = row['x'], row['y'], row['z']
        numerator = z - y
        denominator = y - x
        return round(numerator / denominator, ndigits)

    print_msg('start calculation...')
    print_msg('total rows = {}'.format(row_total))
    timer = Timer()
    ratios = df.apply(calc, axis=1)
    calc_elapsed = timer.elapse(2)
    print_msg('finish calculation ({}s)'.format(calc_elapsed))

    print_msg('start appending result column...')
    timer.reset()
    df[result_col] = ratios
    append_elapsed = timer.elapse(2)
    print_msg('finish appending result column ({}s)'.format(append_elapsed))

    return df
def main():
    """Run the full pipeline from source data to saved results.

    Steps, in order: generate ``data/org_data.csv`` from
    ``data/data.xlsx`` when the CSV does not exist yet, compute the
    ratio column, sort by ``ratio``, optionally dump the sorted data,
    group by ``ratio``, then hand the groups to ``find_max_occur`` and
    write its result with ``save_result``.  Each stage logs its timing
    through ``print_msg``.
    """
    input_dir = 'data'
    output_dir = 'result'
    xlsx_path = os.path.join(input_dir, 'data.xlsx')
    csv_path = os.path.join(input_dir, 'org_data.csv')
    sorted_csv_path = os.path.join(output_dir, 'data_sorted.csv')
    max_occur_dir = os.path.join(output_dir, 'max_occur')

    # Flip to True to also write the sorted data to disk.
    save_data_sorted = False

    print_msg('start running...')
    overall_timer = Timer()

    # The CSV is only generated once; later runs reuse the existing file.
    csv_missing = not os.path.exists(csv_path)
    if csv_missing:
        gen_org_data(out_file=csv_path, xlsx_file=xlsx_path, print_n=100000)

    frame = do_calc(csv_path, ndigits=2, print_n=100000)

    print_msg('start sorting values...')
    step_timer = Timer()
    data_sorted = frame.sort_values(by='ratio')
    sort_elapsed = step_timer.elapse(2)
    print_msg('finish sorting values ({}s)'.format(sort_elapsed))

    if save_data_sorted:
        print_msg('start saving sorted data to file...')
        step_timer.reset()
        data_sorted.to_csv(sorted_csv_path, index=False)
        save_elapsed = step_timer.elapse(2)
        print_msg('finish saving sorted data to file ({}s)'.format(
            save_elapsed))

    print_msg('start grouping data...')
    step_timer.reset()
    groups = data_sorted.groupby('ratio')
    group_elapsed = step_timer.elapse(2)
    print_msg('finish grouping data ({}s)'.format(group_elapsed))

    max_occur = find_max_occur(groups, print_n=10000)
    save_result(max_occur, max_occur_dir, print_n=10)

    total_elapsed = overall_timer.elapse(2)
    print_msg('finish running ({}s)'.format(total_elapsed))
def do_calc(filename, result_col='ratio', ndigits=2, print_n=100000):
    """Compute rounded ``(z - y) / (y - x)`` ratios for a CSV file.

    The file is scanned line by line: the first three comma-separated
    fields are parsed as ``x``, ``y`` and ``z`` and the ratio is rounded
    to ``ndigits`` places.  Afterwards the file is loaded with
    ``pd.read_csv`` and the ratios are attached as ``result_col``.
    Progress and timing messages are emitted through ``print_msg``.

    Args:
        filename: Path of the CSV file to process.
        result_col: Name of the column that receives the ratios.
        ndigits: Number of digits passed to ``round``.
        print_n: Forwarded to the ``profiled`` decorator (presumably the
            progress-report interval -- confirm against ``profiled``).

    Returns:
        The DataFrame read from ``filename`` with ``result_col`` added.
    """
    timer = Timer()

    # One line is subtracted from the line count -- presumably the
    # header row; confirm against the file layout.
    print_msg('start counting data rows...')
    row_number = get_lines_number(filename) - 1
    count_elapsed = timer.elapse(2)
    print_msg('finish counting data rows ({}s)'.format(count_elapsed))
    print_msg('total rows = {}'.format(row_number))

    # Keep the per-row function wrapped by ``profiled`` so that the
    # counter named 'calc_counter' sees every parsed row.
    @profiled(total=row_number, print_n=print_n, name='calc_counter')
    def calc(row):
        x, y, z = row['x'], row['y'], row['z']
        numerator = z - y
        denominator = y - x
        return round(numerator / denominator, ndigits)

    print_msg('start calculation...')
    timer.reset()
    ratios = []
    with open(filename) as src:
        for raw_line in src:
            fields = raw_line.split(',')
            try:
                x_val = np.float64(fields[0])
                y_val = np.float64(fields[1])
                z_val = np.float64(fields[2])
            except ValueError:
                # Lines whose leading fields are not numeric (e.g. a
                # header) are skipped.
                # NOTE(review): a non-numeric data row is skipped too, so
                # ``ratios`` may end up shorter than the DataFrame below;
                # lines with fewer than three fields raise IndexError,
                # which is not caught here.
                continue
            ratios.append(calc({'x': x_val, 'y': y_val, 'z': z_val}))
    calc_elapsed = timer.elapse(2)
    print_msg('finish calculation ({}s)'.format(calc_elapsed))

    print_msg('start loading org data...')
    timer.reset()
    df = pd.read_csv(filename)
    load_elapsed = timer.elapse(2)
    print_msg('finish loading org data ({}s)'.format(load_elapsed))

    print_msg('start appending result column...')
    timer.reset()
    df[result_col] = ratios
    append_elapsed = timer.elapse(2)
    print_msg('finish appending result column ({}s)'.format(append_elapsed))

    return df