def gen_org_data(out_file, out_index='x,y,z', xlsx_file='data.xlsx', print_n=10000): """Generate original data for calculation.""" xlsx = pd.ExcelFile(xlsx_file) df = pd.read_excel(xlsx, 'Sheet1') z_set = df['z'].dropna().sort_values() x_y_set = df['y,x'].dropna().sort_values() print_msg('start generating original data...') timer = Timer() with open(out_file, 'w', encoding='utf-8') as f: __gen_org_data(z_set, x_y_set, f, out_index, print_n) print_msg('finish generating original data ({}s)'.format(timer.elapse(2)))
def save_result(rs, out_dir, print_n=10): """Save result.""" @profiled(total=len(rs), print_n=print_n, name='save_rs_counter') def _save_result(r, directory): filename = os.path.join(directory, 'result_{}.csv'.format(r[0])) df = r[1] del df['ratio'] df.to_csv(filename, index=False) print_msg('start saving result...') print_msg('total results = {}'.format(len(rs))) timer = Timer() for x in rs: _save_result(x, out_dir) print_msg('finish saving result ({}s)'.format(timer.elapse(2)))
def find_max_occur(groups, print_n=10000): """Find max occurred results from groups.""" @profiled(total=len(groups), print_n=print_n, name='find_max_counter') def _find_max_occur(grp, max_occur, rs): number = len(grp[1]) if number > max_occur: rs = [grp] max_occur = number elif number == max_occur: rs.append(grp) return rs, max_occur max_ = 0 result = [] print_msg('start finding max occur...') print_msg('total groups = {}'.format(len(groups))) timer = Timer() for group in groups: result, max_ = _find_max_occur(group, max_, result) print_msg('finish finding max occur ({}s)'.format(timer.elapse(2))) return result
def do_calc(df, result_col='ratio', ndigits=2, print_n=100000): """Do calculation.""" @profiled(total=len(df), print_n=print_n, name='calc_counter') def calc(s): x = s['x'] y = s['y'] z = s['z'] return round((z - y) / (y - x), ndigits) print_msg('start calculation...') print_msg('total rows = {}'.format(len(df))) timer = Timer() ratios = df.apply(calc, axis=1) print_msg('finish calculation ({}s)'.format(timer.elapse(2))) print_msg('start appending result column...') timer.reset() df[result_col] = ratios print_msg('finish appending result column ({}s)'.format(timer.elapse(2))) return df
def main(): """Entry.""" src_dir = 'data' out_dir = 'result' result_dir = os.path.join(out_dir, 'max_occur') src_file = os.path.join(src_dir, 'data.xlsx') org_data_file = os.path.join(src_dir, 'org_data.csv') sorted_data_file = os.path.join(out_dir, 'data_sorted.csv') save_data_sorted = False print_msg('start running...') main_timer = Timer() if not os.path.exists(org_data_file): gen_org_data(out_file=org_data_file, xlsx_file=src_file, print_n=100000) print_msg('start loading org data...') timer = Timer() df = pd.read_csv(org_data_file) print_msg('finish loading org data ({}s)'.format(timer.elapse(2))) do_calc(df, ndigits=2, print_n=100000) print_msg('start sorting values...') timer.reset() data_sorted = df.sort_values(by='ratio') print_msg('finish sorting values ({}s)'.format(timer.elapse(2))) if save_data_sorted: print_msg('start saving sorted data to file...') timer.reset() data_sorted.to_csv(sorted_data_file, index=False) print_msg('finish saving sorted data to file ({}s)'.format( timer.elapse(2))) print_msg('start grouping data...') timer.reset() grouped = data_sorted.groupby('ratio') print_msg('finish grouping data ({}s)'.format(timer.elapse(2))) result = find_max_occur(grouped, print_n=10000) save_result(result, result_dir, print_n=10) print_msg('finish running ({}s)'.format(main_timer.elapse(2)))
def do_calc(filename, result_col='ratio', ndigits=2, print_n=100000): """Do calculation.""" def get_row_number(): print_msg('start counting data rows...') rows = get_lines_number(filename) - 1 print_msg('finish counting data rows ({}s)'.format(timer.elapse(2))) return rows timer = Timer() row_number = get_row_number() print_msg('total rows = {}'.format(row_number)) @profiled(total=row_number, print_n=print_n, name='calc_counter') def calc(s): x = s['x'] y = s['y'] z = s['z'] return round((z - y) / (y - x), ndigits) print_msg('start calculation...') timer.reset() ratios = [] with open(filename) as f: for line in f: line = line.split(',') try: data = { 'x': np.float64(line[0]), 'y': np.float64(line[1]), 'z': np.float64(line[2]), } except ValueError: continue ratios.append(calc(data)) print_msg('finish calculation ({}s)'.format(timer.elapse(2))) print_msg('start loading org data...') timer.reset() df = pd.read_csv(filename) print_msg('finish loading org data ({}s)'.format(timer.elapse(2))) print_msg('start appending result column...') timer.reset() df[result_col] = ratios print_msg('finish appending result column ({}s)'.format(timer.elapse(2))) return df