def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()
    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats:
            if delta_t > delta_mod_t:
                continue
            print(
                'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
                .format(fp, delta_t, delta_mod_t))
        else:
            print('Processing: {}'.format(fp))

        c2 = {}
        ii = 0
        for line in open(fp, 'rb'):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if not line.startswith(b'{"_label_cost'):
                continue

            data = ds_parse.json_cooked(line, do_devType=True)
            if data['skipLearn']:
                continue

            # extract date from ts
            d = str(data['ts'][:13], 'utf-8')
            dev = str(data['devType'], 'utf-8')
            if d not in c2:
                c2[d] = {}
            if dev not in c2[d]:
                c2[d][dev] = [0, 0, 0]
            if 'ips' not in c2:
                c2['ips'] = {}
            if d[:10] not in c2['ips']:
                c2['ips'][d[:10]] = [0, 0, 0, 0]

            c2[d][dev][1] += 1
            if data['a'] == 1:
                c2['ips'][d[:10]][1] += 1 / data['p']
                c2['ips'][d[:10]][3] += 1 / data['p'] / data['num_a']
            if data['o'] == 1:
                c2[d][dev][0] += 1
            if data['cost'] != b'0':
                r = -float(data['cost'])
                c2[d][dev][2] += r
                if data['a'] == 1:
                    c2['ips'][d[:10]][0] += r / data['p']
                    c2['ips'][d[:10]][2] += r / data['p'] / data['num_a']

        raw_stats[os.path.basename(fp)] = c2
    print('Log reading time:', time.time() - t0)
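# parse_logs (and the other readers below) only consume "cooked" DSJSON lines,
# i.e. lines beginning with {"_label_cost". The constant below is an illustrative
# sketch of such a record, not taken from a real log: the field names
# ("_label_cost", "Timestamp", "EventId", "a", "p") are the ones the parsers
# above reference, while the values and the context payload "c" are made up.
SAMPLE_COOKED_LINE = (
    b'{"_label_cost":-1,"_label_probability":0.8,"_label_Action":1,'
    b'"_labelIndex":0,"Timestamp":"2018-03-30T23:50:19.0000000Z",'
    b'"EventId":"abc123","a":[1,2],"c":{},"p":[0.8,0.2]}\n'
)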
def get_metadata(self, local_log_path):
    summary_path = local_log_path + '.summary'
    for x in open(local_log_path, 'rb'):
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x, do_decode=True)
            with open(summary_path, 'a') as f:
                f.write(json.dumps(data) + '\n')
    os.remove(local_log_path)
    os.rename(summary_path, local_log_path)
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()
    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats:
            if delta_t > delta_mod_t:
                continue
            print(
                'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
                .format(fp, delta_t, delta_mod_t))
        else:
            print('Processing: {}'.format(fp))

        c2 = {}
        ii = 0
        for line in open(fp, 'rb'):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if not line.startswith(b'{"_label_cost'):
                continue

            ei, r, o, ts, p, a, num_a, dev = ds_parse.json_cooked(
                line, do_devType=True)

            # extract date from ts
            d = str(ts[:13], 'utf-8')
            dev = str(dev, 'utf-8')
            if d not in c2:
                c2[d] = {}
            if dev not in c2[d]:
                c2[d][dev] = [0, 0, 0]
            if 'ips' not in c2:
                c2['ips'] = {}
            if d[:10] not in c2['ips']:
                c2['ips'][d[:10]] = [0, 0, 0, 0]

            c2[d][dev][1] += 1
            if a == 1:
                c2['ips'][d[:10]][1] += 1 / p
                c2['ips'][d[:10]][3] += 1 / p / num_a
            if o == 1:
                c2[d][dev][0] += 1
            if r != b'0':
                r = float(r)
                c2[d][dev][2] -= r
                if a == 1:
                    c2['ips'][d[:10]][0] -= r / p
                    c2['ips'][d[:10]][2] -= r / p / num_a

        raw_stats[os.path.basename(fp)] = c2
    print('Log reading time:', time.time() - t0)
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()
    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats and delta_t > delta_mod_t:
            continue
        print(
            'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
            .format(fp, delta_t, delta_mod_t))

        c2 = {}
        ii = 0
        for line in open(fp, encoding="utf8"):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if 'Timestamp' not in line or '_label_cost' not in line:
                continue
            try:
                ei, r, ts, p, a, num_a, dev = ds_parse.json_cooked(
                    line, do_devType=True)

                # extract date from ts
                d = ts[:13]
                if d not in c2:
                    c2[d] = {}
                if dev not in c2[d]:
                    c2[d][dev] = [0, 0, 0]
                if 'ips' not in c2:
                    c2['ips'] = {}
                if d[:10] not in c2['ips']:
                    c2['ips'][d[:10]] = [0, 0]

                c2[d][dev][1] += 1
                c2['ips'][d[:10]][1] += 1
                if r != '0':
                    r = float(r)
                    c2[d][dev][0] += 1
                    c2[d][dev][2] -= r
                    if a == 1:
                        c2['ips'][d[:10]][0] -= r / p
            except Exception as e:
                print('error: {0}'.format(e))

        raw_stats[os.path.basename(fp)] = c2
    print('Log reading time:', time.time() - t0)
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i, x in enumerate(
                gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1, prefix=fp + ' - ')
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')

            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data is None:
                    continue
                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])

        if fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix=fp + ' - ')
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
        print()

    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x] / max(c_imp[x], 1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x, c_clk[x], c_imp[x], ctr[-1],
                                          c_imp_all[x]))
    print()
    return ts, ctr, l
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i,x in enumerate(gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i+1,prefix=fp+' - ')
                else:
                    ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')

            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data['a'] <= 0:
                    continue
                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])

        if fp.endswith('.gz'):
            ds_parse.update_progress(i+1,prefix=fp+' - ')
        else:
            ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')
        print()

    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x]/max(c_imp[x],1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x,c_clk[x],c_imp[x],ctr[-1],c_imp_all[x]))
    print()
    return ts,ctr,l
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):
    t = time.time()
    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    for i, x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(bytes_count, tot_bytes,
                                     'Loading Local file: {} - '.format(local_fp))

        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                gt[ei].setdefault('local_rew', []).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])

    ds_parse.update_progress(tot_bytes, tot_bytes,
                             'Loading Local file: {} - '.format(local_fp))

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [
            azure_fp.path for azure_fp in scantree(azure_path)
            if azure_fp.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    verbose_output = []
    ei_miss_local = 0
    azure_data = []
    for ii, azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i, x in enumerate(
                gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(
                        i + 1, prefix='File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
                else:
                    ds_parse.update_progress(
                        bytes_count, tot_bytes,
                        'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))

            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Local'
                            .format(len(azure_data), ei))
                else:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))

        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(
                i + 1, prefix='File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        else:
            ds_parse.update_progress(
                bytes_count, tot_bytes,
                'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i, ei in enumerate(gt):
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')

        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append(
                        'Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(
                            gt[ei]['i'], ei, gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append(
                                'Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(
                                    gt[ei]['i'], ei, gt[ei]['azure_data']))
                    else:
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append(
                                    'Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'
                                    .format(gt[ei]['i'], ei, gt[ei]['local_rew'][0],
                                            gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Azure'.format(
                                gt[ei]['i'], ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Reward missing from local'.format(
                        gt[ei]['i'], ei))

    ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')
    print()
    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}

    dup_azure = sum((x - 1) * dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x - 1) * dup_rew_counter[x] for x in dup_rew_counter)

    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))

    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(
        len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(
        len(azure_data), dup_azure, dup_azure_counter))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:', sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:', sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx), len_local_rank), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx), len_local_rank), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)
    print('Elapsed time: ', time.time() - t)

    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, hist_bin, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx, hist_bin, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, hist_bin, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
def compute_estimates(log_fp, cats_transformer=None):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()
    online_cressieread = cressieread.Estimator()
    baseline1_cressieread = cressieread.Estimator()
    baselineR_cressieread = cressieread.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_cressieread.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

        if x.startswith(b'{"_label_ca":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked_continuous_actions(x)
            if cats_transformer is None:
                raise RuntimeError(
                    "Not all of the required arguments for running with continuous actions have been provided.")
            # passing logged action as predicted action to transformer
            data = cats_transformer.transform(data, data['a'])
            # passing baseline action as predicted action to transformer
            data_baseline1 = cats_transformer.transform(
                data, cats_transformer.get_baseline1_prediction())

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_mle.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_cressieread.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)
    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baseline1 gaussian ci:', baseline1.get_interval('gaussian'))
    print('baseline1 clopper pearson ci:', baseline1.get_interval('clopper-pearson'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('baselineR gaussian ci:', baselineR.get_interval('gaussian'))
    print('baselineR clopper pearson ci:', baselineR.get_interval('clopper-pearson'))
    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))
    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
    print('online_cressieread:', online_cressieread.get_estimate())
    print('baseline1_cressieread:', baseline1_cressieread.get_estimate())
    print('baselineR_cressieread:', baselineR_cressieread.get_estimate())
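# The estimators above come from external modules (ips_snips, mle, cressieread).
# The class below is a rough, self-contained sketch of what an ips_snips-style
# estimator is assumed to do with the (p_log, r, p_pred) tuples fed to
# add_example. It is NOT the actual library code, just the standard IPS/SNIPS
# formulas implied by the calls above.
class SimpleIpsSnips:
    def __init__(self):
        self.n = 0        # number of logged examples
        self.num = 0.0    # sum of r * w, with w = p_pred / p_log
        self.den = 0.0    # sum of w (SNIPS denominator)

    def add_example(self, p_log, r, p_pred):
        w = p_pred / p_log
        self.n += 1
        self.num += r * w
        self.den += w

    def get_estimate(self, kind='ips'):
        if kind == 'ips':
            return self.num / max(self.n, 1)
        if kind == 'snips':
            return self.num / self.den if self.den > 0 else 0.0
        raise ValueError('unknown estimator type: {}'.format(kind))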
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False):
    print('Computing statistics...')
    local_rank = []
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    for x in open(local_fp, encoding='utf-8'):
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                local_rank.append(ds_parse.local_rank(x))
            elif '/reward/' in x and 'content:' in x:
                local_rew.append(ds_parse.local_reward(x))
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])

    if os.path.isdir(azure_path):
        files = [
            azure_fp.path for azure_fp in scantree(azure_path)
            if azure_fp.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    azure_data = []
    for azure_fp in files:
        for x in open(azure_fp, 'rb'):
            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                azure_data.append([data['ei'], data['cost']])

    local_rank_set = set(local_rank)
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {str(y[0], 'utf-8'): str(y[1], 'utf-8') for y in azure_data}

    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i, x in enumerate(local_rank):
        if x in rew_dict:
            if x in azure_dict:
                if abs(1. + float(azure_dict[x]) / float(rew_dict[x])) > 1e-7:
                    if verbose:
                        print(
                            'Idx: {} - Error in reward: Local: {} Azure: {} - EventId: {}'
                            .format(i + 1, rew_dict[x], azure_dict[x], x))
                    err_rewards_idx.append(i + 1)
            else:
                no_events_idx.append(i + 1)
                if verbose:
                    print('Idx: {} - Ranking missing from Azure - EventId: {}'.format(i + 1, x))
        else:
            no_rewards_idx.append(i + 1)
            if verbose:
                print('Idx: {} - Reward missing from local - EventId: {}'.format(i + 1, x))

    dup_local = len(local_rew) - len(rew_dict)
    dup_azure = len(azure_data) - len(azure_dict)
    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))

    if dup_local > 0:
        print('-----' * 10)
        print('Duplicates in Local rewards')
        dup_analysis(local_rew)
    if dup_azure > 0:
        print('-----' * 10)
        print('Duplicates in Azure Storage')
        dup_analysis(azure_data)

    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len(local_rank), len(local_rank) - len(local_rank_set)))
    print('Events in local_rew: {} (Duplicates: {})'.format(len(local_rew), dup_local))
    print('Events in azure_data: {} (Duplicates: {})'.format(len(azure_data), dup_azure))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:',
          len(local_rank_set.intersection(rew_dict.keys())))
    print('Intersection local_rank/azure_data:',
          len(local_rank_set.intersection(azure_dict.keys())))
    print('Missing EventIds: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx), len(local_rank)), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx), len(local_rank)), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)

    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, 50, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx, 50, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, 50, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
def create_stats(log_fp, dashboard_file, predictions_files=None):
    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i,x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i+1) % 1000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i+1)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            ############################### Aggregates for each bin ######################################
            #
            # 'n':   IPS of numerator
            # 'N':   total number of samples in bin from log (IPS = n/N)
            # 'd':   IPS of denominator (SNIPS = n/d)
            # 'Ne':  number of samples in bin when off-policy agrees with log policy
            # 'c':   max abs. value of numerator's items (needed for Clopper-Pearson confidence intervals)
            # 'SoS': sum of squares of numerator's items (needed for Gaussian confidence intervals)
            #
            #################################################################################################

            # binning timestamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregates for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'N':0,'d':0},
                                                     'baseline1' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0},
                                                     'baselineRand' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}

            # update aggregates for online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['online']['N'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baseline1']['N'] += 1

            d[ts_bin]['baselineRand']['Ne'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['Ne'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                d[ts_bin]['baselineRand']['SoS'] += (r/data['p']/data['num_a'])**2
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])
                    d[ts_bin]['baseline1']['SoS'] += (r/data['p'])**2

            # update aggregates for additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])    # a-1: 0-index action

                d[ts_bin][name]['N'] += 1
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['Ne'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
                        d[ts_bin][name]['SoS'] += (r*p_over_p)**2

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i+1)
    else:
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
    sys.stdout.write("\r" + " "*len_text + "\r")
    sys.stdout.flush()
    print('Read {} lines - Processed {} events'.format(i+1,evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
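# output_dashboard_data (defined elsewhere) consumes the per-bin aggregates
# built above. The function below is only a hedged sketch of the formulas
# implied by the 'n'/'N'/'d'/'SoS' comments, not the actual dashboard code:
# IPS = n/N, SNIPS = n/d, and a Gaussian confidence interval derived from the
# sum of squares of the per-example terms r*w.
import math

def sketch_bin_estimates(agg, z=1.96):
    n, N, den, sos = agg['n'], agg['N'], agg['d'], agg.get('SoS', 0.0)
    ips = n / N if N else 0.0
    snips = n / den if den else 0.0
    # sample variance of the per-example terms r*w, from their sum and sum of squares
    var = (sos - N * ips**2) / (N - 1) if N > 1 else 0.0
    half_width = z * math.sqrt(max(var, 0.0) / N) if N else 0.0
    return {'ips': ips, 'snips': snips,
            'gaussian_ci': (ips - half_width, ips + half_width)}

# e.g. sketch_bin_estimates(d['2018-10-01 13:35']['baseline1'])  # hypothetical bin key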
def compute_estimates(log_fp):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)
    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))
    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
def download_container(app_id, log_dir, start_date=None, end_date=None, overwrite_mode=0, dry_run=False, version=2, verbose=False, create_gzip_mode=-1, delta_mod_t=3600, max_connections=4, confirm=False):

    t_start = time.time()

    print('-----'*10)
    print('Current UTC time: {}'.format(datetime.datetime.now(datetime.timezone.utc)))
    print('Start Date: {}'.format(start_date))
    print('End Date: {}'.format(end_date))
    print('Overwrite mode: {}'.format(overwrite_mode))
    print('dry_run: {}'.format(dry_run))
    print('version: {}'.format(version))
    print('create_gzip_mode: {}'.format(create_gzip_mode))

    if not dry_run:
        os.makedirs(os.path.join(log_dir, app_id), exist_ok=True)

    # Get Azure Storage Authentication
    config = configparser.ConfigParser()
    config.read('ds.config')
    connection_string = config['AzureStorageAuthentication'].get(app_id, config['AzureStorageAuthentication']['$Default'])

    print('-----'*10)

    if version == 1:  # using C# api for uncooked logs
        output_fp = os.path.join(log_dir, app_id, app_id+'_'+start_date.strftime("%Y-%m-%d")+'_'+end_date.strftime("%Y-%m-%d")+'.json')
        print('Destination: {}'.format(output_fp))
        do_download = True
        if os.path.isfile(output_fp):
            if overwrite_mode in {0, 3, 4}:
                print('Output file already exists. Not downloading'.format(output_fp))
                do_download = False
            elif overwrite_mode == 1 and input('Output file already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) not in {'Y', 'y'}:
                do_download = False

        if do_download:
            if dry_run:
                print('--dry_run - Not downloading!')
            else:
                print('Downloading...', end='')
                try:
                    import requests
                    LogDownloaderURL = "https://cps-staging-exp-experimentation.azurewebsites.net/api/Log?account={ACCOUNT_NAME}&key={ACCOUNT_KEY}&start={START_DATE}&end={END_DATE}&container={CONTAINER}"
                    connection_string_dict = dict(x.split('=',1) for x in connection_string.split(';'))
                    if not connection_string_dict['AccountName'] or len(connection_string_dict['AccountKey']) != 88:
                        print("Error: Invalid Azure Storage ConnectionString.")
                        sys.exit()
                    url = LogDownloaderURL.format(ACCOUNT_NAME=connection_string_dict['AccountName'], ACCOUNT_KEY=connection_string_dict['AccountKey'].replace('+','%2b'), CONTAINER=app_id, START_DATE=start_date.strftime("%Y-%m-%d"), END_DATE=(end_date+datetime.timedelta(days=1)).strftime("%Y-%m-%d"))
                    r = requests.post(url)
                    open(output_fp, 'wb').write(r.content)
                    print(' Done!\n')
                except Exception as e:
                    print('Error: {}'.format(e))

    else:  # using BlockBlobService python api for cooked logs
        try:
            print('Establishing Azure Storage BlockBlobService connection...')
            bbs = BlockBlobService(connection_string=connection_string)
            # List all blobs and download them one by one
            print('Getting blobs list...')
            blobs = bbs.list_blobs(app_id)
        except Exception as e:
            if e.args[0] == 'dictionary update sequence element #0 has length 1; 2 is required':
                print("Error: Invalid Azure Storage ConnectionString.")
            elif type(e.args[0]) == str and e.args[0].startswith('The specified container does not exist.'):
                print("Error: The specified container ({}) does not exist.".format(app_id))
            else:
                print("Error:\nType: {}\nArgs: {}".format(type(e).__name__, e.args))
            sys.exit()

        print('Iterating through blobs...\n')
        selected_fps = []
        for blob in blobs:
            if '/data/' not in blob.name:
                if verbose:
                    print('{} - Skip: Non-data blob\n'.format(blob.name))
                continue

            blob_day = datetime.datetime.strptime(blob.name.split('/data/', 1)[1].split('_', 1)[0], '%Y/%m/%d')
            if (start_date and blob_day < start_date) or (end_date and end_date < blob_day):
                if verbose:
                    print('{} - Skip: Outside of date range\n'.format(blob.name))
                continue

            try:
                bp = bbs.get_blob_properties(app_id, blob.name)

                if confirm:
                    if input("{} - Do you want to download [Y/n]? ".format(blob.name)) not in {'Y', 'y'}:
                        print()
                        continue

                fp = os.path.join(log_dir, app_id, blob.name.replace('/','_'))
                selected_fps.append(fp)
                if os.path.isfile(fp):
                    file_size = os.path.getsize(fp)
                    if overwrite_mode == 0:
                        if verbose:
                            print('{} - Skip: Output file already exists\n'.format(blob.name))
                        continue
                    elif overwrite_mode in {1, 3, 4}:
                        if file_size == bp.properties.content_length:  # file size is the same, skip!
                            if verbose:
                                print('{} - Skip: Output file already exists with same size\n'.format(blob.name))
                            continue
                        print('Output file already exists: {}\nLocal size: {:.3f} MB\nAzure size: {:.3f} MB'.format(fp, file_size/(1024**2), bp.properties.content_length/(1024**2)))
                        if overwrite_mode in {3, 4} and file_size > bp.properties.content_length:  # local file size is larger, skip with warning!
                            print('{} - Skip: Output file already exists with larger size\n'.format(blob.name))
                            continue
                        if overwrite_mode == 1 and input("Do you want to overwrite [Y/n]? ") not in {'Y', 'y'}:
                            print()
                            continue
                else:
                    file_size = None

                print('Processing: {} (size: {:.3f}MB - Last modified: {})'.format(blob.name, bp.properties.content_length/(1024**2), bp.properties.last_modified))
                # check if blob was modified in the last delta_mod_t sec
                if datetime.datetime.now(datetime.timezone.utc)-bp.properties.last_modified < datetime.timedelta(0, delta_mod_t):
                    if overwrite_mode < 2:
                        if input("Azure blob currently in use (modified in the last delta_mod_t={} sec). Do you want to download anyway [Y/n]? ".format(delta_mod_t)) not in {'Y', 'y'}:
                            print()
                            continue
                    elif overwrite_mode == 4:
                        print('Azure blob currently in use (modified in the last delta_mod_t={} sec). Skipping!\n'.format(delta_mod_t))
                        continue
                    max_connections = 1  # set max_connections to 1 to prevent crash if azure blob is modified during download

                if dry_run:
                    print('--dry_run - Not downloading!')
                else:
                    t0 = time.time()
                    if overwrite_mode in {3, 4} and file_size:
                        print('Check validity of remote file... ', end='')
                        temp_fp = fp + '.temp'
                        cmpsize = min(file_size,8*1024**2)
                        bbs.get_blob_to_path(app_id, blob.name, temp_fp, max_connections=max_connections, start_range=file_size-cmpsize, end_range=file_size-1)
                        if cmp_files(fp, temp_fp, -cmpsize):
                            print('Valid!')
                            print('Resume downloading to temp file with max_connections = {}...'.format(max_connections))
                            bbs.get_blob_to_path(app_id, blob.name, temp_fp, progress_callback=update_progress, max_connections=max_connections, start_range=os.path.getsize(fp))
                            download_time = time.time()-t0
                            download_size_MB = os.path.getsize(temp_fp)/(1024**2)  # file size in MB
                            print('\nAppending to local file...')
                            with open(fp, 'ab') as f1, open(temp_fp, 'rb') as f2:
                                shutil.copyfileobj(f2, f1, length=100*1024**2)  # writing chunks of 100MB to avoid consuming memory
                            print('Appending completed. Deleting temp file...')
                            os.remove(temp_fp)
                        else:
                            os.remove(temp_fp)
                            print('Invalid! - Skip\n')
                            continue
                        print('Downloaded {:.3f} MB in {:.1f} sec. ({:.3f} MB/sec) - Total elapsed time: {:.1f} sec.\n'.format(download_size_MB, download_time, download_size_MB/download_time, time.time()-t0))
                    else:
                        print('Downloading with max_connections = {}...'.format(max_connections))
                        bbs.get_blob_to_path(app_id, blob.name, fp, progress_callback=update_progress, max_connections=max_connections)
                        download_time = time.time()-t0
                        download_size_MB = os.path.getsize(fp)/(1024**2)  # file size in MB
                        print('\nDownloaded {:.3f} MB in {:.1f} sec. ({:.3f} MB/sec)\n'.format(download_size_MB, download_time, download_size_MB/download_time))
            except Exception as e:
                print('Error: {}'.format(e))

        if create_gzip_mode > -1:
            if selected_fps:
                selected_fps = [x for x in selected_fps if os.path.isfile(x)]
                if create_gzip_mode == 0:
                    models = {}
                    for fp in selected_fps:
                        models.setdefault(os.path.basename(fp).split('_data_',1)[0], []).append(fp)
                    for model in models:
                        models[model].sort(key=lambda x : list(map(int,x.split('_data_')[1].split('_')[:3])))
                        start_date = '-'.join(models[model][0].split('_data_')[1].split('_')[:3])
                        end_date = '-'.join(models[model][-1].split('_data_')[1].split('_')[:3])
                        output_fp = os.path.join(log_dir, app_id, app_id+'_'+model+'_data_'+start_date+'_'+end_date+'.json.gz')
                        print('Concat and zip files of LastConfigurationEditDate={} to: {}'.format(model, output_fp))
                        if os.path.isfile(output_fp) and input('Output file already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) not in {'Y', 'y'}:
                            continue
                        if dry_run:
                            print('--dry_run - Not downloading!')
                        else:
                            with gzip.open(output_fp, 'wb') as f_out:
                                for fp in models[model]:
                                    print('Adding: {}'.format(fp))
                                    with open(fp, 'rb') as f_in:
                                        shutil.copyfileobj(f_in, f_out, length=100*1024**2)  # writing chunks of 100MB to avoid consuming memory
                elif create_gzip_mode == 1:
                    selected_fps.sort(key=lambda x : (list(map(int,x.split('_data_')[1].split('_')[:3])), -os.path.getsize(x), x))
                    selected_fps_merged = []
                    last_fp_date = None
                    for fp in selected_fps:
                        fp_date = datetime.datetime.strptime('_'.join(fp.split('_data_')[1].split('_')[:3]), "%Y_%m_%d")
                        if fp_date != last_fp_date:
                            selected_fps_merged.append(fp)
                            last_fp_date = fp_date

                    start_date = '-'.join(selected_fps_merged[0].split('_data_')[1].split('_')[:3])
                    end_date = '-'.join(selected_fps_merged[-1].split('_data_')[1].split('_')[:3])
                    output_fp = os.path.join(log_dir, app_id, app_id+'_merged_data_'+start_date+'_'+end_date+'.json.gz')
                    print('Merge and zip files of all LastConfigurationEditDate to: {}'.format(output_fp))
                    if not os.path.isfile(output_fp) or input('Output file already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) in {'Y', 'y'}:
                        if dry_run:
                            for fp in selected_fps_merged:
                                print('Adding: {}'.format(fp))
                            print('--dry_run - Not downloading!')
                        else:
                            with gzip.open(output_fp, 'wb') as f_out:
                                for fp in selected_fps_merged:
                                    print('Adding: {}'.format(fp))
                                    with open(fp, 'rb') as f_in:
                                        shutil.copyfileobj(f_in, f_out, length=100*1024**2)  # writing chunks of 100MB to avoid consuming memory
                elif create_gzip_mode == 2:
                    selected_fps.sort(key=lambda x : (list(map(int,x.split('_data_')[1].split('_')[:3])), -os.path.getsize(x), x))
                    start_date = '-'.join(selected_fps[0].split('_data_')[1].split('_')[:3])
                    end_date = '-'.join(selected_fps[-1].split('_data_')[1].split('_')[:3])
                    output_fp = os.path.join(log_dir, app_id, app_id+'_deepmerged_data_'+start_date+'_'+end_date+'.json.gz')
                    print('Merge, unique, sort, and zip files of all LastConfigurationEditDate to: {}'.format(output_fp))
                    if not os.path.isfile(output_fp) or input('Output file already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) in {'Y', 'y'}:
                        d = {}
                        for fn in selected_fps:
                            print('Parsing: {}'.format(fn), end='', flush=True)
                            if not dry_run:
                                for x in open(fn, 'rb'):
                                    if x.startswith(b'{"_label_cost') and x.strip().endswith(b'}'):  # reading only cooked lines
                                        data = ds_parse.json_cooked(x)
                                        if data['ei'] not in d or float(data['cost']) < d[data['ei']][1]:  # taking line with best reward
                                            d[data['ei']] = (data['ts'], float(data['cost']), x)
                            print(' - len(d): {}'.format(len(d)))
                        print('Writing to output .gz file...')
                        if dry_run:
                            print('--dry_run - Not downloading!')
                        else:
                            with gzip.open(output_fp, 'wb') as f:
                                i = 0
                                for x in sorted(d.values(), key=lambda x : x[0]):  # events are sorted by timestamp
                                    f.write(x[2])
                                    i += 1
                                    if i % 5000 == 0:
                                        update_progress(i, len(d))
                                update_progress(i, len(d))
                                print()
                else:
                    print('Unrecognized --create_gzip_mode: {}, skipping creating gzip files.'.format(create_gzip_mode))
            else:
                print('No file downloaded, skipping creating gzip files.')

    print('Total elapsed time: {:.1f} sec.\n'.format(time.time()-t_start))
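# download_container reads Azure credentials from a local 'ds.config' file via
# configparser. Below is a hypothetical sketch of that file's layout: the
# section name and the '$Default' key are taken from the code above, while the
# connection strings are placeholders, not real credentials.
SAMPLE_DS_CONFIG = """\
[AzureStorageAuthentication]
$Default = DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<base64 key>
MyAppId = DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<base64 key>
"""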
def create_stats(log_fp, log_type='cb', d=None, predictions_files=None, is_summary=False, report_progress=True):
    t0 = time.time()

    if d is None:
        d = {}

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp + '.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            if is_summary:
                name = pred_fp.split('/')[-1].split('.')[-2]
            else:
                name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
            if name:
                if log_type == 'cb':
                    pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                elif log_type == 'ccb':
                    with open(pred_fp) as f:
                        pred[name] = []
                        slot = []
                        for x in f:
                            x = x.strip()
                            if x:
                                slot.append(x)
                            else:
                                pred[name].append(slot)
                                slot = []
                print('Loaded {} predictions from {}'.format(len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format(
            [len(pred[name]) for name in pred]))
        sys.exit()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        if report_progress:
            # display progress
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if log_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)

        data = None
        if log_type == 'ccb':
            if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                data = ds_parse.ccb_json_cooked(x)
                aggregates_ccb_data(data, pred, d, evts)
        elif log_type == 'cb':
            if is_summary:
                data = json.loads(x.decode("utf-8"))
            elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)

            # Skip wrongly formatted lines or non-activated lines
            if data is None or data['skipLearn']:
                continue

            aggregates_cb_data(data, pred, d, evts)
        evts += 1

    if report_progress:
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(i + 1)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        sys.stdout.write("\r" + " " * len_text + "\r")
        sys.stdout.flush()
        print('Read {} lines - Processed {} events'.format(i + 1, evts))

    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format(
            [len(pred[name]) for name in pred], evts))
        sys.exit()

    print('Total Elapsed Time: {:.1f} sec.'.format(time.time() - t0))
    return d
def create_stats(log_fp, dashboard_file, predictions_files=None):
    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = 0
    for x in (gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        i += 1
        if i % 5000 == 0:
            if log_fp.endswith('.gz'):
                if i % 20000 == 0:
                    print('.', end='', flush=True)
                    if i % 1000000 == 0:
                        print(' - Iter:',i)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # binning timestamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregate for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'d':0},
                                                     'baseline1' : {'n':0.,'d':0.,'c':0.,'N':0},
                                                     'baselineRand' : {'n':0.,'d':0.,'c':0.,'N':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'d':0.,'c':0.,'N':0}

            # online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['N'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])

            # additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])    # a-1: 0-index action

                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['N'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)

            evts += 1

    if not log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
        sys.stdout.write("\r" + " "*len_text + "\r")
        sys.stdout.flush()
    print('Processed {} events'.format(evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))