def update(files, dt_str=13):
    """Scan DSJson log files and tally clicks/impressions per timestamp bucket.

    Args:
        files: input spec accepted by ds_parse.input_files_to_fp_list
            (file paths / patterns; .gz files are transparently decompressed).
        dt_str: number of leading characters of the event timestamp used as
            the aggregation key (13 chars of an ISO timestamp ~= hourly bins).

    Returns:
        (ts, ctr, l): timestamp keys, matching CTR values, and the list of
        (parsed_data, raw_stripped_line) tuples for activated
        (non-skipLearn) events.
    """
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()      # activated impressions per bucket
    c_clk = collections.Counter()      # clicks (negative cost) per bucket
    c_imp_all = collections.Counter()  # all impressions per bucket
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        i = -1  # keeps the final progress call safe when the file is empty
        # Close the handle deterministically (the original leaked it).
        with (gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')) as f:
            for i, x in enumerate(f):
                bytes_count += len(x)
                if (i + 1) % 1000 == 0:
                    # For .gz the compressed size is not a useful total, so
                    # report line count instead of a byte ratio.
                    if fp.endswith('.gz'):
                        ds_parse.update_progress(i + 1, prefix=fp + ' - ')
                    else:
                        ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
                if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                    data = ds_parse.json_cooked(x)
                    if data is None:  # malformed / unparseable line
                        continue
                    c_imp_all.update([data['ts'][:dt_str]])
                    if not data['skipLearn']:
                        c_imp.update([data['ts'][:dt_str]])
                        l.append((data, x.strip()))
                        if float(data['cost']) < 0:
                            c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix=fp + ' - ')
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
        print()
    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        # max(..., 1) avoids division by zero for buckets with no activated imps
        ctr.append(c_clk[x] / max(c_imp[x], 1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x, c_clk[x], c_imp[x], ctr[-1], c_imp_all[x]))
    print()
    return ts, ctr, l
def update(files, dt_str=13):
    """Scan DSJson log files and tally clicks/impressions per timestamp bucket.

    Variant that also drops events with a non-positive action id.

    Args:
        files: input spec accepted by ds_parse.input_files_to_fp_list.
        dt_str: leading chars of the timestamp used as aggregation key
            (13 ~= hourly bins for ISO timestamps).

    Returns:
        (ts, ctr, l): timestamp keys, CTR values, and (parsed_data, raw_line)
        tuples for activated (non-skipLearn) events.
    """
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()      # activated impressions per bucket
    c_clk = collections.Counter()      # clicks (negative cost) per bucket
    c_imp_all = collections.Counter()  # all impressions per bucket
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        i = -1  # keeps the final progress call safe when the file is empty
        # Close the handle deterministically (the original leaked it).
        with (gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')) as f:
            for i, x in enumerate(f):
                bytes_count += len(x)
                if (i + 1) % 1000 == 0:
                    if fp.endswith('.gz'):
                        ds_parse.update_progress(i + 1, prefix=fp + ' - ')
                    else:
                        ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
                if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                    data = ds_parse.json_cooked(x)
                    # json_cooked can return None for malformed lines (the
                    # sibling variant of this function guards for it); without
                    # this check data['a'] would raise TypeError.
                    if data is None or data['a'] <= 0:
                        continue
                    c_imp_all.update([data['ts'][:dt_str]])
                    if not data['skipLearn']:
                        c_imp.update([data['ts'][:dt_str]])
                        l.append((data, x.strip()))
                        if float(data['cost']) < 0:
                            c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix=fp + ' - ')
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
        print()
    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x] / max(c_imp[x], 1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x, c_clk[x], c_imp[x], ctr[-1], c_imp_all[x]))
    print()
    return ts, ctr, l
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):
    """Reconcile rank/reward requests in a local request log against DSJson
    events downloaded from Azure, then print summary statistics.

    Args:
        local_fp: local log file; lines carry 'status_code:...' fields plus
            '/rank/' and '/reward/' request payloads.
        azure_path: a single downloaded file, or a directory scanned
            (recursively, via scantree) for '.json' files.
        verbose: print per-event diagnostics and extra index dumps.
        plot_hist: show a matplotlib histogram of missing/wrong events.
        hist_bin: number of histogram bins.
    """
    t = time.time()
    gt = {}              # eventId -> {'i': 1-based rank index, 'local_rew': [...], 'azure_data': [...]}
    len_local_rank = 0   # count of rank requests seen locally
    dup_rank = 0         # rank requests whose eventId was already seen
    local_rew = []       # (eventId, reward) pairs from local reward requests
    lines_errs = 0       # status-200 lines that were neither rank nor reward
    err_codes = collections.Counter()  # non-200 status codes
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    # Pass 1: load the local request log.
    for i, x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(bytes_count, tot_bytes, 'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                # NOTE(review): raises KeyError if a reward arrives for an
                # eventId with no prior rank request — confirm logs guarantee
                # rank-before-reward ordering.
                gt[ei].setdefault('local_rew', []).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])
    ds_parse.update_progress(tot_bytes, tot_bytes, 'Loading Local file: {} - '.format(local_fp))
    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [azure_fp.path for azure_fp in scantree(azure_path) if azure_fp.name.endswith('.json')]
    else:
        files = [azure_path]
    verbose_output = []
    ei_miss_local = 0    # events in Azure with no local rank request
    azure_data = []      # (eventId, cost) pairs from Azure files
    # Pass 2: load the Azure DSJson files and join on eventId.
    for ii, azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i, x in enumerate(gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1, prefix='File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes, 'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Local'.format(len(azure_data), ei))
                else:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix='File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, 'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        print()
    print()
    # Pass 3: classify every locally-ranked event.
    dup_azure_counter = collections.Counter()  # multiplicity of duplicated Azure events
    dup_rew_counter = collections.Counter()    # multiplicity of duplicated local rewards
    err_rewards_idx = []   # rank indexes where local and Azure rewards disagree
    no_events_idx = []     # rank indexes missing from Azure
    no_rewards_idx = []    # rank indexes with no local reward
    for i, ei in enumerate(gt):
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append('Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(gt[ei]['i'], ei, gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append('Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(gt[ei]['i'], ei, gt[ei]['azure_data']))
                    else:
                        # Local stores a reward, Azure stores a cost, so a
                        # matching pair should satisfy a + b ~= 0; compare with
                        # a mixed absolute/relative tolerance.
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append('Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'.format(gt[ei]['i'], ei, gt[ei]['local_rew'][0], gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Azure'.format(gt[ei]['i'], ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Reward missing from local'.format(gt[ei]['i'], ei))
    ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')
    print()
    for x in verbose_output:
        print(x)
    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}
    # Count only the extra occurrences of each duplicated event.
    dup_azure = sum((x - 1) * dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x - 1) * dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))
    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(len(azure_data), dup_azure, dup_azure_counter))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:', sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:', sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx), len_local_rank), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx), len_local_rank), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)
    print('Elapsed time: ', time.time() - t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, hist_bin, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx, hist_bin, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, hist_bin, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
def compute_estimates(log_fp, cats_transformer=None):
    """Run off-policy estimators (IPS/SNIPS, MLE, Cressie-Read) over a DSJson
    log and print the resulting estimates and confidence intervals.

    Handles both discrete contextual-bandit lines ('{"_label_cost":...') and
    continuous-action lines ('{"_label_ca":...'); the latter require a
    cats_transformer.

    Args:
        log_fp: path to a DSJson log file ('.gz' supported).
        cats_transformer: transformer for continuous-action events; must
            provide transform(), get_baseline1_prediction() and
            continuous_range.

    Raises:
        RuntimeError: a continuous-action line is seen but cats_transformer
            is None.
    """
    # Init estimators: one (ips_snips, mle, cressieread) triple per policy —
    # 'online' (logged policy), 'baseline1' (always action 1), 'baselineR'
    # (uniform random over actions).
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()
    online_cressieread = cressieread.Estimator()
    baseline1_cressieread = cressieread.Estimator()
    baselineR_cressieread = cressieread.Estimator()
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)
        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)
            if data['skipLearn']:
                continue
            # reward = -cost; cost fields are raw bytes from the parser
            r = 0 if data['cost'] == b'0' else -float(data['cost'])
            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])
            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])
            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_cressieread.add_example(data['p'], r, 1 / data['num_a'])
            evts += 1
        if x.startswith(b'{"_label_ca":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked_continuous_actions(x)
            if cats_transformer is None:
                raise RuntimeError("Not all of the required arguments for running with continuous actions have been provided.")
            # passing logged action as predicted action to transformer
            data = cats_transformer.transform(data, data['a'])
            # passing baseline action as predicted action to transformer
            data_baseline1 = cats_transformer.transform(data, cats_transformer.get_baseline1_prediction())
            if data['skipLearn']:
                continue
            r = 0 if data['cost'] == b'0' else -float(data['cost'])
            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)
            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_mle.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)
            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_cressieread.add_example(data['p'], r, 1.0 / cats_transformer.continuous_range)
            evts += 1
    # Final progress refresh after the loop completes.
    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)
    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))
    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baseline1 gaussian ci:', baseline1.get_interval('gaussian'))
    print('baseline1 clopper pearson ci:', baseline1.get_interval('clopper-pearson'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('baselineR gaussian ci:', baselineR.get_interval('gaussian'))
    print('baselineR clopper pearson ci:', baselineR.get_interval('clopper-pearson'))
    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))
    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
    print('online_cressieread:', online_cressieread.get_estimate())
    print('baseline1_cressieread:', baseline1_cressieread.get_estimate())
    print('baselineR_cressieread:', baselineR_cressieread.get_estimate())
def create_stats(log_fp, dashboard_file, predictions_files=None):
    """Aggregate a DSJson log (plus optional '.pred' policy-prediction files)
    into 5-minute-binned IPS/SNIPS statistics and write dashboard data.

    Args:
        log_fp: path to a DSJson log file ('.gz' supported).
        dashboard_file: output path handed to output_dashboard_data().
        predictions_files: list of '.pred' files; if None, the log's directory
            is scanned for files named '<log_fp>.<policy_name>.pred'.

    Exits the process (sys.exit) on missing prediction files or prediction /
    event count mismatches.
    """
    t0 = time.time()
    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp + '.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)
    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()
    # All prediction files must contain the same number of lines.
    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()
    d = {}  # ts_bin -> policy_name -> aggregate dict (see legend below)
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 1000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)
        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)
            if data['skipLearn']:
                continue
            r = 0 if data['cost'] == b'0' else -float(data['cost'])
            ############################### Aggregates for each bin ######################################
            #
            # 'n': IPS of numerator
            # 'N': total number of samples in bin from log (IPS = n/N)
            # 'd': IPS of denominator (SNIPS = n/d)
            # 'Ne': number of samples in bin when off-policy agrees with log policy
            # 'c': max abs. value of numerator's items (needed for Clopper-Pearson confidence intervals)
            # 'SoS': sum of squares of numerator's items (needed for Gaussian confidence intervals)
            #
            #################################################################################################
            # binning timestamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])
            # initialize aggregates for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online': {'n': 0, 'N': 0, 'd': 0},
                                                     'baseline1': {'n': 0., 'N': 0, 'd': 0., 'Ne': 0, 'c': 0., 'SoS': 0},
                                                     'baselineRand': {'n': 0., 'N': 0, 'd': 0., 'Ne': 0, 'c': 0., 'SoS': 0}})
                for name in pred:
                    d[ts_bin][name] = {'n': 0., 'N': 0, 'd': 0., 'Ne': 0, 'c': 0., 'SoS': 0}
            # update aggregates for online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['online']['N'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baseline1']['N'] += 1
            d[ts_bin]['baselineRand']['Ne'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['Ne'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']
            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                d[ts_bin]['baselineRand']['SoS'] += (r/data['p']/data['num_a'])**2
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])
                    d[ts_bin]['baseline1']['SoS'] += (r/data['p'])**2
            # update aggregates for additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])  # a-1: 0-index action
                d[ts_bin][name]['N'] += 1
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['Ne'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
                        d[ts_bin][name]['SoS'] += (r*p_over_p)**2
            evts += 1
    # Final progress refresh, then blank the progress line.
    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i+1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)
    sys.stdout.write("\r" + " "*len_text + "\r")
    sys.stdout.flush()
    print('Read {} lines - Processed {} events'.format(i+1, evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred], evts))
        sys.exit()
    output_dashboard_data(d, dashboard_file)
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):
    """Reconcile rank/reward requests in a local request log against DSJson
    events downloaded from Azure, then print summary statistics.

    Args:
        local_fp: local log file with 'status_code:...' fields and '/rank/'
            and '/reward/' request payloads.
        azure_path: one downloaded file, or a directory scanned recursively
            (scantree) for '.json' files.
        verbose: print per-event diagnostics and extra index dumps.
        plot_hist: show a matplotlib histogram of missing/wrong events.
        hist_bin: number of histogram bins.
    """
    t = time.time()
    gt = {}              # eventId -> {'i': 1-based rank index, 'local_rew': [...], 'azure_data': [...]}
    len_local_rank = 0   # rank requests seen locally
    dup_rank = 0         # rank requests with an already-seen eventId
    local_rew = []       # (eventId, reward) pairs from local reward requests
    lines_errs = 0       # status-200 lines that were neither rank nor reward
    err_codes = collections.Counter()  # non-200 status codes
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    # Pass 1: load the local request log.
    for i, x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(bytes_count, tot_bytes, 'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                # NOTE(review): raises KeyError if a reward arrives for an
                # eventId never ranked locally — confirm logs guarantee
                # rank-before-reward ordering.
                gt[ei].setdefault('local_rew', []).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])
    ds_parse.update_progress(tot_bytes, tot_bytes, 'Loading Local file: {} - '.format(local_fp))
    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [azure_fp.path for azure_fp in scantree(azure_path) if azure_fp.name.endswith('.json')]
    else:
        files = [azure_path]
    verbose_output = []
    ei_miss_local = 0    # Azure events with no local rank request
    azure_data = []      # (eventId, cost) pairs from Azure files
    # Pass 2: load Azure DSJson files and join on eventId.
    for ii, azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i, x in enumerate(gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i+1, prefix='File {}/{}: {} - '.format(ii+1, len(files), azure_fp))
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes, 'File {}/{}: {} - '.format(ii+1, len(files), azure_fp))
            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Local'.format(len(azure_data), ei))
                else:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i+1, prefix='File {}/{}: {} - '.format(ii+1, len(files), azure_fp))
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, 'File {}/{}: {} - '.format(ii+1, len(files), azure_fp))
        print()
    print()
    # Pass 3: classify every locally-ranked event.
    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []   # rank indexes where local and Azure rewards disagree
    no_events_idx = []     # rank indexes missing from Azure
    no_rewards_idx = []    # rank indexes with no local reward
    for i, ei in enumerate(gt):
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(i+1, len(gt), 'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append('Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(gt[ei]['i'], ei, gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append('Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(gt[ei]['i'], ei, gt[ei]['azure_data']))
                    else:
                        # Local stores a reward, Azure a cost: a matching pair
                        # satisfies a + b ~= 0 within mixed abs/rel tolerance.
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a+b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append('Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'.format(gt[ei]['i'], ei, gt[ei]['local_rew'][0], gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Azure'.format(gt[ei]['i'], ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Reward missing from local'.format(gt[ei]['i'], ei))
    ds_parse.update_progress(i+1, len(gt), 'Evaluating differences - ')
    print()
    for x in verbose_output:
        print(x)
    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}
    # Count only the extra occurrences of each duplicated event.
    dup_azure = sum((x-1)*dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x-1)*dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----'*10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----'*10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----'*10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))
    print('-----'*10)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(len(azure_data), dup_azure, dup_azure_counter))
    print('-----'*10)
    print('Intersection local_rank/local_rew:', sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:', sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx), len_local_rank), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx), len_local_rank), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----'*10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----'*10)
    print('Elapsed time: ', time.time()-t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, hist_bin, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx, hist_bin, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, hist_bin, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
def compute_estimates(log_fp):
    """Run IPS/SNIPS and MLE off-policy estimators over a DSJson log and
    print the resulting estimates.

    Three policies are evaluated: 'online' (the logged policy), 'baseline1'
    (always pick action 1), and 'baselineR' (uniform random over actions).

    Args:
        log_fp: path to a DSJson log file ('.gz' supported).
    """
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = -1  # keeps the post-loop reporting safe when the file is empty
    # Close the handle deterministically (the original leaked it).
    with (gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')) as f:
        for i, x in enumerate(f):
            # display progress
            bytes_count += len(x)
            if (i + 1) % 10000 == 0:
                if log_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)
            # parse dsjson file
            if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data['skipLearn']:
                    continue
                # reward = -cost; cost fields are raw bytes from the parser
                r = 0 if data['cost'] == b'0' else -float(data['cost'])
                # Update estimators with tuple (p_log, r, p_pred)
                online.add_example(data['p'], r, data['p'])
                baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
                baselineR.add_example(data['p'], r, 1 / data['num_a'])
                online_mle.add_example(data['p'], r, data['p'])
                baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
                baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])
                evts += 1
    # Final progress refresh (return value was unused, so not kept).
    if log_fp.endswith('.gz'):
        ds_parse.update_progress(i + 1)
    else:
        ds_parse.update_progress(bytes_count, tot_bytes)
    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))
    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))
    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
def create_act_d(l):
    """Interactively build an action dictionary and per-action CTR stats from
    parsed DSJson events.

    Prompts the user (via input()) for the JSON fields and optional separators
    used to extract a human-readable action name from each action's features.

    Args:
        l: list of (parsed_data, raw_json_bytes) tuples as produced by update().

    Returns:
        (act_d, data, ctr_all): action-name dictionary, per-event records, and
        per-action aggregate stats keyed by action id.
    """
    act_d = {}    # action_name -> (action_id, [event idxs as 1st action], [event idxs otherwise])
    data = []     # per-event tuples (chosen id, model id, cost, model_ind, ...)
    ctr_all = {}  # action_id -> [rew, chosen, avail, rew_ips, chosen_ips, name, Counter(rewards)]
    for i, x in enumerate(l):
        js = json.loads(x[1])
        if i == 0:
            print('These are the actions features from your first event:\n', js['c']['_multi'])
            actions_names_fields = input('\nEnter a (comma separated) list of JSON fields used to extract the action name:').split(',')
            try:
                sep1, sep2 = input('Enter separators to parse the action name string keeping only substring between the separators (comma separated):').split(',')
            # Narrowed from a bare except: only a wrong number of separators
            # raises ValueError here; a bare except also swallowed
            # KeyboardInterrupt/SystemExit during the prompt.
            except ValueError:
                print('Separators not correctly entered - not using separators')
                sep1, sep2 = '', ''
            print('Start parsing...')
        # A (near-)uniform probability vector means no model was involved.
        if max(js['p']) - min(js['p']) > 1e-5:
            model_ind = js['a'][np.argmax(js['p'])] - 1
            vw_model = js.get('VWState', {}).get('m', 'N/A')
        else:
            model_ind = -1
            a_mod = None
            vw_model = 'N/A'
        actions = set()
        temp = []
        for j, y in enumerate(js['c']['_multi']):
            ########### Parsing action features to extract name ########
            action_name = y[actions_names_fields[0]]
            for field in actions_names_fields[1:]:
                action_name = action_name[field]
            if sep1:
                action_name = action_name.split(sep1, 1)[1]
            if sep2:
                action_name = action_name.split(sep2, 1)[0]
            ############################################################
            # NOTE(review): despite the name this is 1 when j > 0, i.e. when
            # the action is NOT in first position — kept as-is.
            is_firstAction = int(j > 0)
            if action_name not in act_d:
                act_d[action_name] = (len(act_d), [], [])
            if action_name not in actions:
                actions.add(action_name)
                act_d[action_name][is_firstAction + 1].append(i)
            if j == js['_labelIndex']:
                a = act_d[action_name][0]
            if j == model_ind:
                a_mod = act_d[action_name][0]
            temp.append(action_name)
            if act_d[action_name][0] not in ctr_all:
                ctr_all[act_d[action_name][0]] = [0, 0, 0, 0, 0, action_name, collections.Counter()]
            ctr_all[act_d[action_name][0]][2] += 1
        data.append((a, a_mod, js['_label_cost'], model_ind, js['_label_Action'], js['Timestamp'], temp, vw_model))
        ctr_all[a][1] += 1
        ctr_all[a][4] += 1 / js['_label_probability']
        ctr_all[a][-1].update([-js['_label_cost']])
        if js['_label_cost'] != 0:
            ctr_all[a][0] -= js['_label_cost']
            ctr_all[a][3] -= js['_label_cost'] / js['_label_probability']
        if (i + 1) % 1000 == 0:
            ds_parse.update_progress(i + 1, len(l))
    ds_parse.update_progress(i + 1, len(l))
    print('\n\nActionId,Rewards,Choosen,Available,Rew. IPS,Choosen IPS,IPS,SNIPS,ActionName')
    for a in range(len(ctr_all)):
        print(','.join(map(str, [a] + ctr_all[a][:-2] + [ctr_all[a][3] / max(ctr_all[a][2], 1), ctr_all[a][3] / max(ctr_all[a][4], 1)] + [ctr_all[a][-2]])))
    print('\nMost Common Rewards')
    rew_list = sorted({x[0] for a in range(len(ctr_all)) for x in ctr_all[a][-1].most_common(10)})
    print(','.join(map(str, ['ActionId'] + rew_list)))
    for a in range(len(ctr_all)):
        print(','.join(map(str, [a] + [ctr_all[a][-1][r] for r in rew_list])))
    return act_d, data, ctr_all
def create_act_d(l):
    """Interactively build an action dictionary and per-action CTR stats from
    parsed DSJson events.

    Prompts the user (via input()) for the JSON fields and optional separators
    used to extract a human-readable action name from each action's features.

    Args:
        l: list of (parsed_data, raw_json_bytes) tuples as produced by update().

    Returns:
        (act_d, data, ctr_all): action-name dictionary, per-event records, and
        per-action aggregate stats keyed by action id.
    """
    act_d = {}    # action_name -> (action_id, [event idxs as 1st action], [event idxs otherwise])
    data = []     # per-event tuples (chosen id, model id, cost, model_ind, ...)
    ctr_all = {}  # action_id -> [rew, chosen, avail, rew_ips, chosen_ips, name, Counter(rewards)]
    for i, x in enumerate(l):
        js = json.loads(x[1])
        if i == 0:
            print('These are the actions features from your first event:\n', js['c']['_multi'])
            actions_names_fields = input(
                '\nEnter a (comma separated) list of JSON fields used to extract the action name:'
            ).split(',')
            try:
                sep1, sep2 = input(
                    'Enter separators to parse the action name string keeping only substring between the separators (comma separated):'
                ).split(',')
            # Narrowed from a bare except: only a wrong number of separators
            # raises ValueError here; a bare except also swallowed
            # KeyboardInterrupt/SystemExit during the prompt.
            except ValueError:
                print('Separators not correctly entered - not using separators')
                sep1, sep2 = '', ''
            print('Start parsing...')
        # A (near-)uniform probability vector means no model was involved.
        if max(js['p']) - min(js['p']) > 1e-5:
            model_ind = js['a'][np.argmax(js['p'])] - 1
            vw_model = js.get('VWState', {}).get('m', 'N/A')
        else:
            model_ind = -1
            a_mod = None
            vw_model = 'N/A'
        actions = set()
        temp = []
        for j, y in enumerate(js['c']['_multi']):
            ########### Parsing action features to extract name ########
            action_name = y[actions_names_fields[0]]
            for field in actions_names_fields[1:]:
                action_name = action_name[field]
            if sep1:
                action_name = action_name.split(sep1, 1)[1]
            if sep2:
                action_name = action_name.split(sep2, 1)[0]
            ############################################################
            # NOTE(review): despite the name this is 1 when j > 0, i.e. when
            # the action is NOT in first position — kept as-is.
            is_firstAction = int(j > 0)
            if action_name not in act_d:
                act_d[action_name] = (len(act_d), [], [])
            if action_name not in actions:
                actions.add(action_name)
                act_d[action_name][is_firstAction + 1].append(i)
            if j == js['_labelIndex']:
                a = act_d[action_name][0]
            if j == model_ind:
                a_mod = act_d[action_name][0]
            temp.append(action_name)
            if act_d[action_name][0] not in ctr_all:
                ctr_all[act_d[action_name][0]] = [
                    0, 0, 0, 0, 0, action_name, collections.Counter()
                ]
            ctr_all[act_d[action_name][0]][2] += 1
        data.append((a, a_mod, js['_label_cost'], model_ind,
                     js['_label_Action'], js['Timestamp'], temp, vw_model))
        ctr_all[a][1] += 1
        ctr_all[a][4] += 1 / js['_label_probability']
        ctr_all[a][-1].update([-js['_label_cost']])
        if js['_label_cost'] != 0:
            ctr_all[a][0] -= js['_label_cost']
            ctr_all[a][3] -= js['_label_cost'] / js['_label_probability']
        if (i + 1) % 1000 == 0:
            ds_parse.update_progress(i + 1, len(l))
    ds_parse.update_progress(i + 1, len(l))
    print(
        '\n\nActionId,Rewards,Choosen,Available,Rew. IPS,Choosen IPS,IPS,SNIPS,ActionName'
    )
    for a in range(len(ctr_all)):
        print(','.join(
            map(str, [a] + ctr_all[a][:-2] + [
                ctr_all[a][3] / max(ctr_all[a][2], 1),
                ctr_all[a][3] / max(ctr_all[a][4], 1)
            ] + [ctr_all[a][-2]])))
    print('\nMost Common Rewards')
    rew_list = sorted({
        x[0]
        for a in range(len(ctr_all)) for x in ctr_all[a][-1].most_common(10)
    })
    print(','.join(map(str, ['ActionId'] + rew_list)))
    for a in range(len(ctr_all)):
        print(','.join(map(str, [a] + [ctr_all[a][-1][r] for r in rew_list])))
    return act_d, data, ctr_all
def create_stats(log_fp, log_type='cb', d=None, predictions_files=None, is_summary=False, report_progress=True):
    """Stream a DSJSON log file and aggregate per-event statistics into ``d``.

    Args:
        log_fp: path to the log file (plain text or .gz).
        log_type: 'cb' (contextual bandit) or 'ccb' (conditional contextual bandit);
                  selects both the line filter and the aggregator called.
        d: aggregate dict to update in place; a new dict is created when None.
        predictions_files: list of '.pred' file paths with per-policy predictions;
                  when None, sibling files named '<log_fp>.*.pred' are discovered.
        is_summary: when True, lines are parsed as full JSON summaries and the
                  policy name is taken from the pred file's basename.
        report_progress: when True, prints a progress bar while reading.

    Returns:
        The updated aggregate dict ``d``.

    Side effects:
        Prints progress/diagnostics; calls ``sys.exit()`` on a missing pred file
        or on a pred-file/event-count mismatch.
    """
    t0 = time.time()
    if d is None:
        d = {}
    if predictions_files is None:
        # Auto-discover prediction files that share the log file's prefix.
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp + '.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            if is_summary:
                name = pred_fp.split('/')[-1].split('.')[-2]
            else:
                name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
            if name:
                if log_type == 'cb':
                    # One prediction per non-empty line.
                    pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                elif log_type == 'ccb':
                    # Blank-line-separated groups: one list of slot predictions per event.
                    with open(pred_fp) as f:
                        pred[name] = []
                        slot = []
                        for x in f:
                            x = x.strip()
                            if x:
                                slot.append(x)
                            else:
                                pred[name].append(slot)
                                slot = []
                print('Loaded {} predictions from {}'.format(len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    # All policies must have predicted the same number of events.
    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        if report_progress:
            # display progress
            # For .gz files the uncompressed size is unknown, so report line
            # count instead of a byte ratio.
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if log_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)

        data = None
        if log_type == 'ccb':
            # Quick byte-level sanity filter before the expensive parse.
            if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                data = ds_parse.ccb_json_cooked(x)
                aggregates_ccb_data(data, pred, d, evts)
        elif log_type == 'cb':
            if is_summary:
                data = json.loads(x.decode("utf-8"))
            elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)

            # Skip wrongly formated lines or not activated lines
            if data is None or data['skipLearn']:
                continue

            aggregates_cb_data(data, pred, d, evts)
        # NOTE(review): for 'ccb', evts is incremented even for lines that
        # failed the byte filter above (no `continue` in that branch) — confirm
        # this is intended.
        evts += 1

    if report_progress:
        # Final progress update, then blank out the progress line.
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(i + 1)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        sys.stdout.write("\r" + " " * len_text + "\r")
        sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(i + 1, evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred], evts))
        sys.exit()
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time() - t0))
    return d
def create_stats(log_fp, dashboard_file, predictions_files=None):
    """Build 5-minute-binned IPS dashboard statistics from a CB DSJSON log.

    NOTE(review): this redefines ``create_stats`` from earlier in the file with a
    different signature — in Python the later definition wins at import time.
    Confirm which one callers expect, or rename one of them.

    Args:
        log_fp: path to the decision-service log file (plain or .gz).
        dashboard_file: output path handed to ``output_dashboard_data``.
        predictions_files: list of '.pred' files with per-policy prediction
            lines; when None, sibling files '<log_fp>.*.pred' are discovered.

    Side effects:
        Writes aggregated dashboard data via ``output_dashboard_data``, prints
        progress, and calls ``sys.exit()`` on missing pred files or on a
        pred-file/event-count mismatch. Returns None.

    Aggregate layout (per 5-min bin, per policy):
        n: IPS-weighted reward numerator, d: IPS denominator,
        c: max single-event IPS-weighted reward (used for confidence bounds),
        N: number of events where the policy matched the logged action.
    """
    t0 = time.time()
    if predictions_files is None:
        # Auto-discover prediction files that share the log file's prefix.
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    # All policies must have predicted the same number of events.
    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = 0
    for x in (gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        i += 1
        if i % 5000 == 0:
            if log_fp.endswith('.gz'):
                # Uncompressed size unknown for .gz: print dots/iteration marks.
                if i % 20000 == 0:
                    print('.', end='', flush=True)
                    if i % 1000000 == 0:
                        print(' - Iter:',i)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        # Cheap byte-prefix filter before the expensive parse.
        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)
            # Reward is the negated cost; 0-cost events carry no reward.
            # NOTE(review): data['cost'] is compared as bytes here — confirm
            # json_cooked returns raw byte slices in this code path.
            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # binning time stamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregate for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'d':0},
                                                     'baseline1' : {'n':0.,'d':0.,'c':0.,'N':0},
                                                     'baselineRand' : {'n':0.,'d':0.,'c':0.,'N':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'d':0.,'c':0.,'N':0}

            # online and baseline policies
            # 'online' tracks the logged policy; 'baseline1' always picks action 1;
            # 'baselineRand' picks uniformly at random over num_a actions.
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['N'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']
            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])

            # additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])    # a-1: 0-index action
                if pred_prob > 0:
                    # Importance weight: policy prob over logged prob.
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['N'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
            evts += 1
    if not log_fp.endswith('.gz'):
        # Blank out the in-place progress line.
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
        sys.stdout.write("\r" + " "*len_text + "\r")
        sys.stdout.flush()
    print('Processed {} events'.format(evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()
    output_dashboard_data(d, dashboard_file)
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))