예제 #1
0
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False):
    """Cross-check a local rank/reward log against Azure data and print a report.

    The local file is scanned for successful (``status_code:200``) rank and
    reward requests; the Azure file(s) are scanned for lines starting with
    ``{"_label_cost":``.  Records are matched by event id and every local
    rank event is classified as: reward missing locally, event missing from
    Azure, or reward mismatch (the Azure cost is expected to be the negated
    local reward, within a relative tolerance of 1e-7).

    Args:
        local_fp: Path of the local log file (read as UTF-8 text).
        azure_path: A single Azure file, or a directory that is walked with
            ``scantree`` and filtered to names ending in ``.json``.
        verbose: When true, additionally print one line per discrepancy, the
            full index lists and, if duplicates exist, ``dup_analysis``.
        plot_hist: When true, show histograms (50 bins) of the discrepancy
            indexes, or a message when there is nothing to plot.

    Returns:
        None.  All results are written to stdout (and to a matplotlib
        window when ``plot_hist`` is set).
    """
    print('Computing statistics...')

    local_rank = []
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    # Context manager so the handle is released even if parsing raises.
    with open(local_fp, encoding='utf-8') as local_file:
        for line in local_file:
            if 'status_code:200' not in line:
                err_codes.update(
                    [ds_parse.extract_field(line, 'status_code:', '\t')])
            elif '/rank/' in line and '"eventId":"' in line:
                local_rank.append(ds_parse.local_rank(line))
            elif '/reward/' in line and 'content:' in line:
                local_rew.append(ds_parse.local_reward(line))
            else:
                # Successful request that is neither a rank nor a reward.
                lines_errs += 1

    if os.path.isdir(azure_path):
        files = [
            entry.path for entry in scantree(azure_path)
            if entry.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    azure_data = []
    for azure_fp in files:
        with open(azure_fp, 'rb') as azure_file:
            for line in azure_file:
                if line.startswith(b'{"_label_cost":'):
                    data = ds_parse.json_cooked(line)
                    azure_data.append([data['ei'], data['cost']])

    local_rank_set = set(local_rank)
    # For duplicated event ids the last occurrence wins in both dicts.
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {str(y[0], 'utf-8'): str(y[1], 'utf-8') for y in azure_data}

    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for idx, ei in enumerate(local_rank, 1):
        if ei not in rew_dict:
            no_rewards_idx.append(idx)
            if verbose:
                print(
                    'Idx: {} - Reward missing from local - EventId: {}'.format(
                        idx, ei))
        elif ei not in azure_dict:
            no_events_idx.append(idx)
            if verbose:
                print('Idx: {} - Ranking missing from Azure - EventId: {}'.
                      format(idx, ei))
        else:
            local_value = float(rew_dict[ei])
            azure_value = float(azure_dict[ei])
            # Same relative test as abs(1 + azure/local) > 1e-7, written
            # without the division so a local reward of 0 cannot raise
            # ZeroDivisionError (it then matches only an Azure cost of 0).
            if abs(local_value + azure_value) > 1e-7 * abs(local_value):
                if verbose:
                    print(
                        'Idx: {} - Error in reward: Local: {} Azure: {} - EventId: {}'
                        .format(idx, rew_dict[ei], azure_dict[ei], ei))
                err_rewards_idx.append(idx)

    separator = '-----' * 10
    dup_local = len(local_rew) - len(rew_dict)
    dup_azure = len(azure_data) - len(azure_dict)
    if verbose:
        print(separator)
        print('Missing events indexes (1-based indexing)\n{}'.format(
            no_events_idx))
        print(separator)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(
            no_rewards_idx))
        print(separator)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(
            err_rewards_idx))
        if dup_local > 0:
            print(separator)
            print('Duplicates in Local rewards')
            dup_analysis(local_rew)
        if dup_azure > 0:
            print(separator)
            print('Duplicates in Azure Storage')
            dup_analysis(azure_data)
    print(separator)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len(local_rank),
        len(local_rank) - len(local_rank_set)))
    print('Events in local_rew: {} (Duplicates: {})'.format(
        len(local_rew), dup_local))
    print('Events in azure_data: {} (Duplicates: {})'.format(
        len(azure_data), dup_azure))
    print(separator)
    print('Intersection local_rank/local_rew:',
          len(local_rank_set.intersection(rew_dict.keys())))
    print('Intersection local_rank/azure_data:',
          len(local_rank_set.intersection(azure_dict.keys())))

    def oldest(indexes):
        # Suffix naming the smallest affected index; empty when none.
        if not indexes:
            return ''
        return ' (oldest 1-base index: {}/{})'.format(min(indexes),
                                                     len(local_rank))

    print('Missing EventIds: {}{}'.format(len(no_events_idx),
                                          oldest(no_events_idx)))
    print('Missing Local Rewards: {}{}'.format(len(no_rewards_idx),
                                               oldest(no_rewards_idx)))
    print('Wrong rewards: {}'.format(len(err_rewards_idx)))
    print(separator)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print(separator)

    if not plot_hist:
        return
    if not (err_rewards_idx or no_events_idx or no_rewards_idx):
        print('Nothing to plot! All is good!')
        return

    plt.rcParams.update({'font.size': 16})  # General font size
    series = (
        ('err_rewards_idx', err_rewards_idx, 'Wrong reward', 'xkcd:orange'),
        ('no_events_idx', no_events_idx, 'No rank', 'xkcd:blue'),
        ('no_rewards_idx', no_rewards_idx, 'No local reward', 'xkcd:red'),
    )
    for name, indexes, label, color in series:
        if indexes:
            hist = plt.hist(indexes, 50, label=label, color=color)
            if verbose:
                print(name, hist)
    plt.title('Missing/Wrong rank and reward requests', fontsize=20)
    plt.xlabel('Request index', fontsize=18)
    plt.ylabel('Bin Count', fontsize=18)
    plt.legend()
    plt.show()
예제 #2
0
def print_stats(local_fp,
                azure_path,
                verbose=False,
                plot_hist=False,
                hist_bin=100):
    """Cross-check a local rank/reward log against Azure data and print a report.

    Successful (``status_code:200``) rank and reward requests from the local
    file are grouped per event id, then enriched with the Azure records
    (lines starting with ``{"_label_cost":``).  Each locally ranked event is
    then classified as: reward missing locally, duplicated local reward,
    missing from Azure, duplicated in Azure, or reward mismatch (Azure cost
    is expected to be the negated local reward).  Progress is reported
    through ``ds_parse.update_progress`` while loading.

    Args:
        local_fp: Path of the local log file (read as UTF-8 text).
        azure_path: A single Azure file (optionally ``.gz``), or a directory
            that is walked with ``scantree`` and filtered to names ending in
            ``.json``.
        verbose: When true, additionally print one line per discrepancy and
            the full index lists.
        plot_hist: When true, show histograms of the discrepancy indexes, or
            a message when there is nothing to plot.
        hist_bin: Bin specification forwarded to ``plt.hist``.

    Returns:
        None.  All results are written to stdout (and to a matplotlib
        window when ``plot_hist`` is set).
    """
    t = time.time()

    # Event id -> {'i': 1-based rank index, 'local_rew': [...],
    #              'azure_data': [(cost, ts), ...]}
    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    rew_miss_rank = 0
    lines_errs = 0
    err_codes = collections.Counter()
    verbose_output = []

    local_prefix = 'Loading Local file: {} - '.format(local_fp)
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    with open(local_fp, encoding='utf-8') as local_file:
        for n_lines, x in enumerate(local_file, 1):
            bytes_count += len(x)
            if n_lines % 10000 == 0:
                ds_parse.update_progress(bytes_count, tot_bytes, local_prefix)
            if 'status_code:200' in x:
                if '/rank/' in x and '"eventId":"' in x:
                    ei = ds_parse.local_rank(x)
                    len_local_rank += 1
                    if ei in gt:
                        dup_rank += 1
                    else:
                        gt[ei] = {'i': len_local_rank}
                elif '/reward/' in x and 'content:' in x:
                    ei, r = ds_parse.local_reward(x)
                    local_rew.append((ei, r))
                    if ei in gt:
                        gt[ei].setdefault('local_rew', []).append(r)
                    else:
                        # Reward for an event with no (earlier) local rank:
                        # count it instead of failing with KeyError.
                        rew_miss_rank += 1
                        if verbose:
                            verbose_output.append(
                                'Idx: {} - EventId: {} - Reward without ranking in Local'
                                .format(len(local_rew), ei))
                else:
                    lines_errs += 1
            else:
                err_codes.update(
                    [ds_parse.extract_field(x, 'status_code:', '\t')])
    ds_parse.update_progress(tot_bytes, tot_bytes, local_prefix)

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [
            entry.path for entry in scantree(azure_path)
            if entry.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    ei_miss_local = 0
    azure_data = []
    for ii, azure_fp in enumerate(files):
        is_gz = azure_fp.endswith('.gz')
        prefix = 'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp)
        bytes_count = 0
        n_lines = 0  # stays 0 for an empty file
        tot_bytes = os.path.getsize(azure_fp)
        opener = gzip.open if is_gz else open
        with opener(azure_fp, 'rb') as azure_file:
            for n_lines, x in enumerate(azure_file, 1):
                bytes_count += len(x)
                if n_lines % 10000 == 0:
                    # getsize() is the compressed size for .gz files, so
                    # only a line count is reported for those.
                    if is_gz:
                        ds_parse.update_progress(n_lines, prefix=prefix)
                    else:
                        ds_parse.update_progress(bytes_count, tot_bytes,
                                                 prefix)

                if x.startswith(b'{"_label_cost":'):
                    data = ds_parse.json_cooked(x)
                    ei = str(data['ei'], 'utf-8')
                    c = str(data['cost'], 'utf-8')
                    azure_data.append((ei, c))
                    if ei not in gt:
                        ei_miss_local += 1
                        if verbose:
                            verbose_output.append(
                                'Idx: {} - EventId: {} - Ranking missing from Local'
                                .format(len(azure_data), ei))
                    else:
                        gt[ei].setdefault('azure_data', []).append(
                            (c, data['ts']))
        if is_gz:
            ds_parse.update_progress(n_lines, prefix=prefix)
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, prefix)
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for n_events, (ei, event) in enumerate(gt.items(), 1):
        if n_events % 10000 == 0:
            ds_parse.update_progress(n_events, len(gt),
                                     'Evaluating differences - ')
        if 'local_rew' not in event:
            no_rewards_idx.append(event['i'])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Reward missing from local'.format(
                        event['i'], ei))
        elif len(event['local_rew']) > 1:
            dup_rew_counter.update([len(event['local_rew'])])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(
                        event['i'], ei, event['local_rew']))
        elif 'azure_data' not in event:
            no_events_idx.append(event['i'])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Ranking missing from Azure'.
                    format(event['i'], ei))
        elif len(event['azure_data']) > 1:
            dup_azure_counter.update([len(event['azure_data'])])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(
                        event['i'], ei, event['azure_data']))
        else:
            a = float(event['local_rew'][0])
            b = float(event['azure_data'][0][0])
            # Azure cost should equal -reward: relative tolerance 1e-7 with
            # an absolute floor of 1e-6.
            if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                err_rewards_idx.append(event['i'])
                if verbose:
                    verbose_output.append(
                        'Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'
                        .format(event['i'], ei, event['local_rew'][0],
                                event['azure_data'][0]))
    if gt:
        # Report completion from len(gt) rather than a leftover loop
        # variable; nothing to report when no event was ranked locally.
        ds_parse.update_progress(len(gt), len(gt),
                                 'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_ids = {y[0] for y in local_rew}
    azure_ids = {y[0] for y in azure_data}

    # A group of k identical ids contributes k - 1 duplicates.
    dup_azure = sum((k - 1) * n for k, n in dup_azure_counter.items())
    dup_rew = sum((k - 1) * n for k, n in dup_rew_counter.items())
    separator = '-----' * 10
    if verbose:
        print(separator)
        print('Missing events indexes (1-based indexing)\n{}'.format(
            no_events_idx))
        print(separator)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(
            no_rewards_idx))
        print(separator)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(
            err_rewards_idx))
    print(separator)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(
        len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(
        len(azure_data), dup_azure, dup_azure_counter))
    print(separator)
    print('Intersection local_rank/local_rew:',
          sum(1 for x in rew_ids if x in gt))
    print('Intersection local_rank/azure_data:',
          sum(1 for x in azure_ids if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    if rew_miss_rank:
        # Only shown for inputs that previously aborted with KeyError.
        print('Local rewards without local rank: {}'.format(rew_miss_rank))

    def oldest(indexes):
        # Suffix naming the smallest affected index; empty when none.
        if not indexes:
            return ''
        return ' (oldest 1-base index: {}/{})'.format(min(indexes),
                                                     len_local_rank)

    print('Missing EventIds from azure: {}{}'.format(len(no_events_idx),
                                                     oldest(no_events_idx)))
    print('Missing Local Rewards: {}{}'.format(len(no_rewards_idx),
                                               oldest(no_rewards_idx)))
    print('Wrong rewards: {}'.format(len(err_rewards_idx)))
    print(separator)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print(separator)
    print('Elapsed time: ', time.time() - t)

    if not plot_hist:
        return
    if not (err_rewards_idx or no_events_idx or no_rewards_idx):
        print('Nothing to plot! All is good!')
        return

    plt.rcParams.update({'font.size': 16})  # General font size
    series = (
        ('err_rewards_idx', err_rewards_idx, 'Wrong reward', 'xkcd:orange'),
        ('no_events_idx', no_events_idx, 'No rank', 'xkcd:blue'),
        ('no_rewards_idx', no_rewards_idx, 'No local reward', 'xkcd:red'),
    )
    for name, indexes, label, color in series:
        if indexes:
            hist = plt.hist(indexes, hist_bin, label=label, color=color)
            if verbose:
                print(name, hist)
    plt.title('Missing/Wrong rank and reward requests', fontsize=20)
    plt.xlabel('Request index', fontsize=18)
    plt.ylabel('Bin Count', fontsize=18)
    plt.legend()
    plt.show()
예제 #3
0
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):
    """Compare local rank/reward requests with Azure records and print the findings.

    The local log contributes successful (``status_code:200``) rank and
    reward requests, keyed by event id; the Azure file(s) contribute lines
    starting with ``{"_label_cost":``.  Every locally ranked event ends up
    in one bucket: no local reward, duplicated local reward, absent from
    Azure, duplicated in Azure, or mismatching reward (the Azure cost is
    expected to be the negated local reward).  Loading progress goes through
    ``ds_parse.update_progress``.

    Args:
        local_fp: Path of the local log file (read as UTF-8 text).
        azure_path: One Azure file (optionally ``.gz``) or a directory that
            is walked with ``scantree``, keeping names ending in ``.json``.
        verbose: Also print per-event discrepancy lines and index lists.
        plot_hist: Show histograms of the discrepancy indexes, or a message
            when there is nothing to plot.
        hist_bin: Bin specification forwarded to ``plt.hist``.

    Returns:
        None.  Output goes to stdout (and a matplotlib window when
        ``plot_hist`` is set).
    """
    t = time.time()

    # Event id -> {'i': 1-based rank index, 'local_rew': [...],
    #              'azure_data': [(cost, ts), ...]}
    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    orphan_rewards = 0
    lines_errs = 0
    err_codes = collections.Counter()
    verbose_output = []

    loading_msg = 'Loading Local file: {} - '.format(local_fp)
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    with open(local_fp, encoding='utf-8') as fin:
        for line_no, x in enumerate(fin, 1):
            bytes_count += len(x)
            if line_no % 10000 == 0:
                ds_parse.update_progress(bytes_count, tot_bytes, loading_msg)
            if 'status_code:200' not in x:
                err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])
                continue
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                if ei in gt:
                    gt[ei].setdefault('local_rew', []).append(r)
                else:
                    # No local rank seen (yet) for this reward: tally it
                    # rather than crash with KeyError.
                    orphan_rewards += 1
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Reward without ranking in Local'.format(len(local_rew), ei))
            else:
                lines_errs += 1
    ds_parse.update_progress(tot_bytes, tot_bytes, loading_msg)

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [entry.path for entry in scantree(azure_path) if entry.name.endswith('.json')]
    else:
        files = [azure_path]

    ei_miss_local = 0
    azure_data = []
    for ii, azure_fp in enumerate(files):
        compressed = azure_fp.endswith('.gz')
        file_msg = 'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp)
        bytes_count = 0
        line_no = 0  # remains 0 when the file has no lines
        tot_bytes = os.path.getsize(azure_fp)
        with (gzip.open(azure_fp, 'rb') if compressed else open(azure_fp, 'rb')) as fin:
            for line_no, x in enumerate(fin, 1):
                bytes_count += len(x)
                if line_no % 10000 == 0:
                    # The on-disk size of a .gz file is the compressed size,
                    # so only the line count is reported in that case.
                    if compressed:
                        ds_parse.update_progress(line_no, prefix=file_msg)
                    else:
                        ds_parse.update_progress(bytes_count, tot_bytes, file_msg)

                if not x.startswith(b'{"_label_cost":'):
                    continue
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei in gt:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))
                else:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Local'.format(len(azure_data), ei))
        if compressed:
            ds_parse.update_progress(line_no, prefix=file_msg)
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, file_msg)
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for seen, (ei, rec) in enumerate(gt.items(), 1):
        if seen % 10000 == 0:
            ds_parse.update_progress(seen, len(gt), 'Evaluating differences - ')
        idx = rec['i']
        if 'local_rew' not in rec:
            no_rewards_idx.append(idx)
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Reward missing from local'.format(idx, ei))
        elif len(rec['local_rew']) > 1:
            dup_rew_counter.update([len(rec['local_rew'])])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(idx, ei, rec['local_rew']))
        elif 'azure_data' not in rec:
            no_events_idx.append(idx)
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Azure'.format(idx, ei))
        elif len(rec['azure_data']) > 1:
            dup_azure_counter.update([len(rec['azure_data'])])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(idx, ei, rec['azure_data']))
        else:
            a = float(rec['local_rew'][0])
            b = float(rec['azure_data'][0][0])
            # Expect cost == -reward: relative tolerance 1e-7, absolute floor 1e-6.
            if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                err_rewards_idx.append(idx)
                if verbose:
                    verbose_output.append(
                        'Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'.format(
                            idx, ei, rec['local_rew'][0], rec['azure_data'][0]))
    if gt:
        # Completion is reported from len(gt), not from a leftover loop
        # variable; skipped when no event was ranked locally.
        ds_parse.update_progress(len(gt), len(gt), 'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_ids = {y[0] for y in local_rew}
    azure_ids = {y[0] for y in azure_data}

    # A group of k identical ids contributes k - 1 duplicates.
    dup_azure = sum((k - 1) * n for k, n in dup_azure_counter.items())
    dup_rew = sum((k - 1) * n for k, n in dup_rew_counter.items())
    rule = '-----' * 10
    if verbose:
        print(rule)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print(rule)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print(rule)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))
    print(rule)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(len(azure_data), dup_azure, dup_azure_counter))
    print(rule)
    print('Intersection local_rank/local_rew:', sum(1 for x in rew_ids if x in gt))
    print('Intersection local_rank/azure_data:', sum(1 for x in azure_ids if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    if orphan_rewards:
        # Only shown for inputs that previously aborted with KeyError.
        print('Local rewards without local rank: {}'.format(orphan_rewards))

    def oldest(indexes):
        # Suffix naming the smallest affected index; empty when none.
        if not indexes:
            return ''
        return ' (oldest 1-base index: {}/{})'.format(min(indexes), len_local_rank)

    print('Missing EventIds from azure: {}{}'.format(len(no_events_idx), oldest(no_events_idx)))
    print('Missing Local Rewards: {}{}'.format(len(no_rewards_idx), oldest(no_rewards_idx)))
    print('Wrong rewards: {}'.format(len(err_rewards_idx)))
    print(rule)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print(rule)
    print('Elapsed time: ', time.time() - t)

    if not plot_hist:
        return
    if not (err_rewards_idx or no_events_idx or no_rewards_idx):
        print('Nothing to plot! All is good!')
        return

    plt.rcParams.update({'font.size': 16})  # General font size
    for name, indexes, label, color in (
            ('err_rewards_idx', err_rewards_idx, 'Wrong reward', 'xkcd:orange'),
            ('no_events_idx', no_events_idx, 'No rank', 'xkcd:blue'),
            ('no_rewards_idx', no_rewards_idx, 'No local reward', 'xkcd:red')):
        if indexes:
            hist = plt.hist(indexes, hist_bin, label=label, color=color)
            if verbose:
                print(name, hist)
    plt.title('Missing/Wrong rank and reward requests', fontsize=20)
    plt.xlabel('Request index', fontsize=18)
    plt.ylabel('Bin Count', fontsize=18)
    plt.legend()
    plt.show()