def add_tokens(logdir, selected_models_file):
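    """Add the token counts found by `search_tokens` to the selected models file.

    For every run in `logdir`, the iteration logs are joined and searched for
    each selected model; the result is written to
    `<selected_models_file>_added_tokens.csv`. `search_tokens` and
    `load_iterations` are assumed to be defined elsewhere in this module.
    """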

    df_models = pd.read_csv(selected_models_file)

    df_models['train_n_tokens'] = df_models['name'].apply(lambda x: int(x.split('_')[-1].replace('tokens', '')))

    if 'tokens_search' not in df_models.columns:
        df_models['tokens_search'] = '[0 0 0]'

    dirlist = os.listdir(logdir)

    for run in dirlist:

        if '.csv' in run or '.json' in run or run == 'results':
            continue

        run_dir = os.path.join(logdir, run)
        print(f'Working on: {run}')
        with open(os.path.join(run_dir, 'config.json'), encoding='utf-8') as f:
            config_run = json.load(f)

        output = config_run['task']['dataset']['output'][:4]
        case = ''.join([letter for letter in config_run['task']['dataset']['name'] if not letter.isnumeric()])
        sw = config_run['task']['dataset']['skip_wall']
        ntok = config_run['prior']['length']['max_']

        run_name = f'{output}_{case}_sw{sw}_{ntok}tokens'

        results = load_iterations(os.path.join(logdir, run))

        df_joined = pd.DataFrame()

        for key in results:
            df_joined = pd.concat([df_joined, results[key]], axis=0, ignore_index=True)

        if 'tokens' not in df_joined.columns:
            continue

        df_joined['r_sum'] = df_joined['r_max_PH'] + df_joined['r_max_CD'] + df_joined['r_max_CBFS']

        df_models['tokens_search'] = df_models.apply(lambda x: search_tokens(x, df_joined), axis=1)

    filename = selected_models_file[:-4] + '_added_tokens.csv'
    df_models.to_csv(filename, index=False)


def fetch_iteration_metrics(logdir, finished=True):
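    """Collect the per-iteration metrics of all runs in `logdir` into arrays.

    Shorter runs are padded with their final value so that all runs have equal
    length. Returns a dict mapping metric name to a list of arrays. The
    `finished` flag is currently unused.
    """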

    results = load_iterations(logdir)

    available_metrics = results[list(results.keys())[0]].columns.values
    # default metric selection (assumed here; the original list was not defined in this snippet)
    plot_metrics = ['base_r_best', 'base_r_max', 'r_best_full', 'r_max_full']
    plot_metrics = [metric for metric in plot_metrics if metric in available_metrics]

    # length of the longest run; shorter runs are padded to this length below
    n_iter = max(value.shape[0] for value in results.values())

    plot_dict = {metric: [] for metric in plot_metrics}

    for key in results:
        for metric in plot_metrics:
            if len(results[key][metric].values) == n_iter:
                plot_dict[metric].append(results[key][metric].values)
            else:
                # pad shorter runs by repeating their final value up to n_iter
                short_arr = results[key][metric].values
                extended_arr = short_arr[-1] * np.ones(n_iter)
                extended_arr[:short_arr.shape[0]] = short_arr
                plot_dict[metric].append(extended_arr)

    # if the r_max_full or r_best_full are nonzero, overwrite base_r_best and base_r_max with full dataset stats
    if 'r_max_full' in plot_metrics:
        if not (plot_dict['r_max_full'][0] == 0).all():
            plot_dict['base_r_max'] = plot_dict['r_max_full']
        del plot_dict['r_max_full']

    if 'r_best_full' in plot_metrics:
        if not (plot_dict['r_best_full'][0] == 0).all():
            plot_dict['base_r_best'] = plot_dict['r_best_full']
        del plot_dict['r_best_full']

    return plot_dict


def create_plots_for_increasing_n_iterations():
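    """Plot reward statistics as a function of the number of repeated runs (work in progress)."""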
    logdir = '../logs_completed/compare_baselines'

    dirlist = os.listdir(logdir)

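    # statistics below are computed over the first 10, 20, ..., 100 repeated runs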
    iterations = np.arange(10, 110, 10)

    metrics = {}
    all_rewards = []
    for run in dirlist:
        results = load_iterations(os.path.join(logdir, run))

        basename = '_'.join(
            list(results.keys())[0].split('.')[0].split('_')[:-1])

        metrics[run] = {'mean': [], 'std': [], 'max': []}

        rewards_sorted = []
        for ii in range(len(results)):
            # final base_r_best of each repeated run
            final_reward = results[f'{basename}_{ii+1}.csv']['base_r_best'].values[-1]
            rewards_sorted.append(final_reward)
            all_rewards.append(final_reward)

        for nit in iterations:
            metrics[run]['mean'].append(np.mean(rewards_sorted[:nit]))
            metrics[run]['std'].append(np.std(rewards_sorted[:nit]))
            metrics[run]['max'].append(np.max(rewards_sorted[:nit]))

    from scipy.stats import norm
    mu, std = norm.fit(all_rewards)

    plt.figure()
    # normalise the histogram so the fitted normal PDF below is on the same scale
    plt.hist(all_rewards, bins=20, density=True)

    # Plot the PDF.
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)

    plt.plot(x, p, 'k', linewidth=2)

    # mark the mean and +/- 1 and 2 standard deviations
    plt.axvline(x=np.mean(all_rewards), color='C1')
    plt.axvline(x=np.mean(all_rewards) + np.std(all_rewards), color='C2')
    plt.axvline(x=np.mean(all_rewards) + 2 * np.std(all_rewards), color='C3')
    plt.axvline(x=np.mean(all_rewards) - np.std(all_rewards), color='C2')
    plt.axvline(x=np.mean(all_rewards) - 2 * np.std(all_rewards), color='C3')

    plt.show()

    # one figure per statistic (mean, std, max), one line per run directory
    for stat in ['mean', 'std', 'max']:
        plt.figure()
        for key in metrics:
            plt.plot(iterations, metrics[key][stat], label=key)
        plt.title(stat)
        plt.legend()

    plt.show()


def plot_optimise_statistics():
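    """Analyse how quickly the constant optimiser matches the batch's best performer.

    Reads the *_stats.csv files in the log directory, plots histograms of the
    iteration at which the best performer was matched, and compares rewards and
    runtime of constrained vs unconstrained optimisation runs.
    """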
    logdir = '../logs_completed/log_2022-03-20-182735_optimize_statistics'

    matplotlib.use('tkagg')

    optim_stats = {}
    for filename in os.listdir(logdir):
        split = filename.split('_')[-1].split('.')
        if split[0] == 'stats' and split[1] == 'csv':
            df_append = pd.read_csv(f'{logdir}/{filename}', header=None)
            optim_stats[filename] = df_append

    all_match_bestperformer = []
    first_bestperformer = []
    last_bestperformer = []
    for key in optim_stats:
        # the first column holds the iteration at which each batch matched its best performer
        values = optim_stats[key].iloc[:, 0].values
        all_match_bestperformer.extend(values)
        first_bestperformer.extend(values[:300])
        last_bestperformer.extend(values[300:])

    all_match_bestperformer = np.array(all_match_bestperformer)
    print(
        f"ratio of batches that found the best performer in fewer than 100 iterations: {np.mean(all_match_bestperformer < 100)}"
    )

    bins = np.arange(0, 2000, 10)

    plt.figure()
    plt.hist(x=all_match_bestperformer, bins=bins)
    plt.xlim([0, 200])
    plt.show()

    plt.figure()
    plt.title('first300')
    plt.hist(x=first_bestperformer, bins=bins, density=True)
    plt.xlim([0, 200])
    plt.show()

    plt.figure()
    plt.title('last300')
    plt.hist(x=last_bestperformer, bins=bins, density=True)
    plt.xlim([0, 200])
    plt.show()

    # stack the per-file match tables (iterations x iteration-limit bins) into one 3D array
    first_df = optim_stats[list(optim_stats.keys())[0]]
    batches_match = np.zeros(
        (first_df.shape[0], first_df.shape[1] - 1, len(optim_stats)))

    for ii, key in enumerate(optim_stats):
        batches_match[:, :, ii] = optim_stats[key].values[:, 1:]

    mean_batch_match = np.mean(batches_match, axis=-1)

    figsize = (12, 9)
    cm = 1 / 2.54  # centimeters in inches
    plt.figure(figsize=tuple([val * cm for val in list(figsize)]))
    plot_iters = [0, 100, 300, 500]
    colors = [f'C{ii+1}' for ii in range(len(plot_iters))]
    colors[-1] = 'C6'  # the purple is difficult to see over the blue hist
    linestyles = [
        '-', (0, (5, 1)), (0, (1, 1)), (0, (3, 1, 1, 1)), '-.',
        (0, (3, 1, 1, 1, 1, 1))
    ]

    for counter, ii in enumerate(plot_iters):
        plt.plot(bins,
                 100 * mean_batch_match[ii, :],
                 label=f'Iter: {ii}',
                 color=colors[counter],
                 linestyle=linestyles[counter],
                 linewidth=2)
    plt.xlabel('Iteration limit')
    plt.xlim([0, 200])
    plt.xticks(np.arange(0, 220, 20))
    plt.grid(True, which='both', linestyle=':')
    plt.legend()

    ax1 = plt.gca()
    ax1.set_ylabel('Match percentage')
    ax2 = ax1.twinx()
    ax1.set_zorder(10)
    ax1.patch.set_visible(False)
    ax2.hist(x=all_match_bestperformer,
             bins=bins,
             density=True,
             zorder=-1,
             label='All')
    # opacity doesn't work with eps, so the 'last 300' histogram overlay is left out
    ax2.set_ylabel('Probability density')

    plt.savefig('../logs_completed/aa_plots/iterlim_prob_dens_batch_match.eps',
                format='eps',
                bbox_inches='tight')

    plt.show()

    # make plot of duration and max and mean rewards

    logdir = '../logs_completed/compare_iterlim_optimisation/log_2022-01-19-154202_LR01'
    lim_stats = load_iterations(logdir)

    logdir = '../logs_completed/compare_iterlim_optimisation/log_2022-03-22-105724_unconstrained_optimisation'
    unlim_stats = load_iterations(logdir)

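    # gather per-run reward and processing-time traces for both settings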
    lim_base_r_arr = []
    lim_duration_arr = []
    for run in lim_stats:
        lim_base_r_arr.append(lim_stats[run]['base_r_best'].values)
        lim_duration_arr.append(lim_stats[run]['proc_time'].values)

    lim_base_r_arr = np.array(lim_base_r_arr)
    lim_duration_arr = np.array(lim_duration_arr)

    unlim_base_r_arr = []
    unlim_duration_arr = []
    for run in unlim_stats:
        unlim_base_r_arr.append(unlim_stats[run]['base_r_best'].values)
        unlim_duration_arr.append(unlim_stats[run]['proc_time'].values)

    unlim_base_r_arr = np.array(unlim_base_r_arr)
    unlim_duration_arr = np.array(unlim_duration_arr)

    figsize = (12, 9)
    cm = 1 / 2.54  # centimeters in inches
    plt.figure(figsize=tuple([val * cm for val in list(figsize)]))
    plt.plot(np.max(lim_base_r_arr, axis=0),
             label=r'$r_{max}$ Constrained',
             linewidth=2,
             linestyle='-')
    plt.plot(np.max(unlim_base_r_arr, axis=0),
             label=r'$r_{max}$ Unconstrained',
             linewidth=2,
             linestyle=(0, (5, 1)))
    plt.plot(np.mean(lim_base_r_arr, axis=0),
             label=r'$r_{mean}$ Constrained',
             linewidth=2,
             linestyle=(0, (1, 1)))
    plt.plot(np.mean(unlim_base_r_arr, axis=0),
             label=r'$r_{mean}$ Unconstrained',
             linewidth=2,
             linestyle=(0, (3, 1, 1, 1)))
    plt.xlim([0, 600])
    plt.legend(loc='lower right')
    plt.grid(True, which='both', linestyle=':')
    plt.xlabel('Iterations')
    plt.ylabel(r'$r\;(\tau)$')
    plt.savefig('../logs_completed/aa_plots/iterlim_rewards.eps',
                format='eps',
                bbox_inches='tight')

    # compare total optimisation time: constrained (clipped to 600 iterations) vs unconstrained
    clipped_lim_duration_arr = lim_duration_arr[:, :600]
    print(
        np.mean(np.sum(clipped_lim_duration_arr, axis=1)) /
        np.mean(np.sum(unlim_duration_arr, axis=1)))


def plot_ntokens_r_max(logdir):
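    """Plot the maximum reward per training case against expression token count.

    Assumes the helpers `load_iterations`, `check_expression_dim` and
    `count_tokens`, plus `Dimension`, `exp` and `log`, are defined elsewhere in
    this module.
    """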

    dirlist = os.listdir(logdir)

    tokens = []
    r_max_PH = []
    r_max_CD = []
    r_max_CBFS = []

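    # map each input variable to its physical dimension so candidate expressions
    # can be checked for dimensional consistency (m: length, s: time)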
    dim_dict = {'exp': exp,
                'log': log}

    m = Dimension('length')
    s = Dimension('time')

    input_dims = {"grad_u_T1": 1 / s,
                  "grad_u_T2": 1 / s,
                  "grad_u_T3": 1 / s,
                  "grad_u_T4": 1 / s,
                  "k": (m ** 2) / (s ** 2),
                  "inv1": m / m,
                  "inv2": m / m,
                  "T1": m / m,
                  "T2": m / m,
                  "T3": m / m,
                  "T4": m / m}

    for run in dirlist:

        if '.csv' in run:
            continue

        run_dir = os.path.join(logdir, run)
        print(f'Working on: {run}')
        with open(os.path.join(run_dir, 'config.json'), encoding='utf-8') as f:
            config_run = json.load(f)

        output = config_run['task']['dataset']['output'][:4]
        case = ''.join([letter for letter in config_run['task']['dataset']['name'] if not letter.isnumeric()])
        sw = config_run['task']['dataset']['skip_wall']
        ntok = config_run['prior']['length']['max_']

        run_name = f'{output}_{case}_sw{sw}_{ntok}tokens'

        results = load_iterations(os.path.join(logdir, run))

        df_joined = pd.DataFrame()

        for key in results:
            df_joined = pd.concat([df_joined, results[key]], axis=0, ignore_index=True)

        df_joined['r_sum'] = df_joined['r_max_PH'] + df_joined['r_max_CD'] + df_joined['r_max_CBFS']

        inputs = config_run['task']['dataset']['input']
        for ii in range(len(inputs)):
            dim_dict[f'x{ii + 1}'] = input_dims[inputs[ii]]

        df_joined['dimensions'] = df_joined.apply(lambda x: check_expression_dim(x['batch_r_max_expression'], dim_dict), axis=1)

        if output == 'kDef':
            target_dim = (0, 2, -3, 0, 0, 0, 0)
        elif output == 'bDel':
            target_dim = (0, 0, 0, 0, 0, 0, 0)
        else:
            raise ValueError(f'Unknown output: {output}')

        df_joined = df_joined[df_joined['dimensions'] == target_dim]

        df_joined['name'] = run_name
        df_joined['output'] = output
        df_joined['training_case'] = case
        df_joined['skip_wall'] = sw

        if 'tokens' in df_joined.columns:
            df_joined['ntokens'] = df_joined.apply(lambda x: count_tokens(x['tokens'], ntok), axis=1)
        else:
            df_joined['ntokens'] = ntok

        tokens.append(df_joined['ntokens'].values)
        r_max_PH.append(df_joined['r_max_PH'].values)
        r_max_CD.append(df_joined['r_max_CD'].values)
        r_max_CBFS.append(df_joined['r_max_CBFS'].values)

    tokens = np.concatenate(tokens, axis=0)
    r_max_PH = np.concatenate(r_max_PH, axis=0)
    r_max_CD = np.concatenate(r_max_CD, axis=0)
    r_max_CBFS = np.concatenate(r_max_CBFS, axis=0)

    sorted_tokens = []
    sorted_r_max_PH = []
    sorted_r_max_CD = []
    sorted_r_max_CBFS = []

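    # for every distinct token count, keep the model with the best PH reward and
    # record that same model's rewards on the other two cases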
    for token in np.unique(tokens):
        sorted_tokens.append(token)
        best_model_PH = np.argmax(r_max_PH[tokens == token])
        sorted_r_max_PH.append(r_max_PH[tokens == token][best_model_PH])
        sorted_r_max_CD.append(r_max_CD[tokens == token][best_model_PH])
        sorted_r_max_CBFS.append(r_max_CBFS[tokens == token][best_model_PH])

    markersize = 25
    lw = 2
    width = 7
    figsize = (width, 3*width/4)
    cm = 1 / 2.54  # centimeters in inches

    plt.figure(figsize=tuple([val*cm for val in list(figsize)]))
    plt.xlabel(r"$n_{tokens}$")
    plt.ylabel(r"$r_{max}$")
    plt.xticks(np.arange(0, 25, 2))
    plt.yticks(np.arange(0, 1, 0.05))
    ax = plt.gca()
    ax.set_axisbelow(True)
    plt.grid(True, which='both', linestyle=':')
    plt.plot(sorted_tokens, sorted_r_max_CD, label='$CD_{12600}$', c='C1', linestyle='--', linewidth=lw, marker='^')
    plt.plot(sorted_tokens, sorted_r_max_CBFS, label='$CBFS_{13700}$', c='C2', linestyle=':', linewidth=lw, marker='v')
    plt.plot(sorted_tokens, sorted_r_max_PH, label='$PH_{10595}$', c='C0', linestyle=(0, (3, 1, 1, 1)), linewidth=lw, marker='d')

    # reorder the legend entries so PH is listed first
    order = [2, 0, 1]
    handles, labels = ax.get_legend_handles_labels()
    plt.legend(handles=[handles[idx] for idx in order], labels=[labels[idx] for idx in order],
               ncol=3, loc='center', bbox_to_anchor=(0.5, 1.15), prop={'size': 8})

    plt.savefig(f'../logs_completed/aa_plots/ntokens_r_max{logdir.split("/")[-1]}.eps', format='eps', bbox_inches='tight')


def summarise_results(logdir):
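    """Summarise the best expressions of every run in `logdir` into results.csv.

    Per run, the iteration logs are joined, filtered for dimensional
    consistency, and ranked by reward; the top candidates are collected in a
    single table that is written to results.csv.
    """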

    dirlist = os.listdir(logdir)
    try:
        dirlist.remove('results.csv')
    except ValueError:
        pass

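    # map each input variable to its physical dimension so candidate expressions
    # can be checked for dimensional consistency (m: length, s: time)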
    dim_dict = {'exp': exp,
                'log': log}

    m = Dimension('length')
    s = Dimension('time')

    input_dims = {"grad_u_T1": 1 / s,
                  "grad_u_T2": 1 / s,
                  "grad_u_T3": 1 / s,
                  "grad_u_T4": 1 / s,
                  "k": (m ** 2) / (s ** 2),
                  "inv1": m / m,
                  "inv2": m / m,
                  "T1": m / m,
                  "T2": m / m,
                  "T3": m / m,
                  "T4": m / m}

    df_results = pd.DataFrame()

    for run in dirlist:

        if '.csv' in run:
            continue

        run_dir = os.path.join(logdir, run)
        print(f'Working on: {run}')
        with open(os.path.join(run_dir, 'config.json'), encoding='utf-8') as f:
            config_run = json.load(f)

        output = config_run['task']['dataset']['output'][:4]
        case = ''.join([letter for letter in config_run['task']['dataset']['name'] if not letter.isnumeric()])
        sw = config_run['task']['dataset']['skip_wall']
        ntok = config_run['prior']['length']['max_']

        run_name = f'{output}_{case}_sw{sw}_{ntok}tokens'

        results = load_iterations(os.path.join(logdir, run))

        df_joined = pd.DataFrame()

        for key in results:
            df_joined = pd.concat([df_joined, results[key]], axis=0, ignore_index=True)

        inputs = config_run['task']['dataset']['input']
        for ii in range(len(inputs)):
            dim_dict[f'x{ii+1}'] = input_dims[inputs[ii]]

        # drop rows where no best expression was recorded
        df_joined = df_joined[~df_joined['batch_r_max_expression'].isna()]

        df_joined['r_sum'] = df_joined['r_max_PH'] + df_joined['r_max_CD'] + df_joined['r_max_CBFS']

        if output == 'kDef':
            df_joined['dimensions'] = df_joined.apply(
                lambda x: check_expression_dim(x['batch_r_max_expression'], dim_dict), axis=1)
            target_dim = (0, 2, -3, 0, 0, 0, 0)
        elif output == 'bDel':
            # bDel is nondimensional, so every expression trivially matches the target
            target_dim = (0, 0, 0, 0, 0, 0, 0)
            df_joined['dimensions'] = [target_dim for _ in df_joined.index]
        else:
            raise ValueError(f'Unknown output: {output}')

        df_joined = df_joined.drop_duplicates(subset=['batch_r_max_expression'])
        df_joined['converted_expression'] = df_joined.apply(lambda x: convert_expression(x['batch_r_max_expression'], inputs), axis=1)

        df_joined['name'] = run_name
        df_joined['output'] = output
        df_joined['training_case'] = case
        df_joined['skip_wall'] = sw

        if 'tokens' in df_joined.columns:
            df_joined['ntokens'] = df_joined.apply(lambda x: count_tokens(x['tokens'], ntok), axis=1)
        else:
            df_joined['ntokens'] = ntok

        df_right_dim = df_joined[df_joined['dimensions'] == target_dim]
        df_right_dim = df_right_dim.drop_duplicates(subset=['batch_r_max_expression'])
        df_right_dim['correct_dim'] = True

        # add the models that score best summed over all cases
        df_best = df_right_dim.sort_values('r_sum', ascending=False).head(70)
        df_best['rank'] = np.arange(len(df_best))
        df_best['ranked_by'] = 'r_sum'
        df_results = pd.concat([df_results, df_best], axis=0, ignore_index=True)

        # add the models that score best on the training case alone
        df_best = df_right_dim.sort_values(f'r_max_{case}', ascending=False).head(70)
        df_best['rank'] = np.arange(len(df_best))
        df_best['ranked_by'] = f'r_max_{case}'
        df_results = pd.concat([df_results, df_best], axis=0, ignore_index=True)

        # store the PH rewards of this run and plot their distribution
        save_arr = df_joined['r_max_PH'].values
        np.savetxt(os.path.join(logdir, f'LR{config_run["controller"]["learning_rate"]}_ent{config_run["controller"]["entropy_weight"]}_rewards.csv'), save_arr, delimiter=',')

        plt.figure()
        plt.hist(df_joined['r_max_PH'], bins=20)
        plt.title(f'{df_joined["r_max_PH"].max()}')
        plt.savefig(f'../logs_completed/aa_plots/aatmp_len{config_run["prior"]["length"]["max_"]}_LR{config_run["controller"]["learning_rate"]}_ent{config_run["controller"]["entropy_weight"]}.png')

        # note: the same ranking could be repeated for expressions with the wrong
        # dimensions (df_joined[df_joined['dimensions'] != target_dim], with
        # correct_dim = False); currently disabled

    save_cols = ['name', 'rank', 'ranked_by', 'r_max_PH', 'r_max_CD', 'r_max_CBFS', 'r_sum',
                 'batch_r_max_expression', 'dimensions', 'training_case', 'skip_wall', 'ntokens',
                 'correct_dim', 'converted_expression']
    df_save = df_results[save_cols]
    df_save = df_save.drop_duplicates(subset=['batch_r_max_expression'])

    df_save.to_csv(os.path.join(logdir, 'results.csv'), index=False)