def score(df: pd.DataFrame) -> float:
    """Return the MDP Playground score, clipped to the range [0, 1].

    The results are passed through `mdpp_preprocess`, converted into an
    average-regret score by `plotting.ave_regret_score` (using the
    module-level `BASE_REGRET` and `NUM_EPISODES`), scaled, and clipped.

    Args:
        df: Experiment results to score.

    Returns:
        The scaled regret score, clipped to lie between 0 and 1.
    """
    processed = mdpp_preprocess(df_in=df)
    raw_regret_score = plotting.ave_regret_score(
        processed, baseline_regret=BASE_REGRET, episode=NUM_EPISODES)

    # Scaling factor applied before clipping; at 1.0 it leaves the value
    # unchanged.
    # NOTE(review): the earlier comment on this line described a factor of
    # 2.5, chosen heuristically so that Sonnet DQN scores approx. 0.75 and
    # better algorithms such as Rainbow can score close to 1. It also noted
    # that with a bigger NN this would give Sonnet DQN an unclipped score of
    # 1.1, that a2c_rnn reached 2.0 on that scale, and that DQN may
    # underperform because its epsilon is not annealed to 0. The code
    # multiplies by 1.0, so that note appears to be historical -- confirm.
    scaled_score = 1.0 * raw_regret_score

    return np.clip(scaled_score, 0, 1)
def score(df: pd.DataFrame) -> float:
    """Return a single cartpole score: 50% regret, 50% "has a good run".

    Args:
        df: Experiment results to score; preprocessed here with
            `cartpole_preprocess`.

    Returns:
        The mean of the average-regret score and the fraction of
        `bsuite_id` groups whose maximum `best_episode` exceeds
        `GOOD_EPISODE`.
    """
    processed = cartpole_preprocess(df_in=df)

    regret_component = plotting.ave_regret_score(
        processed, baseline_regret=BASE_REGRET, episode=NUM_EPISODES)

    # Half of the score is awarded per run whose "best" episode beats the
    # GOOD_EPISODE threshold.
    best_per_run = processed.groupby('bsuite_id')['best_episode'].max()
    solved_fraction = np.mean(best_per_run > GOOD_EPISODE)

    return 0.5 * (regret_component + solved_fraction)
def score(df: pd.DataFrame) -> float:
    """Return a single score: 50% regret, 50% "final accuracy".

    Args:
        df: Experiment results with `total_regret` and `episode` columns.
            The input frame is not modified.

    Returns:
        The mean of the average-regret score and an accuracy score computed
        over rows whose `episode` is greater than 0.9 * NUM_EPISODES.
    """
    regret_component = plotting.ave_regret_score(
        df, baseline_regret=BASE_REGRET, episode=sweep.NUM_EPISODES)

    # Row-to-row change in regret divided by row-to-row change in episode,
    # subtracted from 1 to give an average return per logged interval.
    regret_per_episode = df['total_regret'].diff() / df['episode'].diff()
    with_returns = df.assign(ave_return=1.0 - regret_per_episode)

    # Only the final stretch of training counts towards accuracy.
    is_late = with_returns['episode'] > 0.9 * NUM_EPISODES
    late_rows = with_returns[is_late]

    # Convert (+1, -1) average return --> (+1, 0) accuracy score.
    accuracy_component = np.mean(late_rows['ave_return'] + 1) * 0.5

    return 0.5 * (regret_component + accuracy_component)
def score(df: pd.DataFrame) -> float:
    """Return a single swingup score: 50% regret, 50% "does a swingup".

    The results are preprocessed with `cp_swingup_preprocess`, scored
    separately for each `height_threshold` group, and then averaged.

    Args:
        df: Experiment results to score.

    Returns:
        The mean of the per-threshold scores.
    """
    processed = cp_swingup_preprocess(df_in=df)

    def _threshold_score(group: pd.DataFrame) -> float:
        """Score one height-threshold group (half regret, half swingup)."""
        regret_component = plotting.ave_regret_score(
            group,
            baseline_regret=BASE_REGRET,
            episode=NUM_EPISODES,
            regret_column='perfection_regret',
        )
        # Fraction of runs whose best episode beats GOOD_EPISODE.
        best_per_run = group.groupby('bsuite_id')['best_episode'].max()
        swingup_component = np.mean(best_per_run > GOOD_EPISODE)
        return 0.5 * (regret_component + swingup_component)

    per_threshold = [
        _threshold_score(group)
        for _, group in processed.groupby('height_threshold')
    ]
    return np.mean(per_threshold)
def score(df: pd.DataFrame) -> float:
    """Return a single score for catch.

    Args:
        df: Experiment results to score.

    Returns:
        The value of `plotting.ave_regret_score` for `df`, evaluated with
        `BASE_REGRET` as the baseline at episode `sweep.NUM_EPISODES`.
    """
    catch_score = plotting.ave_regret_score(
        df,
        baseline_regret=BASE_REGRET,
        episode=sweep.NUM_EPISODES,
    )
    return catch_score
def score(df: pd.DataFrame) -> float:
    """Return a single score for mountain car.

    Args:
        df: Experiment results to score; preprocessed here with
            `mountain_car_preprocess`.

    Returns:
        The value of `plotting.ave_regret_score` for the preprocessed
        results, evaluated with `BASE_REGRET` as the baseline at episode
        `sweep.NUM_EPISODES`.
    """
    processed = mountain_car_preprocess(df_in=df)
    mountain_car_score = plotting.ave_regret_score(
        processed,
        baseline_regret=BASE_REGRET,
        episode=sweep.NUM_EPISODES,
    )
    return mountain_car_score