def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
    # carve a validation set out of the training split; the test split is
    # reserved for final evaluation and unused here
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    train_provider, validation_provider = train_provider.split_data_train_test(
        self.train_split_percentage)
    del test_provider

    train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
    validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

    model_params = self.optimize_agent_params(trial)
    model = self.Model(self.Policy,
                       train_env,
                       verbose=self.model_verbose,
                       nminibatches=1,
                       tensorboard_log=self.tensorboard_path,
                       **model_params)

    last_reward = -np.finfo(np.float16).max
    n_steps_per_eval = int(len(train_provider.data_frame) / n_prune_evals_per_trial)

    for eval_idx in range(n_prune_evals_per_trial):
        model.learn(n_steps_per_eval)

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        # prune trials whose agent never trades during training
        trades = train_env.get_attr('trades')
        if len(trades[0]) < 1:
            self.logger.info(f'Pruning trial for not making any trades: {eval_idx}')
            raise optuna.structs.TrialPruned()

        # evaluate the agent on the validation set
        state = None
        obs = validation_env.reset()
        while n_episodes < n_tests_per_eval:
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = validation_env.step([action])
            reward_sum += reward[0]

            if all(done):
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = validation_env.reset()

        # report the intermediate (negated) reward so the pruner can act on it
        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    # Optuna minimizes, so return the negated mean validation reward
    return -1 * last_reward
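# --- hedged usage sketch (not part of the original): driving the objective
# above with an Optuna study. `Optimizer` is a placeholder name for whatever
# class defines optimize_params; the MedianPruner reacts to the
# trial.report(...) calls inside the objective.
import optuna

optimizer = Optimizer()  # hypothetical constructor

study = optuna.create_study(pruner=optuna.pruners.MedianPruner())  # minimizes by default
study.optimize(optimizer.optimize_params, n_trials=50)

print(study.best_params)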
if __name__ == "__main__":
    cfg = parse()
    cfg_log = ConfigLog(cfg)

    # test & train
    if cfg.test:
        cfg_log.load(CFG_FILE)

        # load data
        df_train, df_test, df_rate = load_data(cfg)

        rl_returns = []
        naked_returns = []
        covered_returns = []
        delta_returns = []

        env = DummyVecEnv([lambda: HedgeEnv(df_test, df_rate, cfg)])
        T = env.get_attr('T')[0]

        # DDPG.load is a classmethod returning a new model, so use its return
        # value instead of calling load() on a freshly constructed instance
        model = DDPG.load(TEST_MODEL, env=env)
        delta = DeltaHedge()

        for i in range(cfg.test_times):
            # rl
            env.set_attr("b_rl", True)
            obs = env.reset()  # every reset creates a new transaction

            naked_returns.append(naked(env))
            covered_returns.append(covered(env))

            for t in range(T):  # t, not i: avoid shadowing the outer loop index
                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                # env.render()

            rl_returns.append(env.get_attr('final_reward')[0])
            env.env_method('restart')  # only trace back to the initial state
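        # --- hedged follow-up (not in the original): summarize the collected
        # per-strategy returns; assumes numpy is imported as np, and that
        # delta_returns would be filled analogously via the DeltaHedge baseline
        for name, returns in [('rl', rl_returns),
                              ('naked', naked_returns),
                              ('covered', covered_returns)]:
            arr = np.asarray(returns, dtype=float)
            print('%8s: mean=%.4f  std=%.4f' % (name, arr.mean(), arr.std()))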
env = DummyVecEnv([lambda: pong])

model = DQN(MlpPolicy,
            env,
            verbose=1,
            gamma=0.95,
            tensorboard_log="./MinipongLog/")
model.learn(total_timesteps=80000)

# save the model for future use
model.save('model/DQN_model_minipong')

# you can inspect the TensorBoard log page with this command in bash:
# tensorboard --logdir ./MinipongLog/ --host localhost

# display the training reward plot
cumulative_reward_per_episode = env.get_attr(
    'running_reward_list_per_episode')[0]
plt.title('Training reward per episode')
plt.xlabel('Number of episodes')
plt.ylabel('Cumulative reward sum')
plt.plot(cumulative_reward_per_episode)
plt.show()

# <--------------- testing --------------->
pong = Minipong(level=3, size=5)
testing_env = DummyVecEnv([lambda: pong])
model = DQN.load('model/DQN_model_minipong')

number_of_episodes = 100
for _ in range(number_of_episodes):
    done = False
    state = testing_env.reset()
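    # --- hedged completion (not in the original): the loop body is cut off
    # above; a minimal sketch of rolling out the greedy policy per episode
    episode_reward = 0.0
    while not done:
        action, _states = model.predict(state)
        state, reward, dones, info = testing_env.step(action)
        episode_reward += reward[0]
        done = dones[0]  # DummyVecEnv returns arrays even for a single env
    print('episode reward:', episode_reward)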
else:
    model = PPO2(MlpPolicy, env, verbose=0, learning_rate=learning_rate)

print(float(1e-5) == 0.00001)  # sanity check: both literals parse to the same float

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(env=env,
                                            check_freq=1000,
                                            log_dir=log_dir)

# Train the agent
try:
    model.learn(total_timesteps=int(time_steps), callback=callback)
    model.save(models_dir + model_name)
except KeyboardInterrupt:
    model.save(models_dir + model_name + "_abort")
finally:
    mean_episode_reward = env.get_attr('mean_episode_reward')
    print(mean_episode_reward)
    plt.plot(mean_episode_reward[0], 'r.-', label="Mean Episode Reward (100)")

    mean_episode_length = env.get_attr('mean_episode_length')
    print(mean_episode_length)
    plt.plot(mean_episode_length[0], 'g.-', label="Mean Episode Length (100)")

    plt.legend()
    plt.show()

    # re-plot the reward curve with stable-baselines' results_plotter
    t_steps = [i for i, k in enumerate(mean_episode_reward[0])]
    tp = [(np.array(t_steps), np.array(mean_episode_reward[0]))]
    print(tp)
    results_plotter.plot_curves(tp, 'timesteps', "TITLE")
    plt.show()

# env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/example.xlsx')])
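# --- hedged sketch (not in the original): SaveOnBestTrainingRewardCallback is
# referenced above but not defined in this snippet. A minimal version following
# the stable-baselines BaseCallback pattern, assuming the env was wrapped in a
# Monitor writing to log_dir; the env kwarg passed above is simply stored.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, env=None, check_freq=1000, log_dir='.', verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.env = env
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # read the Monitor log and average the last 100 episode rewards
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True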
# ====== IMPORT MODEL ======
# fixme - should be able to import a previous model.
modelToUse = selectFunctionAccordingToParams('model', params.get('model'))
policyToUse = selectFunctionAccordingToParams('policy', params.get('policy'))
agentsDir = join(td, 'agents')
model = modelToUse.load(join(agentsDir, 'agentFinal.pkl'), env=testEnv)

# ===== TEST MODEL ======
obs, done = testEnv.reset(), False
rewards = []
while not done:
    action, _states = model.predict(obs)
    worthHistory = testEnv.get_attr('net_worths')
    tradeHistory = testEnv.get_attr('trades')
    obs, reward, done, _ = testEnv.step(action)
    # testEnv.render(mode="human")
    rewards.append(reward)
print('Mean reward per step: {}'.format(sum(rewards) / len(rewards)))

# ===== PLOTS =====
# DummyVecEnv wraps a single environment, hence the [0] index
worthHistory = worthHistory[0]
tradeHistory = tradeHistory[0]
print('Size of worth history: ' + str(len(worthHistory)))
print('Size of trade history: ' + str(len(tradeHistory)))
bitcoinPrice = test_df['Close'].values[params.get('forecast_len'):]
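# --- hedged plotting sketch (not in the original): overlay the agent's net
# worth on the test-set close price; assumes matplotlib is imported as plt and
# that both series are aligned from the forecast offset onwards
fig, ax1 = plt.subplots()
ax1.plot(bitcoinPrice, color='gray', label='BTC close price')
ax1.set_xlabel('step')
ax1.set_ylabel('price')
ax2 = ax1.twinx()
ax2.plot(worthHistory, color='tab:blue', label='net worth')
ax2.set_ylabel('net worth')
fig.legend()
plt.show()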
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list],
                 List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using `battler` as the
    battle agent.

    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: number of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: number of matches executed at the same time
    :return: a tuple containing (i) a tuple with the win rates of the first
        and second players; (ii) a tuple with the average mana curves of the
        first and second players; (iii) a tuple with the `30 * games`
        individual draft choices of the first and second players; (iv) a list
        of 3-tuples with the card alternatives presented to the players at
        each draft turn of the `games` episodes; (v) a tuple with the `games`
        decks built by the first and second players; and (vi) the list of
        individual episode rewards.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler()))
           for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id])

                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increment the mana-curve bucket for the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id,
                                                 observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])
                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves (both players pick the same number of cards per
    # episode, so a single total suffices)
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards, dropping the unfinished trailing episode
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks, discarding the empty trailing ones
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the mean episode reward (in [-1, 1], from the first player's
    # perspective) to the first player's win rate
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
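# --- hedged usage sketch (not in the original): the drafter/battler names
# below are illustrative placeholders; valid identifiers depend on what
# agents.parse_draft_agent / agents.parse_battle_agent accept in this project.
if __name__ == '__main__':
    win_rates, mana_curves, choices, alternatives, decks, rewards = run_matchup(
        drafter1='random',        # or a path to a trained model, e.g. 'drafter.zip'
        drafter2='max-attack',    # hypothetical built-in drafter name
        battler='max-attack',     # hypothetical built-in battler name
        games=100,
        seed=42,
        concurrency=4)

    print('first player win rate: %.1f%%' % win_rates[0])
    print('second player win rate: %.1f%%' % win_rates[1])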