def LunarLander_v2_DQN():  # TODO: raises an error
    # Create environment
    env = gym.make('LunarLander-v2')

    # Instantiate the agent
    model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Train the agent
    model.learn(total_timesteps=100000)
    # Save the agent
    model.save("dqn_lunar")
    del model  # delete trained model to demonstrate loading

    # Load the trained agent (pass env so model.get_env() is available for evaluation below)
    model = DQN.load("dqn_lunar", env=env)

    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print(mean_reward, std_reward)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
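The snippet above omits its import header; a likely set for stable-baselines v2 (an assumption, since the original does not show it):

import gym
from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy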
Example #2
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)

    print(env.action_space)
    time.sleep(3)

    env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    time.sleep(3)

    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
Example #3
def run_model(algorithm, training_timesteps, testing_timesteps,
              training_iterations, testing_iterations, learning_rate,
              batch_size):

    # DataFrame to accumulate mean/std rewards across testing iterations
    df = pd.DataFrame(columns=['Mean Rewards', 'Standard deviation'])

    model = DQN(CustomPolicy,
                env,
                learning_rate=learning_rate,
                batch_size=batch_size)

    for k in range(training_iterations):
        model.learn(total_timesteps=int(training_timesteps))
        model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
        subprocess.Popen(path_for_kill_file, shell=True)

    for j in range(testing_iterations):
        # Load the trained agent

        model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm,
                                              hostname))
        # Reset the environment
        obs = env.reset()
        # Create an empty list to store reward values
        final_rewards = []
        for _ in range(testing_timesteps):
            # predict the values
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                final_rewards.append(rewards)
        # Print the mean reward
        print(np.mean(final_rewards))
        # Print the standard deviation of reward
        print(np.std(final_rewards))
        # Append the mean and standard deviation to the DataFrame
        df = df.append(
            {
                'Mean Rewards': np.mean(final_rewards),
                'Standard deviation': np.std(final_rewards)
            },
            ignore_index=True)

        df.to_csv("{}_{}_{}".format(1, algorithm, "MeanAndStdReward.csv"),
                  sep=',',
                  index=True)

        subprocess.Popen(path_for_kill_file, shell=True)
    subprocess.Popen(path_for_kill_file, shell=True)
Example #4
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy",
                     inttype=retro.data.Integrations.ALL,
                     use_restricted_actions=retro.Actions.DISCRETE)
    print(env)

    # print(env.action_space)
    # time.sleep(3)

    # env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    # time.sleep(3)

    model = DQN(MlpPolicy, env, verbose=1)

    print("STARTING Training!!!")
    start_time = time.time()
    model.learn(total_timesteps=50000)
    print("TRAINING COMPLETE! Time elapsed: ", str(time.time() - start_time))

    print("Attempting to get first pokemon!")
    start_time = time.time()
    printed_done = False
    sampled_info = False

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

        if not sampled_info:
            print("Here's the info that the AI uses:\n")
            print("obs:\n", obs, "\n</obs>\n")
            print("rewards:\n", rewards, "\n</rewards>\n")
            print("dones:\n", dones, "\n</dones>\n")
            print("Info:\n", info, "\n</info>\n")
            sampled_info = True

        if dones and not printed_done:
            print("Success! time elapsed: ", str(time.time() - start_time))
            printed_done = True

    env.close()
Example #5
def optimize_agent(trial):
    env = wds(wds_name=hparams['env']['waterNet'] + '_master',
              speed_increment=hparams['env']['speedIncrement'],
              episode_len=hparams['env']['episodeLen'],
              pump_groups=hparams['env']['pumpGroups'],
              total_demand_lo=hparams['env']['totalDemandLo'],
              total_demand_hi=hparams['env']['totalDemandHi'],
              reset_orig_pump_speeds=hparams['env']['resetOrigPumpSpeeds'],
              reset_orig_demands=hparams['env']['resetOrigDemands'])

    model_params = optimize_dqn(trial)
    dict_layers = optimize_arch(trial)
    model = DQN(policy=CustomPolicy,
                policy_kwargs=dict_layers,
                env=env,
                verbose=0,
                train_freq=1,
                learning_starts=10000,
                buffer_size=350000,
                exploration_fraction=.95,
                exploration_final_eps=.0,
                param_noise=False,
                prioritized_replay=False,
                tensorboard_log=None,
                n_cpu_tf_sess=1,
                **model_params)
    model.learn(total_timesteps=1000000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    env.randomize_demands()
    obs = env.reset(training=False)
    while n_episodes < 50:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action, training=False)

        if done:
            rewards.append(reward)
            n_episodes += 1
            env.randomize_demands()
            obs = env.reset(training=False)

    mean_reward = np.mean(rewards)
    trial.report(-1 * mean_reward)
    del env, model
    gc.collect()
    return -1 * mean_reward
Example #6
def test_baselineEnv():
    try:
        import gym
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN

        env = gym.make('CartPole-v1')

        model = DQN(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=20)
        action, _ = model.predict(env.reset())
        env.step(action)
        return True
    except Exception as er:
        assert False, er
Example #7
def test_action_mask_run_dqn(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = DQN(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
Example #8
class DqnController:
    """
    Implements an RL (DQN) controller
    """
    def __init__(self, env):
        """
        :param: env: a thermostat environment
        """
        self.env = env
        self.model = DQN(MlpPolicy,
                         env,
                         verbose=1,
                         tensorboard_log="./dqn_thermostat_tensorboard/")

    @staticmethod
    def name():
        return "Dqn"

    def train(self):
        self.model.learn(total_timesteps=50000)

    def save(self):
        self.model.save("dqn.pk")

    def load(self):
        self.model = None
        self.model = DQN.load("dqn.pk")

    def simulate(self):
        state = self.env.reset()
        cumulative_reward = 0.0
        P_consumed = []
        done = False
        while not done:
            action, _state = self.model.predict(state)
            state, reward, done, info = self.env.step(action)
            cumulative_reward += reward
            P_consumed.append(action)
        print("MSE Setpoint- realized: %.3f - Energy consumed: %.2f" %
              (cumulative_reward, sum(P_consumed)))
        result_folder = ("results/" + self.name() + "/"
                         + self.env.start_date.strftime("%m-%d-%Y")
                         + "_to_" + self.env.end_date.strftime("%m-%d-%Y"))
        self.env.store_and_plot(result_folder)

    def set_env(self, env):
        self.env = env
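A minimal sketch of driving this controller; make_thermostat_env() below is a hypothetical stand-in for whatever builds the thermostat environment the docstring refers to:

# Minimal driver sketch; make_thermostat_env() is a hypothetical factory
# for a thermostat environment compatible with the class above.
env = make_thermostat_env()
controller = DqnController(env)
controller.train()
controller.save()      # writes "dqn.pk"
controller.load()      # reloads "dqn.pk"
controller.simulate()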
Example #9
def AirRaid_main():
    env = retro.make('AirRaid-Atari2600', use_restricted_actions=retro.Actions.DISCRETE)
    model = DQN(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("AirRaid_Model")

    del model

    model = DQN.load("AirRaid_Model")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rew, done, info = env.step(action)
        #env.render()
        if done:
            obs = env.reset()

    env.close()
Example #10
def run_model(algorithm, training_timesteps, testing_timesteps, training_iterations, testing_iterations, learning_rate, batch_size):
	columns = ['Mean Rewards', 'Standard deviation'] 
	df = pd.DataFrame(columns=columns)
	if (algorithm == "PPO2"):
	    from stable_baselines.common.policies import MlpPolicy
	    model = PPO2(MlpPolicy, env, verbose=1, learning_rate=learning_rate, tensorboard_log = "./{}_rcrs_tensorboard/".format(hostname), n_steps = batch_size)
	else:
	    from stable_baselines.deepq.policies import MlpPolicy
	    model = DQN(MlpPolicy, env, verbose=1, learning_rate=learning_rate, tensorboard_log = "./{}_rcrs_tensorboard/".format(hostname),  batch_size = batch_size)
	for k in range(training_iterations):
		# Train the agent
		model.learn(total_timesteps=int(training_timesteps))
		# Saving the model 
		model.save("{}_{}_{}_{}".format("rcrs_wgts", k, algorithm, hostname))
		subprocess.Popen(path_for_kill_file, shell=True)

	for j in range(testing_iterations):
	    # Load the trained agent
	    if (algorithm == "PPO2"):
	    	model = PPO2.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
	    else:
	    	model = DQN.load("{}_{}_{}_{}".format("rcrs_wgts", j, algorithm, hostname))
	    # Reset the environment
	    obs = env.reset()
	    # Create an empty list to store reward values 
	    final_rewards = []
	    for _ in range(testing_timesteps):
	        # predict the values
	        action, _states = model.predict(obs)
	        obs, rewards, dones, info = env.step(action)
	        if dones == True:
	            final_rewards.append(rewards)
	    # Print the mean reward
	    print(np.mean(final_rewards))
	    # Print the standard deviation of reward
	    print(np.std(final_rewards))
	    # Create a DataFrame to save the mean and standard deviation
	    df = df.append({'Mean Rewards': np.mean(final_rewards), 'Standard deviation': np.std(final_rewards)}, ignore_index=True)
	    df.to_csv("{}_{}_{}".format(algorithm, hostname, "MeanAndStdReward.csv"), sep=',', index=True)
	    
	    subprocess.Popen(path_for_kill_file, shell=True)
	subprocess.Popen(path_for_kill_file, shell=True)
Example #11
def run():
    # hyperparameters
    gamma = 0.99  #discount factor
    learning_rate = 0.00025  #learning rate for adam optimizer
    buffer_size = 50000  #size of the replay buffer
    exploration_fraction = 0.1  #fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps = 0.02  #final value of random action probability
    exploration_initial_eps = 1.0  #initial value of random action probability
    train_freq = 1  #update the model every train_freq steps
    batch_size = 32  #size of a batched sampled from replay buffer for training
    double_q = True  #whether to enable Double-Q learning or not.
    learning_starts = 100  #how many steps of the model to collect transitions for before learning starts
    timesteps = 1000  #2000
    verbose = 1

    env = gym.make('Boxoban-Train-v1')

    model = DQN(MlpPolicy,
                env,
                gamma=gamma,
                learning_rate=learning_rate,
                buffer_size=buffer_size,
                exploration_fraction=exploration_fraction,
                exploration_final_eps=exploration_final_eps,
                exploration_initial_eps=exploration_initial_eps,
                train_freq=train_freq,
                batch_size=batch_size,
                double_q=double_q,
                learning_starts=learning_starts,
                verbose=verbose)
    model.learn(total_timesteps=timesteps)
    model.save("trained_models/dqn_sokoban_model")

    # Enjoy trained agent
    obs = env.reset()
    print(model.action_probability(obs))
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
Example #12
def recieve(sid, data):

    global done
    global reward
    global maxactions
    jsonInput = json.loads(data)
    maxactions = jsonInput['maxactions']
    trainepisodes = jsonInput['trainepisodes']
    evalepisodes = jsonInput['evalepisodes']
    totalepisodes = trainepisodes + evalepisodes

    env = UnrealEnvWrap()
    # wrap it
    env = make_vec_env(lambda: env, n_envs=1)

    # Train the agent with different algorithms from stable baselines

    #model = DQN(MlpPolicy, env, verbose=1, tensorboard_log="./DQN_newobservations/")
    model = DQN(MlpPolicy, env, verbose=1)
    #model = A2C(MlpPolicy, env, verbose=1, tensorboard_log="./A2C_newobservations/")
    #model = A2C(MlpPolicy, env, verbose=1)
    print("Agent training in process...")
    model.learn(total_timesteps=trainepisodes)

    # Test the trained agent, (currently not needed, all testing occurs in Unreal itself)
    env.render(mode='console')
    #env.render()

    obs = env.reset()
    print("Training complete, Starting Evaluation of Trained Model:")
    intaction = 0
    #Begin strategic behavior
    for step in range(evalepisodes):
        action, _ = model.predict(obs, deterministic=True)
        intaction = action[0]
        print("Action: ", intaction)
        obs, reward, done, info = env.step(action)
        print('obs=', obs, 'reward=', reward, 'done=', done)

    sio.disconnect(sid)
Example #13
def main():
    # create the environment
    env = gym.make("gym_balanceBot-v0")

    if os.path.isfile("trained_model/dqn_balanceBot.zip") == False:
        # Instantiate the agent
        model = DQN('MlpPolicy',
                    env,
                    learning_rate=1e-3,
                    prioritized_replay=True,
                    verbose=1)

        # Train the agent
        model.learn(total_timesteps=int(2e5))
        # Save the agent
        model.save("trained_model/dqn_balanceBot")
        del model  # delete trained model to demonstrate loading

        # Load the trained agent (pass env so model.get_env() works for evaluation)
        model = DQN.load("trained_model/dqn_balanceBot", env=env)

        # Evaluate the agent
        mean_reward, std_reward = evaluate_policy(model,
                                                  model.get_env(),
                                                  n_eval_episodes=10)

    else:
        # Load the trained agent
        model = DQN.load("trained_model/dqn_balanceBot")

    # Enjoy trained agent
    obs = env.reset()
    for i in range(3000):
        action, states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        sleep(1. / 240.)

    env.close()
Example #14
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_dqn(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = DQN("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING dqn")
    original_env.force_progression = False
    model.learn(int(2e4), seed=seed)
    print("DONE LEARNING dqn")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
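A minimal sketch of wiring this objective into an Optuna study; the study direction and trial count are assumptions, since the snippet does not show how the study is created (the trial.report call above also follows an older Optuna API that allowed omitting the step argument):

import optuna

study = optuna.create_study(direction='maximize')  # assumed direction: maximize mean reward
study.optimize(optimize_agent, n_trials=20)
print('Best hyperparameters:', study.best_params)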
Example #15
         symptomatic_infection = array[j][5] / total_symptomatic_infection
         asymptomatic_infection = array[j][4] / total_asymptomatic_infection
         recovered = array[j][6] / (total_recovered + 0.01)
         pathogen = array[j][7] / total_pathogen
         env = DistributionEnv(1, 1, 1000000, susceptible, exposed,
                               symptomatic_infection,
                               asymptomatic_infection, recovered, pathogen,
                               200)
         nn_model = DQN('MlpPolicy',
                        env,
                        learning_rate=1e-3,
                        prioritized_replay=True,
                        verbose=1)
         nn_model.learn(total_timesteps=int(1e4), log_interval=10000)
         observation = env.reset()
         action, states = nn_model.predict(observation)
         observation, done, reward = env.step(action)
         day_actions.append(action)
         day_rewards.append(reward)
     actions.append(day_actions)
     rewards.append(day_rewards)
 distributions = []
 for action in actions:
     s = sum(action)
     l = []
     for a in action:
         l.append(a / s)
     distributions.append(l)
 print(actions)
 print(distributions)
 print(rewards)
Example #16
class DQNAgentBaseline(BaseAgent):
    def __init__(self, state_size, action_size, agent_settings, is_agent_to_load, env, signal_done, signal_episode,
                 statistic: StatisticsBaseline, game_settings, game_type, agent_to_load_directory, game_name):
        """

        :param state_size:
        :param action_size:
        :param agent_settings:
        :param is_agent_to_load:
        :param env:
        :param signal_done:
        :param signal_episode:
        :param statistic:
        :param game_settings:
        :param game_type: can be box or atari
        """
        super().__init__(state_size, action_size, agent_settings, is_agent_to_load, game_name)
        self.env = env

        self.is_baseline = True
        self.signal_done = signal_done
        self.signal_episde = signal_episode
        self.statistic = statistic
        self.game_settings = game_settings



        self.gamma = agent_settings.gamma
        self.learning_rate = agent_settings.learning_rate
        self.epsilon_decay = agent_settings.exploration_decay
        self.epsilon_min = agent_settings.mnimal_exploration
        self.batch_size = agent_settings.mini_batch
        self.replay_size=agent_settings.replay_size


        self.last_episode_emited=0
        self.game_type = game_type
        self.start_time = time.time()
        self.last_save_time = time.time()
        if is_agent_to_load:
            self.load_model(agent_to_load_directory)
        else:
            self.build_model()
    def build_model(self):

        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = DQN(MlpPolicy, self.env, verbose=0, gamma=self.gamma,
                             exploration_fraction=self.epsilon_decay, exploration_final_eps=self.epsilon_min,
                             learning_rate=self.learning_rate, buffer_size=self.replay_size, batch_size=self.batch_size)
        if self.game_type == "atari":
            self.model = DQN(CnnPolicy, self.env, verbose=1, gamma=self.gamma,
                             exploration_fraction=self.epsilon_decay, exploration_final_eps=self.epsilon_min,
                             learning_rate=self.learning_rate, buffer_size=self.replay_size, batch_size=self.batch_size)

    def update_target_model(self):
        super().update_target_model()

    def get_action(self, state):
        action, _states = self.model.predict(state)
        return action

    def append_sample(self, state, action, reward, next_state, done):
        super().append_sample(state, action, reward, next_state, done)

    def save_model(self,file_name="./models/agentDQN"):
        self.model.save(file_name)
        out=open(file_name+".txt","w")
        out.write(self.game_name)
        out.close()

    def load_model(self,agent_to_load_directory):
        if  agent_to_load_directory=="":
            self.model=DQN.load("./models/agentDQN.pkl",env=self.env)
        else:
            self.model=DQN.load(agent_to_load_directory,env=self.env)

    def train_model(self):

        self.model.learn(total_timesteps=self.game_settings.max_steps_number,callback=self.callback)


    def callback(self,_locals, _globals):
        self.statistic.append_score(_locals['episode_rewards'],_locals['episode_rewards'].__len__())

        if _locals['episode_rewards'].__len__()!=self.last_episode_emited and _locals['episode_rewards'].__len__()>1:
            self.signal_episde.emit(_locals['episode_rewards'].__len__()-1,self.statistic.get_current_mean_score(),_locals['episode_rewards'][-2],_locals['_'])
            self.last_episode_emited=_locals['episode_rewards'].__len__()

        if self.statistic.get_current_mean_score()>=self.game_settings.target_accuracy or _locals['_']+1>=self.game_settings.max_steps_number:
            self.signal_done.emit(_locals['episode_rewards'].__len__(), self.statistic.get_current_mean_score())
            self.done=True
            output=open("./models/trenningResults.txt","w")
            output.write("training time:" + str((time.time() - self.start_time) / 3600) + "h \n")
            output.write("number of episodes:" + str(_locals['episode_rewards'].__len__()) + "\n")
            output.write("number of steps:" + str(_locals['_']) + "\n")
            output.close()

            return False
        if time.time()-self.last_save_time>60*10:
            output = open("./models/trenningResults.txt", "w")
            output.write("training time:" + str((time.time() - self.start_time) / 3600) + "h \n")
            output.write("number of episodes:" + str(_locals['episode_rewards'].__len__()) + "\n")
            output.write("number of steps:" + str(_locals['_']) + "\n")
            output.close()

            self.last_save_time=time.time()

            self.save_model("./models/agentDQNtemp")
        return True
Example #17
	model = DQN.load('../model/DQN_without_prioritized', env=env)
	result = {}

	mean_reward = []
	scores = []

	episodes = 1000
	with open("../result/DQN_without_prioritized.txt", "w") as txtfile:
	    for episode in range(1, episodes+1):
	        print(f"episode: {episode}")
	        state = env.reset()
	        done = False
	        temp_result = {}
	        score = 0 
	        while done!= True:
	        	action, _states = model.predict(state)
	        	n_state, reward, done, info = env.step(action)
	        	score+=reward
	        mean_reward.append(score)
	        scores.append(info[0]['score'])

	        temp = str(episode) + "," + str(score[0]) + "," + str(info[0]['score']) + "\n"
	        txtfile.write(temp)

	        mean = sum(mean_reward)/len(mean_reward)
	        mean_score = sum(scores)/len(scores)

	        print(f"The mean reward is {mean}")
	        print(f"The mean score reward is {mean_score}")
	        print(f"The max score is {max(scores)}")
Example #18
def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 25,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode':
        'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }
    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }
    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    #check_env(env)
    obs = env.reset()
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./dqn_checkpoint/',
                                             name_prefix='dqn_check')

    #model = DQN.load("./dqn_checkpoint/dqn_check_200_steps.zip", env=env, tensorboard_log="./dqn")
    model = DQN('LnMlpPolicy',
                env,
                learning_rate=1e-3,
                prioritized_replay=True,
                verbose=1,
                tensorboard_log="./dqn")
    model.learn(total_timesteps=35000,
                tb_log_name="35k-with_checkpoint",
                callback=checkpoint_callback)

    model.save("deepq_carla")

    del model  # remove to demonstrate saving and loading

    model = DQN.load("deepq_carla")

    obs = env.reset()

    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break
Example #19
    model.learn(total_timesteps=1000000)
    model.save("./models/dqn_snake_multi_player")
else:
    model = DQN.load("./models/dqn_snake_multi_player")

print("finished training, now use the trained model and render the env")
n_episodes = 1

turn = 0
done_running = n_snakes
while done_running > 0:
    env.render()

    if turn == 0:
        turn = 1
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if done:
            done_running -= 1
        env.render()

    elif turn == 1:
        turn = 0
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if done:
            done_running -= 1
        env.render()

env.close()
Example #20
                gamma=0,
                exploration_fraction=0.6,
                exploration_final_eps=0,
                learning_rate=5e-4)
    #model = DQN.load("VSL_iter9600ver2.zip", env=env)
    start = time.time()
    model.learn(total_timesteps=time_steps, callback=callback)
    end = time.time()
    model.save(env_id + "_iter" + str(time_steps) + "_lane" + str(num_lanes))
    print("Training time: ", end - start)

    #Results Plot
    #results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "Speed Limit Manager")
    #plt.show()

    #Additional Logs
    for k in range(3):
        array = np.zeros(shape=(10, 10))
        for i in range(10):
            for j in range(10):
                obs = [i, j, k + 1]
                array[i][j] = model.predict(obs, deterministic=True)[0]
        ax = sns.heatmap(array, linewidth=0.5)
        plt.show()

    #Run Simulation after training
    #obs = env.reset()
    #for _ in range(1000):
    #    action, _states = model.predict(obs)
    #    obs, rewards, dones, info = env.step(action)
    #    env.render()
Example #21
actions = []

# noisy conditions
S_n = [noisyObs[0]]
I_n = [noisyObs[1]]
R_n = [noisyObs[2]]

# max steps(days) for test
max_steps = 100

n_steps = 0 # for tracking number of steps taken
for step in range(max_steps):
  # increment
  n_steps += 1
  noisy_obs = env.noisy_state
  action, _ = model.predict(noisy_obs, deterministic=True)
  obs, reward, done, info = env.step(action)

  # save data to be plotted
  S.append(obs[0])
  I.append(obs[1])
  R.append(obs[2])
  actions.append(action)

  # print update
  print("Step {}".format(step + 1))
  print("Action: ", action)
  print('obs=', obs, 'reward=', reward, 'done=', done)

  if done:
    print("Done.", "reward=", reward)
Example #22
import gym

from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('ModuleSelect-v1')

model = DQN(
    env=env,
    policy=MlpPolicy,
    verbose=1,
)

print("> start train test")
model.learn(total_timesteps=1000)

env.close()
print("save the model")
model.save("test_dqn_model.pkl")

del model
model = DQN.load("test_dqn_model.pkl")

obs = env.reset()
print("> start load test")
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.reset()
env.close()
Example #23
totalSteps = 0
totalRewSum = 0.0

bestSolution = -1
solvedDifferences = 0

for i_episode in range(1000):
    obs = env.reset()
    t = 0

    rewSum = 0
    useless = 0
    sameStep = 0
    badGood = 0
    while True:
        act, _states = model.predict(obs)
        obs, reward, done, info = env.step(act)
        print(obs)

        env.render()

        if info is not None:
            print()
            print("=== DEBUG INFO ===")
            print("Step: {0}/{1}".format(info[0]["current_step"],
                                         info[0]["max_step"]))
            print("Reward: ", info[0]["reward"])

            totalRewSum += float(info[0]["reward"])

            print("==== ACT INFO ====")
Example #24
class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args,
                                              **kwargs,
                                              layers=[16, 16],
                                              layer_norm=False,
                                              feature_extraction="mlp")


model = DQN(CustomDQNPolicy, env, verbose=1)

#model.learn(total_timesteps=25000)

#generate_expert_traj(model, "I:\Code\BachelorThesis\cartpole\data\expert_cartpole", n_episodes=10)

#test it
reward_sum = 0.0
obs = env.reset()
for i in range(0, 10):
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
    print(reward_sum)
    reward_sum = 0.0
    obs = env.reset()

env.close()
Example #25
def train_once(graph: nx.Graph, clusters: list, pos: dict,
               env_name: str = 'Controller-Select-v0',
               compute_optimal: bool = True, trained_model: DQN = None,
               steps: int = 2e5, logdir: str = 'train_log_compare',
               env_kwargs: dict = {}) -> (DQN, float, float):
	"""
	Main training loop. Initializes RL environment, performs training, and outputs results
	Args:
		graph (nx.Graph): NetworkX graph to train on
		clusters (list): List of lists of nodes in each cluster
		pos (dict): Graph rendering positions
		env_name (str): Name of Gym environment
		compute_optimal (bool): Whether to compute optimal set of controllers by brute-force
		trained_model (DQN): Provide starting model to train on
	Return:
		Trained model
	"""
	# Selecting controllers one-at-a-time environment
	env = gym.make(env_name, graph=graph, clusters=clusters, pos=pos, **env_kwargs)
	heuristic_controllers, heuristic_distance = env.compute_greedy_heuristic()
	print("WMSCP Greedy Heuristic: {}, {}".format(heuristic_controllers, heuristic_distance))
	#for i in range(1000):
	#	env.reset()
	#	print(env.graph.size(weight='weight'))
	orig_graph = env.original_graph
	optimal_controllers = None
	if compute_optimal:
		print("Computing optimal!")
		optimal_controllers = env.calculateOptimal()


	# Generate custom replay buffer full of valid experiences to speed up exploration of training
	def add_wrapper(replay_buffer):
		# Replay buffer maxsize is by default 50000. Should this be lowered?
		# valid_controllers_set = [env._random_valid_controllers() for i in range(int(replay_buffer._maxsize * 0.5 / len(clusters)))]
		# Uses the heuristic controller set as the initial 'random' controllers
		valid_controllers_set = env.graphCentroidAction()
	
		for valid_controllers in valid_controllers_set:
			obs_current = env.reset()  # Really strange issue - obs_current follows the change in env.state, making it equal to obs!
			for controller in valid_controllers:
				(obs, rew, done, _) = env.step(controller)
				replay_buffer.add(obs_current, controller, rew, obs, done)  # For some reason, obs is a pointer which ends up being the very last obs before reset, so need to copy
				obs_current = obs.copy()
		return replay_buffer

	# Agent
	model = None
	if trained_model is None:
		print("Creating new training model!")
		model = DQN(LnMlpPolicy, env, tensorboard_log=logdir, verbose=0, full_tensorboard_log=True, exploration_initial_eps=0.5, exploration_fraction=0.2, learning_starts=0, target_network_update_freq=100, batch_size=32, learning_rate=0.00025)
	else:
		print("Using provided training model!")
		model = trained_model
		model.set_env(env)
		model.tensorboard_log = logdir

	# Train the agent
	print("Training!")
	model.learn(total_timesteps=int(steps))#, callback=callback)#, replay_wrapper=add_wrapper)

	# Run a single run to evaluate the DQN
	obs = env.reset()
	reward = 0 #We want the last reward to be minimal (perhaps instead do cumulative?)
	reward_final = 0
	done = False
	action = None
	final_rl_actions = []
	while not done:
		action, states = model.predict(obs)
		(obs, rew, done, _) = env.step(action)
		final_rl_actions.append(action)
		reward += rew
		reward_final = rew

	# Show controllers chosen by the model
	env.render(mode='graph_end.png')
	print(env.controllers, reward_final)
	print("BEST EVER:")
	print(env.best_controllers, env.best_reward)
	best_reward = env.optimal_neighbors(graph, env.best_controllers)
	print(best_reward)

	average_graph = env.average_graph.copy()
	rl_controllers = env.controllers
	rl_best_controllers = env.best_controllers
	if env_name == 'Controller-Cluster-v0':
		rl_controllers.sort()
		rl_best_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			rl_controllers[i] -= i * cluster_len
			rl_best_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	nx.write_gpickle(average_graph, 'average_graph.gpickle')
	env.graph = average_graph.copy()
	for cont in rl_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Controllers on average change graph {} - {}".format(env.controllers, reward_final))
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in rl_best_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Best Controllers on average change graph {} - {}".format(env.best_controllers, reward_final))
	# Show controllers chosen using heuristic
	centroid_controllers, heuristic_distance = env.graphCentroidAction()
	#centroid_controllers, heuristic_distance = env.compute_greedy_heuristic()
	# Convert heuristic controllers to actual
	if env_name == 'Controller-Cluster-v0' or env_name == 'Controller-Cluster-Options-v0':
		# Assume all clusters same length
		centroid_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			centroid_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in centroid_controllers:
		(_, reward_final, _, _) = env.step(cont)
	env.render(mode='graph_heuristic.png')
	best_heuristic = reward_final
	print("Heuristic on average change graph {} - {}".format(env.controllers, reward_final))
	#print("Heuristic optimal {} - {}".format(*env.optimal_neighbors(graph,  env.controllers)))
	heuristic_controllers = env.controllers

	rl_rewards = []
	heuristic_rewards = []
	rl_best_rewards = []
	NUM_GRAPHS = 100
	for i in range(NUM_GRAPHS):
		rl_reward = None
		heuristic_reward = None
		rl_best_reward = None
		env.reset()
		nx.write_gpickle(env.graph, '100Graphs/graph_{}.gpickle'.format(i))
		for cont in final_rl_actions:
			(_, rl_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in centroid_controllers:
			(_, heuristic_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in rl_best_controllers:
			(_, rl_best_reward, _, _) = env.step(cont)
		print("RL REWARD, RL BEST REWARD, HEURISTIC: {}\t{}\t{}".format(rl_reward, rl_best_reward, heuristic_reward))
		rl_rewards.append(rl_reward)
		heuristic_rewards.append(heuristic_reward)
		rl_best_rewards.append(rl_best_reward)

	def create_hist(fig, data, title=None, color=None):
		bins = np.arange(min(data) - 100, max(data) + 100, 100)
		plt.xlim([min(data) - 100, max(data) + 100])
		fig.hist(data, bins=bins, alpha=0.5, color=color)
		if title:
			fig.title(title)
		plt.xlabel('Controller Distances')
		plt.ylabel('Count')
	fig = plt.figure()
	ax1 = fig.add_subplot(2, 1, 1)
	create_hist(ax1, rl_rewards, color='blue')
	create_hist(ax1, heuristic_rewards, color='red')
	create_hist(ax1, rl_best_rewards, color='green')
	ax2 = fig.add_subplot(2, 1, 2)
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_rewards, c='blue')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), heuristic_rewards, c='red')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_best_rewards, c='green')
	plt.show()
	# Show optimal
	if optimal_controllers is not None:
		env.reset()
		for cont in optimal_controllers[0]:
			(_, reward_final, _, _) = env.step(cont)
		env.render(mode='graph_optimal.png')
		print(env.controllers, reward_final)
		print(optimal_controllers)
	return model, best_reward, best_heuristic
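A rough usage sketch for train_once, assuming a NetworkX graph, a clustering of its nodes, and a layout dict as the docstring describes; the graph construction below is illustrative only, since the Controller-Select environments may expect edge weights and attributes not shown here:

import networkx as nx

graph = nx.connected_watts_strogatz_graph(40, 4, 0.3)          # illustrative graph
clusters = [list(range(i, i + 10)) for i in range(0, 40, 10)]  # four equal clusters
pos = nx.spring_layout(graph)                                  # rendering positions
model, best_reward, best_heuristic = train_once(graph, clusters, pos,
                                                compute_optimal=False, steps=2e5)
Example #26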
tensorboard_folder = './tensorboard/Bomberman/base/'
model_folder = './models/Bomberman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = 'Cnn'
model_tag = 'Cnn'
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv()])
env = VecFrameStack(env, 2)

model = DQN(CustomCnnPolicy, env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='DQN' + model_tag)

model.save(model_folder + "DQN" + model_tag)
del model
model = DQN.load(model_folder + "DQN" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #27
import gym

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('Atlantis-ram-v4')

model = DQN(MlpPolicy, env, verbose=3)
model.learn(total_timesteps=1000000, log_interval=1)

observation = env.reset()
total_reward = 0
for i in range(18000):
    action, _states = model.predict(observation)
    observation, reward, done, info = env.step(action)
    env.render()
    total_reward += reward
    if done:
        break

print(total_reward)
Example #28
def launchAgent(env_name: int,
                model_name: str,
                test_mode=False,
                filepath=None):
    """
    :param test_mode: whether to load the agent in test mode and only drive it; in that case no training is performed, the agent only drives.
    :param env_name: which environment to load.
        1 : an environment that does not use the minimap image and computes distances between points.
        2 : uses the minimap image, with an updated reward.
        other values (default) : the environment currently in use; uses the minimap image with a further updated reward.
    :param model_name: which model to use.
        DQN : loads a DQN model.
        HER : loads a HER model.
        other values (default) : loads a PPO2 model.
    :return: the model that ran the last episode.
    """

    from stable_baselines import DQN, HER, PPO2

    if env_name == 1:
        from Reinforcement_AI.env.a_env import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 2:
        from Reinforcement_AI.env.d_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv1
        kart_env = DetailedMiniMapEnv1()
        policy = "CnnPolicy"
    elif env_name == 3:
        from Reinforcement_AI.env.a_env2 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 4:
        from Reinforcement_AI.env.a_env3 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    else:  #env_name == "detailed_minimap_enhanced" or env_name == "4":
        from Reinforcement_AI.env.e_enhanced_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv2
        kart_env = DetailedMiniMapEnv2()
        policy = "CnnPolicy"

    if model_name == "DQN":
        model = DQN(policy=policy,
                    env=kart_env,
                    double_q=True,
                    prioritized_replay=True,
                    verbose=1)
    elif model_name == "HER":
        model = HER(policy=policy, env=kart_env, model_class=DQN, verbose=1)
    else:  # model_name == "PPO2"
        model = PPO2(policy=policy,
                     learning_rate=0.0001,
                     env=kart_env,
                     verbose=1)

    if test_mode:  # in test mode, load the saved agent and let it drive
        # load() is a classmethod that returns a new model, so keep the result
        model = model.load(filepath)
        kart_env.set_continuos(True)

        while True:
            observation = kart_env.reset()
            while True:
                action, _states = model.predict(observation)
                observation, rewards, dones, info = kart_env.step(action)
                if dones:
                    break

    else:
        for i in range(1000):
            model.learn(total_timesteps=12500)
            model.save(str(env_name) + "_" + model_name + "_" + str(i + 1))
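A brief sketch of how launchAgent might be invoked, following the env_name/model_name codes documented in the docstring; the saved-model path in the test-mode call is a placeholder:

# Train a DQN agent on the default (enhanced minimap) environment.
launchAgent(env_name=5, model_name="DQN")

# Load a previously saved model and only drive it (no training); the filepath is a placeholder.
launchAgent(env_name=5, model_name="DQN", test_mode=True, filepath="5_DQN_1")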
Example #29
import os

import gym
import gym_donkeycar
import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

#SET UP ENVIRONMENT
os.environ['DONKEY_SIM_PATH'] = "./DonkeySimMac/donkey_sim.app/Contents/MacOS/donkey_sim"
os.environ['DONKEY_SIM_PORT'] = str(9091)
os.environ['DONKEY_SIM_HEADLESS'] = str(1)  # "1" is headless

env = gym.make("donkey-warehouse-v0")
#gym.make("donkey-generated-roads-v0")

timesteps = 100000  # Set this to a reasonable number
model_name = "dqn_model"  # Change the model name to your preferences
training = True  # Change this to test or use the model

if training:
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save(model_name)
else:
    model = DQN.load(model_name)
    obv = env.reset()
    for t in range(10000):
        action, _states = model.predict(obv)  # predict the next action from the observation
        # execute the action
        obv, reward, done, info = env.step(action)
Example #30
class Defense:
    def __init__(self,
                 method,
                 K,
                 P,
                 adverse_set_prob=0.0,
                 disj_supp_prob=0.0,
                 model_state=np.array([])):
        self.method = method

        self.K = K
        self.state_size = 2 * (self.K + 1)
        self.action_size = 2
        self.reward = []
        self.adverse_set_prob = adverse_set_prob
        self.disj_supp_prob = disj_supp_prob

        env_name = 'ErdosDefender-v0'
        self.log_dir = "/tmp/gym/"
        os.makedirs(self.log_dir, exist_ok=True)

        env = gym.make(env_name)
        env.init_params(K, P, adverse_set_prob, disj_supp_prob, model_state)
        env = Monitor(env, self.log_dir, allow_early_resets=True)
        self.envs = DummyVecEnv([lambda: env])

        if method == 'PPO':
            self.model = PPO2(MLP_PPO, self.envs, verbose=0)
        elif method == 'DQN':
            self.model = DQN(MLP_DQN, self.envs, verbose=0)
        elif method == 'A2C':
            self.model = A2C(MLP_A2C, self.envs, verbose=0)
        else:
            raise Exception("Error! Method must be 'PPO', 'DQN' or 'A2C'")
        print("Model initialized!")

        self.best_mean_reward, self.n_steps = -np.inf, 0

    def callback(self, _locals, _globals):
        """
        Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
        :param _locals: (dict)
        :param _globals: (dict)
        """

        # Print stats every 1000 calls
        if (self.n_steps + 1) % 1000 == 0:
            # Evaluate policy performance
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                print(x[-1], 'timesteps')
                print(
                    "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                    .format(self.best_mean_reward, mean_reward))

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward

        self.n_steps += 1
        return True

    def learn(self, timesteps=10000):
        self.model.learn(total_timesteps=timesteps, callback=self.callback)
        print("======\n{} LEARNING DONE DEFENSE\n======".format(self.method))

    def printViz(self, viz, k, N):

        plt.figure(figsize=(12, 8))
        for el in range(k + 1):
            plt.axhline(y=el - 0.5, linestyle='-')
        for el in range(N + 1):
            plt.axvline(x=el - 0.5, linestyle='-')

        plt.xticks(np.arange(N + 1))
        plt.yticks(np.arange(k + 1))
        plt.imshow(viz, origin='lower', cmap='gray', interpolation="none")

        plt.show()

    def simulate_trainedDefender(self):
        initial_state = self.envs.reset()
        A = initial_state[0][:self.K + 1]
        B = initial_state[0][self.K + 1:]
        N = np.sum(initial_state)
        viz = np.zeros((self.K + 1, N))

        for ind, el in enumerate((A + B).reshape(-1, 1)):
            viz[ind, :int(el)] = np.ones(int(el))

        print("Start..")
        print("Initial state:", A + B)
        self.printViz(viz, self.K, N)
        time.sleep(2)
        state = np.reshape(np.array(initial_state), [1, self.state_size])
        done = False

        while not done:
            clear_output(wait=True)
            print("Attacker turn..")
            partitionA = state[0][:self.K + 1]
            partitionB = state[0][self.K + 1:]
            print("Partitions : ", partitionA, partitionB)

            viz = np.zeros((self.K + 1, N))
            for i in range(self.K):
                ind1 = int(partitionA[i])
                ind2 = int(partitionB[i])
                viz[i, ind1:(ind1 + ind2)] = np.ones(ind2) * 0.3
                viz[i, :ind1] = np.ones(ind1) * 0.2
            self.printViz(viz, self.K, N)
            time.sleep(2)

            viz = np.zeros((self.K + 1, N))
            clear_output(wait=True)
            print("Defender turn..")

            action, _states = self.model.predict(state)
            state, reward, done, _ = self.envs.step(action)
            if (len(_[0]) != 0):
                state = _[0]['terminal_observation']
                state = np.reshape(np.array(state), [1, self.state_size])

            state = np.reshape(np.array(state), [1, self.state_size])
            A = state[0][:self.K + 1]
            B = state[0][self.K + 1:]
            if (action[0] == 1):
                print("Defender keeps:", partitionA)
                if partitionA[-1] > 0 or np.sum(partitionA) == 0:
                    done = True
            else:
                print("Defender keeps:", partitionB)
                if partitionB[-1] > 0 or np.sum(partitionB) == 0:
                    done = True

            for ind, el in enumerate((A + B).reshape(-1, 1)):
                if ind > 0:
                    viz[ind, :int(el)] = np.ones(int(el))

            self.printViz(viz, self.K, N)
            time.sleep(2)

            if done:
                if reward == 1:
                    print("Defender wins!!")
                else:
                    print("Attacker wins!!")
            else:
                partitionA = A
                partitionB = B

    def run(self, nb_episodes=1000):
        self.reward = []
        self.nb_episodes = nb_episodes

        for index_episode in range(nb_episodes):
            state = self.envs.reset()
            state = np.reshape(np.array(state), [1, self.state_size])
            done = False
            steps = 0
            while not done:
                action, _states = self.model.predict(state)
                next_state, reward, done, _ = self.envs.step(action)
                next_state = np.reshape(np.array(next_state),
                                        [1, self.state_size])
                state = next_state
                steps += 1
            if index_episode % 100 == 0:
                print("Episode {}#; \t Nb of steps: {}; \t Reward: {}.".format(
                    index_episode, steps + 1, reward))
            if index_episode > 0:
                self.reward += [
                    ((self.reward[-1] * len(self.reward)) + reward) /
                    (len(self.reward) + 1)
                ]
            else:
                self.reward += [reward]
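A short usage sketch for the Defense class; the method choice and the K/P values are illustrative assumptions whose meaning depends on the ErdosDefender-v0 environment:

defender = Defense(method='DQN', K=5, P=0.7)   # illustrative parameters
defender.learn(timesteps=10000)
defender.run(nb_episodes=200)
defender.simulate_trainedDefender()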