Example #1
def learn(gym_id, episodes=1000, batch_size=32, model_path="models/model.h5"):
    env = gym.make(gym_id)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n
    agent = DQN(create_model(num_states, num_actions))
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, num_states])
        total_reward = 0.
        for steps in range(500):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, agent.state_size])
            agent.remember(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state
            if done:
                print(
                    'Episode {}/{} done in {} steps, total reward: {}'.format(
                        e + 1, episodes, steps + 1, total_reward))
                if total_reward >= 200:
                    agent.save(model_path)
                    return agent
                break
            if agent.memory_size > batch_size:
                agent.train(batch_size)  # train the agent on a minibatch sampled from replay memory
    env.close()
    return None
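
A minimal driver for the learn() helper above, assuming the surrounding project provides DQN and create_model and that a CartPole-style environment is used (the gym id and episode count below are illustrative, not taken from the original):

if __name__ == "__main__":
    trained_agent = learn("CartPole-v1", episodes=500, batch_size=32)
    if trained_agent is None:
        print("No episode reached the reward threshold of 200; no model was saved.")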
Example #2
    def __init__(self, stateCnt, actionCnt, **kwargs):
        if 'state_1d' in kwargs:
            state_1d = kwargs['state_1d']
        else:
            state_1d = False
        if 'dueling' in kwargs:
            dueling = kwargs['dueling']
        else:
            dueling = False
        self.steps = 0
        self.epsilon = globalvars.MAX_EPSILON
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.dqn = DQN(self.stateCnt,
                       self.actionCnt,
                       state_1d=state_1d,
                       dueling=dueling)
        self.memory = Memory()
def learn(args):
    grid = args.grids[0]
    rico = Ricochet()
    rico.grid.load_grid(grid)
    app = Application(board=rico.grid, show_grid=True)
    if args.deep:
        model = DQN((16, 16), 16, verbose=True)
    else:
        model = Qlearn()

    if args.input:
        model.load_model(args.input)

    _thread.start_new_thread(
        _model_act, (app, model, *args.grids), {
            "output_path": (args.output if args.output else None),
            "learning": 1
        })
    app.mainloop()
def play(args):
    grid = args.grids[0]
    rico = Ricochet()
    rico.grid.load_grid(grid)
    app = Application(board=rico.grid, show_grid=True)
    if args.deep:
        model = DQN((16, 16),
                    16,
                    exploration_rate=0,
                    exploration_decay=0,
                    exploration_min=0)
    else:
        model = Qlearn(exploration_rate=0,
                       exploration_decay=0,
                       exploration_min=0)
    model.load_model(args.model)

    _thread.start_new_thread(_model_act, (app, model, *args.grids), {
        "learning": False,
        "nb_episode": 1,
        "nb_step": 0,
        "max_moves": 500
    })
    app.mainloop()
Example #5
        sys.stdout.write("\r" + text)
        sys.stdout.flush()


if __name__ == "__main__":
    seed = 1364
    total_episodes = 5001
    reward_curve_display_frequency = 100
    save_model_frequency = 100

    learning_rate = 0.001
    epsilon_decay = 0.0001
    gradient_clipping_norm = 0.7

    # Instantiate RL objects
    env = CartPoleV0(seed=seed)
    explorer = ActionExplorer(epsilon_decay=epsilon_decay, seed=seed)
    agent = DQN(env.input_dim,
                env.num_actions,
                explorer=explorer,
                gradient_clipping_norm=gradient_clipping_norm,
                learning_rate=learning_rate,
                double_dqn=True,
                seed=seed)
    # Run training
    train(env,
          agent,
          total_episodes=total_episodes,
          reward_curve_display_frequency=reward_curve_display_frequency,
          save_model_frequency=save_model_frequency)
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(action_size))

# Learning rate and Optimizer (Must be TF!)
LEARNING_RATE = 1e-3
tf_optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)

# Create the policy: epsilon-greedy with exponential decay
policy = EGP(init_eps=0.95, min_epsilon=0.01, decay=0.003)

# Can create an object for Prioritized Experience Replay
per = PER(priority_importance=0.6, initial_anneal=0.5, anneal_growth_rate=0.00008)

# Make the agent! In this case a Double DQN with PER.
# It can also be made dueling with the streams added automatically, or (like the commented-out model above) dueling with add_dueling_streams=False.
agent = DQN(double_dqn=True, PER=per, dueling_dqn=False, add_dueling_streams=False, model=model, optimizer=tf_optimizer,
            policy=policy, action_size=action_size, state_processor=state_processor, gamma=0.95,
            target_model_update_policy='soft', target_model_hard_policy_wait=500, target_model_soft_policy_constant=0.9,
            replay_period_wait=4, reward_clipping=True, huber_loss=True, batch_size=64, max_memory_length=10000)

# Make callbacks if you want; reward and epsilon printers are implemented
rew_cb = PrintReward()
eps_cb = PrintEpsilon(episodic=True, iterations=None)

# Make a benchmark if you want to keep track of info and data on the agent's testing performance
benchmark = Benchmark('bench_0', episode_iteration=1)

agent.train(env, 100000, None, print_rew_cb=rew_cb, print_eps_cb=eps_cb, visualize=False, allow_printing=True)

agent.test(env, 50000, None, print_rew_cb=rew_cb, benchmark=benchmark, visualize=False, allow_printing=True)
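
For context, the 'soft' target-model update configured above usually means Polyak averaging of the online weights into the target network; a minimal Keras-style sketch of that rule (the helper name, the value of tau, and its exact relationship to target_model_soft_policy_constant are assumptions, not this library's API):

def soft_update(online_model, target_model, tau=0.1):
    # target <- tau * online + (1 - tau) * target, applied weight-by-weight
    new_weights = [tau * w_online + (1.0 - tau) * w_target
                   for w_online, w_target in zip(online_model.get_weights(),
                                                 target_model.get_weights())]
    target_model.set_weights(new_weights)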
Example #7
def run_lrm(env_params, lp, rl):
    """
    This code learns a reward machine (RM) from experience and uses DQN or QRM to learn an optimal policy for that RM:
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' is the RL approach to use ("dqn" or "qrm")
    Returns the training rewards, the RM scores, and the RM info
    """
    # Initializing parameters and the game
    env = Game(env_params)
    rm = RewardMachine(lp.rm_u_max, lp.rm_preprocess,
                       lp.rm_tabu_size, lp.rm_workers, lp.rm_lr_steps,
                       env.get_perfect_rm(), lp.use_perfect_rm)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    rm_scores = []
    reward_total = 0
    last_reward = 0
    step = 0

    # Collecting random traces for learning the reward machine
    print("Collecting random traces...")
    while step < lp.rm_init_steps:
        # running an episode using a random policy
        env.restart()
        trace = [(env.get_events(), 0.0)]
        for _ in range(lp.episode_horizon):
            # executing a random action
            a = random.choice(actions)
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            reward_total += reward
            trace.append((o2_events, reward))
            step += 1
            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" %
                      (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
            # checking if the episode finishes
            if done or lp.rm_init_steps <= step:
                if done: rm.add_terminal_observations(o2_events)
                break
        # adding this trace to the set of traces that we use to learn the rm
        rm.add_trace(trace)

    # Learning the reward machine using the collected traces
    print("Learning a reward machines...")
    _, info = rm.learn_the_reward_machine()
    rm_scores.append((step, ) + info)

    # Start learning a policy for the current rm
    finish_learning = False
    while step < lp.train_steps and not finish_learning:
        env.restart()
        o1_events = env.get_events()
        o1_features = env.get_features()
        u1 = rm.get_initial_state()
        trace = [(o1_events, 0.0)]
        add_trace = False

        for _ in range(lp.episode_horizon):

            # reinitializing the policy if the rm changed
            if policy is None:
                print("Learning a policy for the current RM...")
                if rl == "dqn":
                    policy = DQN(lp, len(o1_features), len(actions), rm)
                elif rl == "qrm":
                    policy = QRM(lp, len(o1_features), len(actions), rm)
                else:
                    assert False, "RL approach is not supported yet"

            # selecting an action using epsilon greedy
            a = policy.get_best_action(o1_features, u1, lp.epsilon)

            # executing the selected action
            reward, done = env.execute_action(a)
            o2_events = env.get_events()
            o2_features = env.get_features()
            u2 = rm.get_next_state(u1, o2_events)

            # updating the number of steps and total reward
            trace.append((o2_events, reward))
            reward_total += reward
            step += 1

            # updating the current RM if needed
            rm.update_rewards(u1, o2_events, reward)
            if done: rm.add_terminal_observations(o2_events)
            if rm.is_observation_impossible(u1, o1_events, o2_events):
                # if o2 is impossible according to the current RM,
                # then the RM has a bug and must be relearned
                add_trace = True

            # Saving this transition
            policy.add_experience(o1_events, o1_features, u1, a, reward,
                                  o2_events, o2_features, u2, float(done))

            # Learning and updating the target networks (if needed)
            policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f" %
                      (step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total
                # finishing the experiment if the max number of learning steps was reached
                if policy._get_step() > lp.max_learning_steps:
                    finish_learning = True

            # checking if the episode finishes or the agent reaches the maximum number of training steps
            if done or lp.train_steps <= step or finish_learning:
                break

            # Moving to the next state
            o1_events, o1_features, u1 = o2_events, o2_features, u2

        # If the trace isn't correctly predicted by the reward machine,
        # we add the trace and relearn the machine
        if add_trace and step < lp.train_steps and not finish_learning:
            print("Relearning the reward machine...")
            rm.add_trace(trace)
            same_rm, info = rm.learn_the_reward_machine()
            rm_scores.append((step, ) + info)
            if not same_rm:
                # if the RM changed, we have to relearn all the q-values...
                policy.close()
                policy = None
            else:
                print("the new RM is not better than the current RM!!")
                #input()

    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards
    return train_rewards, rm_scores, rm.get_info()
Example #8
                    'dense_layers': [30, 15, env.num_actions],
                    'activation': 'relu',
                    # 'dense_bn': True
                },
                # 'gradient_clipping_norm': 0.7,
                'reward_to_go': True,
                'set_device': 'cpu',
                'learning_rate': 0.1,
                'seed': seed
            },
            'total_episodes': 10001
        }
    }

    agents = {
        'dqn': DQN(**parameters['dqn']['parameters']),
        'vpg': VanillaPolicyGradient(**parameters['vpg']['parameters'])
    }

    for agent_name, agent in agents.items():
        # Run training
        train(
            env,
            agent,
            total_episodes=parameters[agent_name]['total_episodes'],
            rolling_window_size=rolling_window_size,
            reward_curve_display_frequency=reward_curve_display_frequency,
            save_model_frequency=save_model_frequency
        )
Example #9
def main(args):

    # gpus
    if args.gpus is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # setup environment
    env = ContinuousPuckStack(args.num_blocks,
                              args.num_pucks,
                              args.num_pucks,
                              args.num_actions,
                              height_noise_std=args.height_noise_std,
                              random_shape=args.random_shape,
                              action_failure_prob=args.action_failure_prob)
    env.initStride = args.init_env_stride  # stride for initial puck placement
    env.stride = args.env_stride  # stride for action specification

    # setup the agent
    state_shape = (args.num_blocks * 28, args.num_blocks * 28, 1)
    output_shape = (args.num_actions, args.num_actions)

    agent = DQN(env,
                state_shape,
                output_shape,
                args.num_filters,
                args.filter_sizes,
                args.strides,
                args.hiddens,
                args.learning_rate,
                args.batch_size,
                constants.OPT_MOMENTUM,
                args.exploration_fraction,
                1.0,
                args.final_epsilon,
                args.max_time_steps,
                buffer_size=args.buffer_size,
                prioritized_replay=not args.disable_prioritized_replay,
                target_net=not args.disable_target_network,
                target_update_freq=args.target_update_freq,
                target_size=args.target_size,
                fix_dones=args.fix_dones)

    agent.start_session(args.num_cpu, args.gpu_memory_fraction)

    # maybe load weights
    if args.load_weights:
        agent.load(args.load_weights)
        print("Loaded weights.")

    # initialize a solver
    transitions = []

    collect_pre, collect_post = collect_factory_bisim(args.num_pucks)
    collect_data = collect_data_factory_bisim(transitions, args.save_exp_num)

    t_collect_pre, t_collect_post, t_collect_data = None, None, None
    if not args.save_exp_after_training:
        t_collect_pre, t_collect_post, t_collect_data = collect_pre, collect_post, collect_data

    solver = Solver(env,
                    agent,
                    args.max_time_steps,
                    learning_start=LEARNING_STARTS,
                    train_freq=TRAIN_FREQ,
                    max_episodes=args.max_episodes,
                    rewards_file=args.rewards_file,
                    animate=args.animate,
                    animate_from=args.animate_from,
                    gif_save_path=args.save_gifs_path,
                    gif_save_limit=args.save_limit,
                    gif_save_only_successful=args.save_only_successful,
                    max_depth_value=args.num_pucks,
                    collect_pre=t_collect_pre,
                    collect_post=t_collect_post,
                    collect_data=t_collect_data)

    # solve the environment
    solver.run()

    # save the weights of the network
    if args.save_weights is not None:
        agent.save(args.save_weights)

    # maybe run trained DQN
    if args.save_exp_after_training:

        agent.exploration_fraction = 1.0
        agent.init_explore = args.save_exp_eps
        agent.final_explore = args.save_exp_eps
        agent.setup_exploration_()

        solver = Solver(env,
                        agent,
                        args.save_exp_num,
                        learning_start=args.save_exp_num,
                        train_freq=TRAIN_FREQ,
                        train=False,
                        max_episodes=args.save_exp_num * 100,
                        collect_pre=collect_pre,
                        collect_post=collect_post,
                        collect_data=collect_data)
        solver.run()

    # maybe save the collected experience
    if args.save_exp_path is not None:

        if args.save_q_values:
            set_q_values_for_transitions(transitions, agent, args.batch_size)

        save_dir = os.path.dirname(args.save_exp_path)
        if len(save_dir) > 0 and not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        with open(args.save_exp_path, "wb") as file:
            pickle.dump(transitions, file)

    # stop session
    agent.stop_session()
Example #10
                               num_actions=env.available_actions,
                               policy=policy,
                               test_policy=policy,
                               processor=processor)
     else:
         # Setup DQN agent
         if opt.recurrent:
             model = DRQN_Model(window_length=opt.dqn_window_length,
                                num_actions=env.available_actions)
         else:
             model = DQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.available_actions)
         # Setup DQN agent
         agent = DQN(model=model,
                     num_actions=env.available_actions,
                     policy=policy,
                     test_policy=policy,
                     processor=processor)
 else:
     assert not opt.recurrent
     # Setup random process for exploration
     random_process = [
         GaussianWhiteNoiseProcess(sigma=0.0, mu=1.0),
         GaussianWhiteNoiseProcess(sigma=1.0, mu=0.0)
     ]
     # Setup DDPG agent model
     actor, critic, action_input = DDPG_Model(
         window_length=opt.ddpg_window_length,
         num_actions=env.available_actions)
     # Setup DDPG agent
     agent = DDPG(actor=actor,
Example #11
class Agent:
    def __init__(self, stateCnt, actionCnt, **kwargs):
        if 'state_1d' in kwargs:
            state_1d = kwargs['state_1d']
        else:
            state_1d = False
        if 'dueling' in kwargs:
            dueling = kwargs['dueling']
        else:
            dueling = False
        self.steps = 0
        self.epsilon = globalvars.MAX_EPSILON
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.dqn = DQN(self.stateCnt,
                       self.actionCnt,
                       state_1d=state_1d,
                       dueling=dueling)
        self.memory = Memory()

    def acts(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt - 1)
        else:
            return np.argmax(self.dqn.predictOne(s))

    def observe(self, sample):
        if self.steps <= globalvars.REPLAY_START_SIZE:
            error = abs(sample[2])
            self.memory.add(error, sample)
        else:
            x, y, a, errors = self._getTargets([(0, sample)])
            self.memory.add(errors[0], sample)
            if self.steps % globalvars.SYNC_TARGET == 0:
                self.dqn.update_target_model()
            # Epsilon decay
            self.epsilon = globalvars.MIN_EPSILON + \
                (globalvars.MAX_EPSILON - globalvars.MIN_EPSILON) * \
                math.exp(-globalvars.LAMBDA * (self.steps - globalvars.REPLAY_START_SIZE))
        self.steps += 1

    def _getTargets(self, batch):
        states = np.array([o[1][0] for o in batch])
        if len(self.stateCnt) > 1:
            no_state = np.zeros(self.stateCnt)
        else:
            no_state = np.zeros(self.stateCnt[0])
        states_ = np.array([(no_state if o[1][3] is None else o[1][3])
                            for o in batch])
        p = self.dqn.predict(states)
        p_ = self.dqn.predict(states_, target=False)
        pTarget_ = self.dqn.predict(states_, target=True)

        if len(self.stateCnt) > 1:
            x = np.zeros(states.shape)
        else:
            x = np.zeros((len(batch), self.stateCnt[0]))
        y = np.zeros((len(batch), self.actionCnt))
        errors = np.zeros(len(batch))
        actions = []
        for i in range(len(batch)):
            o = batch[i][1]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]
            t = p[i]
            oldVal = t[a]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + globalvars.GAMMA * pTarget_[i][np.argmax(p_[i])]
            x[i] = s
            y[i] = t
            actions.append(a)
            errors[i] = abs(oldVal - t[a])
        return x, y, actions, errors

    def replay(self):
        batch = self.memory.sample(globalvars.BATCH_SIZE)
        x, y, a, errors = self._getTargets(batch)
        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])
        self.dqn.train(x, y)

    def save(self, name):
        self.dqn.save(name)
        print('Saved model to ', name)

    def load(self, name):
        self.dqn.load(name)
        print('Loaded model from ', name)
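
For reference, the target computed in _getTargets above follows the Double DQN rule: the online network selects the next action and the target network evaluates it. A standalone sketch of that rule (the function name and arguments are illustrative, not part of the example):

import numpy as np

def double_dqn_target(reward, q_next_online, q_next_target, gamma=0.99, terminal=False):
    # q_next_online / q_next_target: 1-D arrays of Q-values for the next state
    if terminal:
        return reward
    best_action = int(np.argmax(q_next_online))          # action selection by the online network
    return reward + gamma * q_next_target[best_action]   # evaluation by the target network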
Example #12
    parser.add_argument('-n',
                        '--normalize',
                        help='Normalize inputs',
                        action='store_true')
    args = parser.parse_args()

    if not args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    # Build environment
    env = Environment(args.env, args.render, args.normalize)

    # Load config
    with open(
            pkg_resources.resource_filename(
                __name__, f'../config/{args.agent.lower()}.yaml')) as file:
        config = yaml.load(file, Loader=yaml.FullLoader)
        config = config[args.env]

    # Build model
    model = None
    if args.agent == 'DQN':
        model = DQN(env, config)
    elif args.agent == 'A2C':
        model = A2C(env, config)
    elif args.agent == 'PPO':
        model = PPO(env, config)

    # Train model
    model.train()
Example #13
                              num_actions=env.action_space.n,
                              policy=policy,
                              test_policy=policy,
                              processor=processor)
    else:
        # Setup DQN agent
        if opt.recurrent:
            model = DRQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.action_space.n)
        else:
            model = DQN_Model(window_length=opt.dqn_window_length,
                              num_actions=env.action_space.n)
        # Setup DQN agent
        agent = DQN(model=model,
                    num_actions=env.action_space.n,
                    policy=policy,
                    test_policy=policy,
                    processor=processor)
else:
    agent = RandomAgent(num_actions=env.action_space.n, processor=processor)

print(args.env_name + ' initialized.')

# Setup weights path
path = os.path.join('weights', 'Atari', '{}'.format(args.env_name))
if not os.path.exists(path):
    os.makedirs(path)
weights_path = os.path.join(path, 'weights.hdf5')

# Run the agent
agent.fit(env=env,
Example #14
    # parser.add_argument('--alpha', help='weigth for intrinsic reward and external reward')
    # parser.add_argument('--term', help='termination factor')

    args = parser.parse_args()

    # initialize RL elements
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    sess = tf.Session()

    # seed setting
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # info about cartpole task
    # print env.action_space
    # print env.observation_space
    # print env.observation_space.high # bound for state
    # print env.observation_space.low

    buffer = ReplayBuffer(args.capacity, args.batch, args.seed)

    agent = DQN(
        sess,
        env.observation_space.shape[0],
        env.action_space.n,
        buffer=buffer)

    example = DQNDemo(agent, env, max_episode=args.episodes)
    example.run()
Example #15
import unittest

import gym
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from collections import deque
import numpy as np
    
from agents.dqn import DQN

env = gym.make('CartPole-v0')
agent = DQN(env)

class UnitTests(unittest.TestCase):
    '''
    Unittest suite
    '''

    def test_build_model(self):
        '''
        Unittest for _build_model function
        Check if the model has input and output layers that
        match the observation space and action space 
        specifically for cartpole problem
        '''
        random_model = agent._build_model()
        assert random_model.input_shape[1] == 4, "The model is not compatible for cartpole with observation space equal to 4"
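        # Hedged continuation, not part of the original snippet: the docstring above also
        # mentions checking the output layer; CartPole-v0 exposes 2 discrete actions.
        assert random_model.output_shape[1] == 2, "The model is not compatible for cartpole with action space equal to 2"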
Example #16
def train():

    # build SFDQN
    print('building SFDQN')
    deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params)
    sfdqn = SFDQN(deep_sf=deep_sf,
                  buffer=ReplayBuffer(sfdqn_params['buffer_params']),
                  **sfdqn_params,
                  **agent_params)

    # train SFDQN
    print('training SFDQN')
    train_tasks, test_tasks = generate_tasks(False)
    sfdqn_perf = sfdqn.train(train_tasks,
                             n_samples,
                             test_tasks=test_tasks,
                             n_test_ev=agent_params['n_test_ev'])

    # build DQN
    print('building DQN')
    dqn = DQN(model_lambda=dqn_model_lambda,
              buffer=ReplayBuffer(dqn_params['buffer_params']),
              **dqn_params,
              **agent_params)

    # training DQN
    print('training DQN')
    train_tasks, test_tasks = generate_tasks(True)
    dqn_perf = dqn.train(train_tasks,
                         n_samples,
                         test_tasks=test_tasks,
                         n_test_ev=agent_params['n_test_ev'])

    # smooth data
    def smooth(y, box_pts):
        return np.convolve(y, np.ones(box_pts) / box_pts, mode='same')

    sfdqn_perf = smooth(sfdqn_perf, 10)[:-5]
    dqn_perf = smooth(dqn_perf, 10)[:-5]
    x = np.linspace(0, 4, sfdqn_perf.size)

    # reporting progress
    ticksize = 14
    textsize = 18
    plt.rc('font', size=textsize)  # controls default text sizes
    plt.rc('axes', titlesize=textsize)  # fontsize of the axes title
    plt.rc('axes', labelsize=textsize)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('legend', fontsize=ticksize)  # legend fontsize

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.plot(x, sfdqn_perf, label='SFDQN')
    ax.plot(x, dqn_perf, label='DQN')
    plt.xlabel('training task index')
    plt.ylabel('averaged test episode reward')
    plt.title('Testing Reward Averaged over all Test Tasks')
    plt.tight_layout()
    plt.legend(frameon=False)
    plt.savefig('figures/sfdqn_return.png')
Example #17
def run_baseline(env_params, lp, rl, k_order):
    """
    This baseline learns a policy directly (no reward machine is inferred), feeding the agent a stack of the last k observations:
        - 'env_params' is the environment parameters
        - 'lp' is the set of learning parameters
        - 'rl' is the approach to use ("dqn" or "human")
        - 'k_order' is the number of stacked observations
    Returns the training rewards
    """
    # Initializing parameters and the game
    env = Game(env_params)
    actions = env.get_actions()
    policy = None
    train_rewards = []
    reward_total = 0
    last_reward  = 0
    step = 0

    # Start learning a policy for the current rm
    while step < lp.train_steps:
        env.restart()
        o1_events   = env.get_events()
        o1_features = env.get_features()
        # computing the stack of features for o1
        k_prev_obs = [np.zeros(len(o1_features)) for _ in range(k_order-1)] # saves the k-previous observations
        k_prev_obs.insert(0, o1_features)
        o1_stack = np.concatenate(tuple(k_prev_obs), axis=None)
        for _ in range(lp.episode_horizon):

            # initializing the policy on the first iteration
            if policy is None:
                if rl == "dqn":
                    policy = DQN(lp, k_order * len(o1_features), len(actions), None)
                elif rl == "human":
                    policy = None
                else:
                    assert False, "RL approach is not supported yet"            

            # selecting an action using epsilon greedy
            if rl == "human":
                if random.random() < 0.1: a = random.randrange(4)
                else: a = env.get_optimal_action().value
            else:
                a = policy.get_best_action(o1_stack, 0, lp.epsilon)

            # executing the selected action
            reward, done = env.execute_action(a)
            o2_events   = env.get_events()
            o2_features = env.get_features()

            # Appending the new observation and computing the stack of features for o2
            k_prev_obs.insert(0, o2_features)
            k_prev_obs.pop()
            o2_stack = np.concatenate(tuple(k_prev_obs), axis=None)

            # updating the number of steps and total reward
            reward_total += reward
            step += 1

            if rl != "human":
                # Saving this transition
                policy.add_experience(o1_events, o1_stack, 0, a, reward, o2_events, o2_stack, 0, float(done))

                # Learning and updating the target networks (if needed)
                policy.learn_if_needed()

            # Testing
            if step % lp.test_freq == 0:
                print("Step: %d\tTrain: %0.1f"%(step, reward_total - last_reward))
                train_rewards.append((step, reward_total - last_reward))
                last_reward = reward_total

            # checking if the episode finishes
            if done or lp.train_steps <= step: 
                break 

            # Moving to the next state
            o1_events, o1_features, o1_stack = o2_events, o2_features, o2_stack

    # closing the policy
    if policy is not None:
        policy.close()
        policy = None

    # return the training rewards
    return train_rewards
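
The k-order stacking above (insert the newest observation at the front, drop the oldest) can also be written with a fixed-length deque; a small equivalent sketch for illustration (the helper name is hypothetical):

from collections import deque

import numpy as np

def make_stacker(k_order, feature_dim):
    # keeps the newest observation first and zero-pads at the start of an episode
    frames = deque([np.zeros(feature_dim) for _ in range(k_order)], maxlen=k_order)

    def push(obs):
        frames.appendleft(obs)  # newest first; the oldest entry is dropped automatically
        return np.concatenate(tuple(frames), axis=None)

    return push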