Example #1
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
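    # Roll out `episode_num` evaluation episodes on the wrapped environment and
    # log the per-episode and running-mean rewards. ForwardReward, FrameSkip,
    # ActionScale and OfficialObs are wrappers defined elsewhere in this project.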
    np.random.seed(seed)

    env = L2M2019Env(difficulty=3, visualize=vis)
    env.change_model(model='3D', difficulty=3)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = OfficialObs(env)
    all_reward = []

    for e in range(episode_num):
        episode_reward = 0.0
        observation = env.reset(project=True, obs_as_dict=True)
        step = 0
        target_change_times = 0
        while True:
            step += 1
            action = submit_model.pred_batch(observation, target_change_times)
            observation, reward, done, info = env.step(
                action, project=True, obs_as_dict=True)
            if info['target_changed']:
                target_change_times += 1
            episode_reward += reward
            if done:
                break
        all_reward.append(episode_reward)
        logger.info("[episode/{}] episode_reward:{} mean_reward:{}".format(\
                      e, episode_reward, np.mean(all_reward)))
Example #2
    def __init__(self,
                 difficulty,
                 vel_penalty_coeff,
                 muscle_penalty_coeff,
                 penalty_coeff,
                 only_first_target=False):
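        # Build a freshly seeded L2M2019Env and wrap it with this project's
        # reward, frame-skip, action-scale and observation wrappers.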

        random_seed = np.random.randint(int(1e9))

        env = L2M2019Env(difficulty=difficulty,
                         visualize=False,
                         seed=random_seed)
        max_timelimit = env.time_limit

        env = FinalReward(env,
                          max_timelimit=max_timelimit,
                          vel_penalty_coeff=vel_penalty_coeff,
                          muscle_penalty_coeff=muscle_penalty_coeff,
                          penalty_coeff=penalty_coeff)

        if only_first_target:
            assert difficulty == 3, "argument `only_first_target` is available only in `difficulty=3`."
            env = FirstTarget(env)

        env = FrameSkip(env)
        env = ActionScale(env)
        self.env = OfficialObs(env, max_timelimit=max_timelimit)
Example #3
    def __init__(self, config):
        super(LearnToMove, self).__init__(config)

        self.env = L2M2019Env(visualize=bool(config['visualize']), integrator_accuracy=0.001)
        self.project = True # False - dict of size 14, True - dict of size 4
        self.env.reset(project=self.project)
        self.observation_transformer = ObservationTransformer()
Example #4
    def __init__(self,
                 history_len=1,
                 frame_skip=1,
                 reward_scale=1,
                 reload_period=None,
                 action_mean=None,
                 action_std=None,
                 visualize=False,
                 mode="train",
                 **params):
        super().__init__(visualize=visualize, mode=mode)

        env = L2M2019Env(**params, visualize=visualize)
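        # EnvNormalizer is a project-specific wrapper defined elsewhere in the
        # repository (judging by the name, it normalises the environment interface).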
        env = EnvNormalizer(env)
        self.env = env

        self._history_len = history_len
        self._frame_skip = frame_skip
        self._visualize = visualize
        self._reward_scale = reward_scale
        self._reload_period = reload_period or BIG_NUM
        self.episode = 0

        self.action_mean = np.array(action_mean) \
            if action_mean is not None else None
        self.action_std = np.array(action_std) \
            if action_std is not None else None

        self._prepare_spaces()
Example #5
class MyEnv(L2M2019Env):
    env = L2M2019Env(visualize=False)
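    # The inner environment is created once as a class attribute; reset/step
    # delegate to it and flatten the returned observation dict with the
    # project's get_observation helper.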
    def reset(self, **kwargs):
        obs_dict = self.env.reset()
        return get_observation(obs_dict)
    def step(self, action, **kwargs):
        obs_dict, reward, done, info = self.env.step(action)
        return get_observation(obs_dict), reward, done, info
Example #6
    def __init__(self, it_max, ep_max):
        super().__init__()
        self.it_max = it_max
        self.ep_max = ep_max
        self.env = L2M2019Env(visualize=False, difficulty=3)
#        self.obs_high = np.array(self.env.observation_space.high)
#        self.obs_low = np.array(self.env.observation_space.low)
        self.stop_measure = 0
        self.patience = 5
Example #7
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', action='store_true')
    parser.add_argument('--load_data', action='store_true')
    parser.add_argument('--load_policy', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_expert_rollouts',
                        type=int,
                        default=1,
                        help='Number of expert roll outs')
    parser.add_argument('--num_dagger_updates',
                        type=int,
                        default=20,
                        help='Number of dagger iterations')
    parser.add_argument(
        '--rollouts_per_update',
        type=int,
        default=5,
        help='Number of rollouts collected per dagger iteration')
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--batch_size', type=int, default=32)

    mode = '2D'
    difficulty = 2
    visualize = False
    seed = None
    sim_dt = 0.01
    sim_t = 10
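    # 10 s of simulated time at a 0.01 s integration step -> 1000 control steps.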
    timstep_limit = int(round(sim_t / sim_dt))

    if mode == '2D':
        params = np.loadtxt('./osim/control/params_2D.txt')
    elif mode == '3D':
        params = np.loadtxt('./osim/control/params_3D.txt')

    args = parser.parse_args()

    locoCtrl = OsimReflexCtrl(mode=mode, dt=sim_dt)
    locoCtrl.set_control_params(params)
    env = L2M2019Env(visualize=args.visualize,
                     seed=seed,
                     difficulty=difficulty)
    env.change_model(model=mode, difficulty=difficulty, seed=seed)

    env.spec.timestep_limit = timstep_limit

    max_steps = args.max_timesteps or env.spec.timestep_limit

    with tf.Session():
        initialize()
        dagger_policy_fn = DAgger().run_dagger(
            env, args.load_data, args.load_policy, max_steps,
            args.num_expert_rollouts, args.num_dagger_updates,
            args.rollouts_per_update, args.epochs, args.batch_size, locoCtrl)
Example #8
    def __init__(self, args):
        self.id = rpc.get_worker_info().id

        if args.env_name == 'L2M2019Env':
            self.env = L2M2019Env(visualize=False,
                                  difficulty=args.difficulty,
                                  seed=args.seed + self.id)
            self.test_env = L2M2019Env(visualize=False,
                                       difficulty=args.difficulty,
                                       seed=args.seed + self.id + 999)
            self.obs_mean = np.array(args.obs_mean)
            self.obs_std = np.array(args.obs_std)
        else:
            self.env = gym.make(args.env_name)
            self.test_env = gym.make(args.env_name)
            self.env.seed(args.seed + self.id)
            self.test_env.seed(args.seed + self.id + 999)

        self.act_limit = self.env.action_space.high[0]
        self.done = True
        self.len = 0

        self.args = args
Example #9
def f_ind(n_gen, i_worker, params):
    flag_model = '2D'
    flag_ctrl_mode = '2D'  # use 2D
    seed = None
    difficulty = 0
    sim_dt = 0.01
    sim_t = 20
    timstep_limit = int(round(sim_t / sim_dt))

    init_error = True
    error_count = 0
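    # OpenSim environment construction can fail intermittently, so retry until
    # the environment and the initial observation are created successfully.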
    while init_error:
        try:
            locoCtrl = OsimReflexCtrl(mode=flag_ctrl_mode, dt=sim_dt)
            env = L2M2019Env(seed=seed, difficulty=difficulty, visualize=False)
            env.change_model(model=flag_model,
                             difficulty=difficulty,
                             seed=seed)
            obs_dict = env.reset(project=True,
                                 seed=seed,
                                 init_pose=init_pose,
                                 obs_as_dict=True)
            init_error = False
        except Exception as e_msg:
            error_count += 1
            print('\ninitialization error (x{})!!!'.format(error_count))
            #print(e_msg)
            #import pdb; pdb.set_trace()
    env.spec.timestep_limit = timstep_limit + 100

    total_reward = 0
    error_sim = 0
    t = 0
    while True:
        t += sim_dt

        locoCtrl.set_control_params(params)
        action = locoCtrl.update(obs_dict)
        obs_dict, reward, done, info = env.step(action,
                                                project=True,
                                                obs_as_dict=True)
        total_reward += reward

        if done:
            break

    print('\n    gen#={} sim#={}: score={} time={}sec #step={}'.format(
        n_gen, i_worker, total_reward, t, env.footstep['n']))

    return total_reward  # minimization
Example #10
    def __init__(self, world_size, args):
        if args.env_name == 'L2M2019Env':
            env = L2M2019Env(visualize=False, difficulty=args.difficulty)
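            # obs_dim is hard-coded for a custom flattened observation rather than
            # taken from env.observation_space (cf. the 99-dimensional vector built
            # by get_observation in Example #16).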
            obs_dim = 99
        else:
            env = gym.make(args.env_name)
            obs_dim = env.observation_space.shape[0]

        act_dim = env.action_space.shape[0]

        self.device = torch.device(args.device)

        self.args = args
        self.world_size = world_size

        self.actor_critic = MLPActorCritic(obs_dim,
                                           act_dim,
                                           hidden_sizes=args.hidden_sizes).to(
                                               self.device)
        self.replay_buffer = [
            ReplayBuffer(obs_dim, act_dim, args.buffer_size)
            for _ in range(1, world_size)
        ]

        self.gac = GAC(self.actor_critic,
                       self.replay_buffer,
                       device=self.device,
                       gamma=args.gamma,
                       alpha_start=args.alpha_start,
                       alpha_min=args.alpha_min,
                       alpha_max=args.alpha_max)

        self.test_len = 0.0
        self.test_ret = 0.0

        self.ob_rrefs = []
        for ob_rank in range(1, world_size):
            ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
            self.ob_rrefs.append(remote(ob_info, Observer, args=(args, )))

        self.agent_rref = RRef(self)
Example #11
import time
import csv
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from osim.env import L2M2019Env
import sys
import h5py

# Show entire qtable
np.set_printoptions(threshold=sys.maxsize)

# Adjust max_episode_steps and episodes for different learning

# Initialize Environment
env_name = 'L2M2019Env'
env = L2M2019Env(visualize=False)
env.reset()
env._max_episode_steps = 10  #set max steps per episode
#env.seed(0) #set environment seed for same initial positions
#np.random.seed(0) #set numpy rng to reproduce same "random" action sequence

# Get State Space
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

# Set Hyperparameters
initial_lr = 1.0  #learning rate
min_lr = 0.005  #min learning rate
gamma = 0.8  #discount factor = balances immediate and future reward (ranges 0.8 to 0.99)
epsilon = 0.05  #higher -> more exploitation, less exploration
n_states = 339  #number of states
Example #12
    while n < num_episodes:
        # if render:
        #     env.render()
        #     time.sleep(1e-3)

        a = get_action(o)
        o, r, d, _ = env.step(a, obs_as_dict=False)
        o = np.array(o)
        ep_ret += r
        ep_len += 1

        if d or (ep_len == max_ep_len):
            print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(obs_as_dict=False), 0, False, 0, 0
            o = np.array(o)
            n += 1

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fpath', type=str)
    parser.add_argument('--len', '-l', type=int, default=0)
    parser.add_argument('--episodes', '-n', type=int, default=100)
    parser.add_argument('--render', '-r', action='store_true')
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument('--device', type=str, default='cuda:0')
    parser.add_argument('--difficulty', type=int, default=1)
    args = parser.parse_args()
    env = L2M2019Env(visualize=args.render, difficulty=args.difficulty)
    get_action = load_pytorch_policy(args.fpath, args.device, args.deterministic)
    run_policy(env, get_action, args.len, args.episodes, args.render)
Example #13
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Input, concatenate
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from osim.env import L2M2019Env

import argparse

# Command line parameters
parser = argparse.ArgumentParser(description='Train or test neural net motor controller')
parser.add_argument('--model', dest='model', action='store', default="example.h5f")
parser.add_argument('--episodes', type=int, default=5)
args = parser.parse_args()

env = L2M2019Env(visualize=True)

nb_actions = env.action_space.shape[0]

# Total number of steps in training

# Create networks for DDPG
# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
Example #14
    pd.DataFrame(Reward).to_csv("./results/reward.csv")

    actor.save("./results/")
    critic.save("./results/")


if __name__ == '__main__':

    model = '3D'
    difficulty = 1
    seed = None
    project = True
    obs_as_dict = False

    env = L2M2019Env(seed=seed, difficulty=difficulty, visualize=False)
    env.change_model(model=model, difficulty=difficulty, seed=seed)
    obs_dict = env.reset(project=project, seed=seed, obs_as_dict=obs_as_dict)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    lr = 0.0002
    tau = 0.001

    #print(env.action_space.high,env.action_space.low)

    #state_dim = vectorized_state.shape[0]
    #action_dim = int(vectorized_state.shape[0]/4)
    '''
    print("state",vectorized_state)
Example #15
                res, [diff_vel_x / 5.0, diff_vel_z / 5.0, diff_vel / 5.0])

        # current relative target theta
        target_v_x, target_v_z = obs_dict['v_tgt_field'][0][5][5], obs_dict[
            'v_tgt_field'][1][5][5]

        target_theta = math.atan2(target_v_z, target_v_x)
        diff_theta = target_theta
        res = np.append(res, [diff_theta / np.pi])

        return res


if __name__ == '__main__':
    from osim.env import L2M2019Env

    env = L2M2019Env(difficulty=3, visualize=False)
    env.change_model(model='3D', difficulty=3)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = OfficialObs(env)
    observation = env.reset(project=True, obs_as_dict=True)
    print(observation.shape)
    while True:
        _, _, done, _ = env.step(env.action_space.sample(),
                                 project=True,
                                 obs_as_dict=True)
        if done:
            break
Example #16
def main(args):

    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    data = np.load('./official_obs_scaler.npz')
    obs_mean, obs_std = data['mean'], data['std']

    # 1.Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2.Create actor, critic, EnvSampler() and PPO.
    if 'L2M2019Env' in args.env_name:
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(obs_dim,
                                  act_dim,
                                  hidden_sizes=args.hidden_sizes).to(device)

    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    gac = GAC(actor_critic,
              replay_buffer,
              device=device,
              gamma=args.gamma,
              alpha_start=args.alpha_start,
              alpha_min=args.alpha_min,
              alpha_max=args.alpha_max)

    def act_encoder(y):
        # y = [min, max] ==> x = [-1, 1]
        # if args.env_name == 'L2M2019Env':
        #     return y
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        # x = [-1, 1] ==> y = [min, max] (inverse of act_encoder)
        # if args.env_name == 'L2M2019Env':
        #     return np.abs(x)
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def get_observation(env):
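        # The first 242 entries of the raw observation are the flattened
        # 2 x 11 x 11 v_tgt_field; the remaining body-state values are kept,
        # normalised, and the 2-D target velocity at the pelvis is appended.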
        obs = np.array(env.get_observation()[242:])

        obs = (obs - obs_mean) / obs_std

        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T

        return np.append(obs, v_tgt)

    def get_reward(env):
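        # Shaped reward: a constant alive bonus minus penalties for deviating
        # from the target velocity and for muscle activation, plus an extra
        # bonus when the pelvis velocity closely tracks the target.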
        reward = 10.0

        # Reward for not falling down
        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T

        vel_penalty = np.linalg.norm(v_body - v_tgt)

        muscle_penalty = 0
        for muscle in sorted(state_desc['muscles'].keys()):
            muscle_penalty += np.square(
                state_desc['muscles'][muscle]['activation'])

        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)

        if vel_penalty < 0.3:
            ret_r += 10

        return ret_r

    # 3.Start training.
    def get_action(o, deterministic=False):
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = actor_critic.act(o, deterministic)
        return a

    def test_agent():
        test_ret, test_len = 0, 0
        for j in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(o, True)
                a = act_decoder(a)

                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d: break

                o = get_observation(test_env)

            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
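        # Warm up with uniformly random actions for the first `start_steps`
        # environment steps before sampling from the policy.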
        if t <= args.start_steps:
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)

        a = act_decoder(a)

        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d: break

        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)

        d = False if ep_len == args.max_ep_len else d

        # if not d:
        #     new_o, new_r, new_o2 = generate_success(o, o2)
        #     replay_buffer.store(new_o, a, new_r * args.reward_scale, new_o2, d)

        # Store experience to replay buffer
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)

        o = o2
        if d or (ep_len == args.max_ep_len):
            _, ep_len = env.reset(obs_as_dict=False), 0
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
            gac.update_beta()
            print(
                "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                .format(loss_a, loss_c, alpha, gac.beta))

        # End of epoch handling
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print(
                "-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic
Example #17
    1.709011708233401095e-01,  # ankle flex
    0 * np.pi / 180,  # [left] hip adduct
    -5.282323914341899296e-02,  # hip flex
    -8.041966456860847323e-01,  # knee extend
    -1.745329251994329478e-01
])  # ankle flex

if mode == '2D':
    params = np.loadtxt('params_2D.txt')
elif mode == '3D':
    params = np.loadtxt('params_3D_init.txt')

locoCtrl = OsimReflexCtrl(mode=mode, dt=sim_dt)
locoCtrl.set_control_params(params)

env = L2M2019Env(visualize=visualize, seed=seed, difficulty=difficulty)
env.change_model(model=mode, difficulty=difficulty, seed=seed)
obs_dict = env.reset(project=True,
                     seed=seed,
                     obs_as_dict=True,
                     init_pose=INIT_POSE)
env.spec.timestep_limit = timstep_limit

total_reward = 0
t = 0
i = 0

# initiate onn network

#onn_network = ONN(features_size=2, max_num_hidden_layers=5,
#                 qtd_neuron_per_hidden_layer=10, n_classes=2,loss_fun = 'mse')
Example #18
parser.add_argument('--seed', type=int, default=0,
                    help='random seed for evaluation')

args = parser.parse_args()
# Settings
remote_base = 'http://osim-rl-grader.aicrowd.com/'
cgp_id = args.ind

# Create environment
if args.live:
    with open(args.token, 'r') as f:
        aicrowd_token = f.read().strip()
    client = Client(remote_base)
    observation = client.env_create(aicrowd_token, env_id='L2M2019Env')
else:
    env = L2M2019Env(visualize=args.visual)
    observation = env.reset(seed=args.seed)

# CGP controller
library = build_funcLib()
ind = CGP.load_from_file(cgp_id, library)
l2meval = L2MEvaluator(1e8, 1)
i = 0
j = 0
r_total = 0.0

while True:
    inputs = l2meval.get_inputs(observation)
    outputs = l2meval.scale_outputs(ind.run(inputs))

    if args.live:
Example #19
File: l2m.py  Project: ShawK91/l2m
    def __init__(self,
                 visualize=False,
                 integrator_accuracy=5e-5,
                 frameskip=4,
                 T=2500,
                 action_clamp=False,
                 difficulty=2,
                 project=True):
        """
        A base template for all environment wrappers.
        """
        from osim.env import L2M2019Env
        self.env = L2M2019Env(visualize=visualize,
                              integrator_accuracy=integrator_accuracy,
                              seed=0,
                              report=None,
                              difficulty=difficulty)
        self.frameskip = frameskip
        self.T = T
        self.istep = 0
        self.action_clamp = action_clamp
        self.project = project

        #Self Params
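        # Observation size used by this wrapper: 169 with the projected
        # observation (project=True), otherwise 228 + 72 = 300.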
        self.state_dim = 169 if self.project else 228 + 72
        self.action_dim = 22
        self.test_size = 5

        #Trackers
        self.shaped_reward = {
            'num_footsteps': [],
            'crouch_bonus': [],
            'knee_bend': [],
            'toes_low': [],
            'x_penalty': [],
            'z_penalty': []
        }
        self.original_reward = 0.0
        self.fell_down = False

        #Reward Shaping components
        self.ltoes = {
            'x': [],
            'y': [],
            'z': []
        }
        self.rtoes = {
            'x': [],
            'y': [],
            'z': []
        }
        self.ltibia = {
            'x': [],
            'y': [],
            'z': []
        }
        self.rtibia = {
            'x': [],
            'y': [],
            'z': []
        }
        self.pelvis = {'x': [], 'y': [], 'z': []}

        self.ltibia_angle = []
        self.rtibia_angle = []
        self.lfemur_angle = []
        self.rfemur_angle = []
Example #20
#round_n = 1 # Round 1
round_n = 3  # 1: Round 1; 2: Round 2; anything else: custom setting below

if round_n == 1:
    difficulty = 2  # 2: Round 1; 3: Round 2
    seed = None
    project = True
    obs_as_dict = True
elif round_n == 2:
    difficulty = 3  # 2: Round 1; 3: Round 2
    seed = None
    project = True
    obs_as_dict = True
else:
    difficulty = 0  # 0: constant forward velocities; 1: consecutive sinks forward for walking
    seed = None
    project = True
    obs_as_dict = True
#=== this is the official setting for Learn to Move 2019 ===#

env = L2M2019Env(seed=seed, difficulty=difficulty)
env.change_model(model=model, difficulty=difficulty, seed=seed)
obs_dict = env.reset(project=project, seed=seed, obs_as_dict=obs_as_dict)

while True:
    obs_dict, reward, done, info = env.step(env.action_space.sample(),
                                            project=project,
                                            obs_as_dict=obs_as_dict)
    if done:
        break
Example #21
    def __init__(self, visualization):
        # Create environment
        self.env = L2M2019Env(visualize=visualization)
        self.observation = self.env.reset()
        self.reward = 0