def main():
    import gym
    import numpy as np
    from gym.wrappers.monitor import Monitor
    import quanser_robots
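    # load_dqn_policy, train_dqn_policy, load_lspi_policy and train_lspi_policy
    # are assumed to be defined or imported elsewhere in this file.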

    def evaluate(env, policy, num_evals=25):
        ep_returns = []
        for eval_num in range(num_evals):
            episode_return = 0
            dones = False
            obs = env.reset()
            while not dones:
                action = policy(obs)
                obs, rewards, dones, info = env.step(action)
                episode_return += rewards
            ep_returns.append(episode_return)
        return ep_returns

    def render(env, policy):
        obs = env.reset()
        done = False
        while not done:
            env.render()
            act = policy(obs)
            obs, _, done, _ = env.step(act)

    def check(env, policy):
        render(env, policy)
        ret_all = evaluate(env, policy)
        print(np.mean(ret_all), np.std(ret_all))
        env.close()

    # DQN I: Check learned policy
    env = Monitor(gym.make('CartpoleSwingShort-v0'), 'dqn_eval')
    policy = load_dqn_policy()
    check(env, policy)

    # DQN II: Check learning procedure
    env = Monitor(gym.make('CartpoleSwingShort-v0'),
                  'dqn_train',
                  video_callable=False)
    policy = train_dqn_policy(env)
    check(env, policy)

    # LSPI I: Check learned policy
    env = Monitor(gym.make('CartpoleStabShort-v0'), 'lspi_eval')
    policy = load_lspi_policy()
    check(env, policy)

    # LSPI II: Check learning procedure
    env = Monitor(gym.make('CartpoleStabShort-v0'),
                  'lspi_train',
                  video_callable=False)
    policy = train_lspi_policy(env)
    check(env, policy)
Example #2
  def play_poison(self, n_step=10000, n_episode=1000, test_ep=None, render=False):
    print('play poison: ', self.poison)
    print('is_train: ', self.is_train)
    print('+++++++++++++++++++++++++++++++++==')

    if test_ep is None:
      test_ep = self.ep_end

    test_history = History(self.config)

    
    if not self.display:
      gym_dir = '/tmp/%s-%s' % (self.env_name, get_time())
      #self.env.env.monitor.start(gym_dir)
      monitor = Monitor(self.env.env, directory=gym_dir)

    best_reward, best_idx = 0, 0
    total_reward = 0.

    for idx in range(n_episode):
      screen, reward, action, terminal = self.env.new_random_game()
      current_reward = 0

      for _ in range(self.history_length):
        test_history.add(screen)

      for t in tqdm(range(n_step), ncols=70):
        # 1. predict
        action = self.predict(test_history.get(), test_ep)
        # 2. act
        screen, reward, terminal = self.env.act(action, is_training=False)
        # 3. observe
        test_history.add(screen)

        # print('step: ', t, ' action: ', action, ' reward: ', reward)

        current_reward += reward
        if terminal:
          break

      if current_reward > best_reward:
        best_reward = current_reward
        best_idx = idx

      total_reward += current_reward

      print("="*30)
      print(" [%d] Best reward : %d" % (best_idx, best_reward))
      print("="*30)

    print('average reward is: ', total_reward/n_episode)
    if not self.display:
      monitor.close()
Example #3
    def reset(self, record):
        self.episode_step = 0

        if record:
            self.env = Monitor(gym.make('LunarLander-v2'),
                               "recordings",
                               video_callable=lambda episode_id: True,
                               force=True)
        else:
            self.env = gym.make('LunarLander-v2')

        return self.env.reset()
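For comparison, a variant of the same wrapper call that records only every Nth episode instead of every one (a sketch following the pattern used in Examples #4 and #6 below; record_every is an illustrative name):

import gym
from gym.wrappers.monitor import Monitor

record_every = 10  # illustrative value
env = Monitor(gym.make('LunarLander-v2'),
              "recordings",
              video_callable=lambda episode_id: episode_id % record_every == 0,
              force=True)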
Example #4
 def make_env():
     env = gym.make(env_id)
     if record_video:
         video_path = os.path.join(output_dir, 'video')
         ensure_dir(video_path)
         env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
     return env    
Example #5
def inizialize_wrapper(env, frame_skip: int, frame_width: int,
                       frame_height: int, record_path: str):
    """ Applica un set di wrappers per i giochi Atari"""
    env = Monitor(env=env, directory=record_path, resume=True)
    env = MaxAndSkipEnv(env=env, skip=frame_skip)
    env = WarpFrame(env=env, width=frame_width, height=frame_height)
    env = ScaledFloatFrame(env=env)
    return env
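A possible usage sketch for the wrapper chain above (the game id and parameter values are illustrative; MaxAndSkipEnv, WarpFrame and ScaledFloatFrame are assumed to be imported from the project's usual Atari wrapper module):

import gym

env = gym.make('BreakoutNoFrameskip-v4')
env = inizialize_wrapper(env, frame_skip=4, frame_width=84, frame_height=84,
                         record_path='./recordings')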
Example #6
 def _thunk():
     env = make_atari(env_id)
     env.seed(seed + rank)
     if record_video:
         video_path = os.path.join(output_dir, 'video/env-%d' % rank)
         ensure_dir(video_path)
         env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
     return wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)
Example #7
def make_atari_env(name, seed):
    from gym.wrappers.monitor import Monitor
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env.seed(seed)
    return env
Example #8
def make_atari_env(name, history_len):
    from gym.envs.atari.atari_env import AtariEnv
    from gym.wrappers.monitor import Monitor
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env = wrappers.HistoryWrapper(env, history_len)
    env.seed(utils.random_seed())
    return env
Example #9
 def make_env():
     env = gym.make(env_id)
     if record_video:
         print("RECORDING VIDEO")
         video_path = os.path.join(output_dir, 'video')
         ensure_dir(video_path)
         env = Monitor(env, video_path, video_callable=lambda episode_id: episode_id % record_video_freq == 0, force=True)
 #        env.render()
     return env
Example #10
def train_and_evaluate(args, monitor_path, checkpoint_step_filename,
                       checkpoint_weights_filename, weights_filename,
                       log_filename):

    env = gym.make(args["env_name"])
    env = Monitor(env,
                  monitor_path,
                  resume=True,
                  uid=args["run_id"],
                  video_callable=lambda episode_num: episode_num % args[
                      "record_video_every"] == 0)
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])
    starting_step = 0
    if os.path.exists(checkpoint_step_filename):
        with open(checkpoint_step_filename, 'r') as f:
            starting_step = int(f.read())
    args["starting_step"] = starting_step
    dqn = make_deep_q_network(env, args)
    if args["starting_step"] > 0:
        dqn.load_weights(checkpoint_weights_filename)

    callbacks = [
        ReloadModelIntervalCheckpoint(checkpoint_weights_filename,
                                      step_path=checkpoint_step_filename,
                                      interval=args["checkpoint_frequency"],
                                      starting_step=starting_step),
        MyTrainLogger(args["checkpoint_frequency"], args["training_steps"],
                      starting_step, log_filename)
    ]

    if args["mode"] == "Train":
        dqn.fit(env,
                callbacks=callbacks,
                verbose=0,
                nb_steps=args["training_steps"] - starting_step,
                nb_max_start_steps=args["strarting_fire_steps"],
                start_step_policy=lambda obs: 1)  # 1 is fire action

        dqn.save_weights(weights_filename, overwrite=True)
    else:
        dqn.load_weights(weights_filename)

    env = gym.make(args["env_name"])
    env = Monitor(env, monitor_path, resume=True, uid=args["run_id"] + "_test")
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])
    dqn.test(env,
             nb_episodes=1,
             visualize=False,
             nb_max_start_steps=args["strarting_fire_steps"],
             start_step_policy=lambda obs: 1)  # 1 is fire action
Example #11
def check_pyglet():
    from pyglet.window import key

    a = np.array([0.0, 0.0, 0.0])

    def key_press(k, _mod):
        if k == key.LEFT:
            a[0] = +1.0
        if k == key.RIGHT:
            a[0] = -1.0
        if k == key.UP:
            a[1] = +1.0
        if k == key.DOWN:
            a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, _mod):
        if k == key.LEFT and a[0] == +1.0:
            a[0] = 0
        if k == key.RIGHT and a[0] == -1.0:
            a[0] = 0
        if k == key.UP:
            a[1] = 0
        if k == key.DOWN:
            a[2] = 0

    env = CarRacingFix()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor

        env = Monitor(env, '/tmp/video-test', force=True)
    if_open = True
    while if_open:
        env.reset()
        total_reward = 0.0
        steps = 0
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
                # import matplotlib.pyplot as plt
                # plt.imshow(s)
                # plt.savefig("test.jpeg")
            steps += 1
            if_open = env.render()
            if done or not if_open:
                break
    env.close()
def main(side_force):
    from pyglet.window import key
    a = np.array([0.0, 0.0, 0.0])

    def key_press(k, mod):
        global restart
        if k == 0xff0d: restart = True
        if k == key.LEFT: a[0] = -1.0
        if k == key.RIGHT: a[0] = +1.0
        if k == key.UP: a[1] = +1.0
        if k == key.DOWN:
            a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, mod):
        if k == key.LEFT and a[0] == -1.0: a[0] = 0
        if k == key.RIGHT and a[0] == +1.0: a[0] = 0
        if k == key.UP: a[1] = 0
        if k == key.DOWN: a[2] = 0

    env = CarRacing(side_force=side_force)
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
            steps += 1
            isopen = env.render()
            if done or restart or isopen == False:
                break
    env.close()
Example #13
def train_ddpg_official():
    env = LunarLanderContinuous()
    # env = LunarLander()
    env.render()

    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor

        env = Monitor(env, config.video_folder / "ddpg/", force=True)

    num_states = env.observation_space.shape[0]
    print("Size of State Space ->  {}".format(num_states))
    if env.continuous:
        num_actions = env.action_space.shape[0]
        print("Size of Action Space ->  {}".format(num_actions))

        upper_bound = env.action_space.high[0]
        lower_bound = env.action_space.low[0]

        print("Max Value of Action ->  {}".format(upper_bound))
        print("Min Value of Action ->  {}".format(lower_bound))
    else:
        num_actions = env.action_space.n
        print("Size of Action Space ->  {}".format(num_actions))

        upper_bound = num_actions
        lower_bound = 0
        print("Max Value of Action ->  {}".format(upper_bound))
        print("Min Value of Action ->  {}".format(lower_bound))

    ddpg = DDPG_OFF(num_states, num_actions, lower_bound, upper_bound)
    avg_reward_list = ddpg.train(env, 10)

    # Plotting graph
    # Episodes versus Avg. Rewards
    plt.plot(avg_reward_list)
    plt.xlabel("Episode")
    plt.ylabel("Avg. Epsiodic Reward")
    plt.show()
Example #14
def train_ddpg():
    env = LunarLander()
    env.render()
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor

        env = Monitor(env, config.video_folder / "ddpg/", force=True)

    agent = DDPG(env=env,
                 num_actions=4,
                 input_shape=env.observation_space.shape[0],
                 continuous=False)  # env.action_space.shape[0])
    agent.load_model()
    n_games = 51

    figure_file = config.plots_folder / "ddpg/lunar_planer.png"
    load_checkpoint = True

    score_history = agent.train(env, n_games, load_checkpoint)

    if not load_checkpoint:
        x = [i + 1 for i in range(n_games)]
        plot_learning_curve(x, score_history, figure_file)
Example #15
def render(env, agent, name="", record=False):
    if record:
        env = Monitor(env,
                      './video-test/{}'.format(name),
                      force=True,
                      mode="evaluation")
    for i_episode in range(5):
        state = env.reset()
        total_reward = 0
        for step, _ in enumerate(range(STEPS), start=1):
            state = np.expand_dims(state, axis=0)
            env.render()

            action_index = agent.act(state)
            action = decode_action(action_index)

            next_state, reward, done, info = env.step(action)
            if done:
                break
            state = next_state
            total_reward += reward

        print("Episode achieves total reward {}".format(total_reward))
Example #16
     if k==key.RIGHT: a[0] = +1.0
     if k==key.UP:    a[1] = +1.0
     if k==key.DOWN:  a[2] = +0.8   # set 1.0 for wheels to block to zero rotation
 def key_release(k, mod):
     if k==key.LEFT  and a[0]==-1.0: a[0] = 0
     if k==key.RIGHT and a[0]==+1.0: a[0] = 0
     if k==key.UP:    a[1] = 0
     if k==key.DOWN:  a[2] = 0
 env = CarRacing()
 env.render()
 env.viewer.window.on_key_press = key_press
 env.viewer.window.on_key_release = key_release
 record_video = False
 if record_video:
     from gym.wrappers.monitor import Monitor
     env = Monitor(env, '/tmp/video-test', force=True)
 isopen = True
 while isopen:
     env.reset()
     total_reward = 0.0
     steps = 0
     restart = False
     while True:
         s, r, done, info = env.step(a)
         total_reward += r
         if steps % 200 == 0 or done:
             print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
             print("step {} total_reward {:+0.2f}".format(steps, total_reward))
             #import matplotlib.pyplot as plt
             #plt.imshow(s)
             #plt.savefig("test.jpeg")
Example #17
                feed.update({rwd: rwd_queue[rand_indexs]})
                feed.update({next_obs: next_obs_queue[rand_indexs]})
                if not learning_finished:  # If not solved, we train and get the step loss
                    step_loss_value, _ = sess.run([loss, train_step], feed_dict=feed)
                else:  # If solved, we just get the step loss
                    step_loss_value = sess.run(loss, feed_dict=feed)
                # Use sum to calculate average loss of this episode
                sum_loss_value += step_loss_value

        print("====== Episode {} ended with score = {}, avg_loss = {} ======".format(i_episode + 1, score,
                                                                               sum_loss_value / score))
        score_queue.append(score)
        if len(score_queue) > MAX_SCORE_QUEUE_SIZE:
            score_queue.pop(0)
            if np.mean(score_queue) > 195:  # The threshold of being solved
                learning_finished = True
            else:
                learning_finished = False
        if learning_finished:
            print("Testing !!!")
        # save progress every 100 episodes
        if learning_finished and i_episode % 100 == 0:
            saver.save(sess, 'checkpoints-cartpole/' + GAME + '-dqn', global_step=global_step)


if __name__ == "__main__":
    env = gym.make(GAME)
    env = Monitor(env, './test/', force=True)
    train(env)
    env.close()
Example #18
        if k == key.RIGHT and a[0] == +0.3:
            a[0] = 0
        if k == key.UP:
            a[1] = 0
        if k == key.DOWN:
            a[2] = 0

    env = CarRacing()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor

        env = Monitor(env, "./tmp/video-test", force=True)
    isopen = True

    record_s, record_a, record_r = [], [], []
    episode_num = 0
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)

            if a[0] == -0.3:
                record_a.append(0)
            elif a[0] == 0.3:
def do_run(run, dirname, args):
    """
    global snapshot
    snapshot2 = tracemalloc.take_snapshot()
    print(('MEMORY', run, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    if snapshot is not None:
        top_stats = snapshot2.compare_to(snapshot, 'lineno')
        print("[ Top 10 differences ]")
        for stat in top_stats[:10]:
            print(stat)
        print()
    snapshot = snapshot2
    """
    with tf.Graph().as_default():
        learner_assumptions = get_learner_assumption_kwargs(args)

        # Each run has a different random seed equal to the run id.
        np.random.seed(run)
        random.seed(run)

        is_gridworld = 'lunar' not in args.env_name.lower()

        # TODO: Reset test goal inside here? Or use environment instead?
        rollouts = [[]]
        # Initialize model with wrong transition model based on aristotle learner.
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=test_env,
            n=args.n_initial_rollouts,
            task_idx=task_idx,
        )
        assert(len(rollouts[0]) == args.n_initial_rollouts)
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=wrong_train_env,
            n=args.n_initial_wrong_rollouts,
            task_idx=task_idx,
        )

        model = None
        Q = None
        start_pos = None

        logs = []
        evals = []
        evals_unassisted = []
        learner_q_values = []
        with tf.Session() as sess:
            if needs_model:
                model = inverse_softq.InverseSoftQModel(
                    train_envs=[test_env]
                )

            # NOTE: Used to be inside episode loop!
            # TODO: Check if this broke anything!
            support_env = get_support_env(
                s=args.learner_support,
                model=model,
                sess=sess,
                goal=test_goal,
                test_act_labels=test_act_labels,
                n_act_dim=n_act_dim,
                threshold=args.bumper_threshold,
                q_bumper_boltzmann=args.q_bumper_boltzmann,
                q_bumper_version=args.q_bumper_version,
                q_bumper_target_r=args.q_bumper_target_r,
                q_bumper_length_normalized=args.q_bumper_length_normalized,
                q_bumper_logistic_upper_prob=args.q_bumper_logistic_upper_prob,
                q_bumper_alpha=args.q_bumper_alpha,
                q_threshold=args.q_threshold,
                test_env=test_env,
                env_name=args.env_name,
                start_pos=start_pos,
                trajectory_distance=args.trajectory_distance,
                dirname=dirname,
                p_override=args.p_override,
                undoing=args.undoing,
                p_suboptimal_override=args.p_suboptimal_override,
                override_next_best=args.override_next_best,
                optimal_agent_training_timesteps=args.optimal_agent_training_timesteps,
                optimal_agent_smoothing_timesteps=args.optimal_agent_smoothing_timesteps,
                gamma=args.gamma,
            )
            policy = get_learner_policy(
                s=args.learner_policy,
                #model=model,
                #sess=sess,
                #test_goal=test_goal,
                #train_act_labels=train_act_labels,
                #test_act_labels=test_act_labels,
                #n_act_dim=n_act_dim,
                #Q=Q,
                env=support_env,
                exploration_fraction=args.exploration_fraction,
                exploration_final_eps=args.exploration_final_eps,
                exploration_final_lr=args.exploration_final_lr,
                total_episodes=args.n_episodes,
                run=run,
            )


            for ep in range(args.n_episodes):
                #print('Rn: {} Ep: {}'.format(run, ep), flush=True)
                support_env_with_monitor = Monitor(
                    support_env,
                    directory=os.path.join(
                        dirname,
                        'assisted',
                        str(run).zfill(3),
                        str(ep).zfill(3),
                    ),
                    force=True,
                    video_callable=lambda e: True if is_gridworld or utils.IS_LOCAL else False,
                    #video_callable=(lambda e: True) if is_gridworld else None,
                )
                # Simulate human learning
                """
                if args.learner_policy == 'q':
                    assert(args.n_learn_rollouts > 0)
                    Q = policies.q_learning(
                        rollouts if ep == 0 else [rollouts[0][-args.n_learn_rollouts:]],
                        n_obs_dim=n_obs_dim,
                        n_act_dim=n_act_dim,
                        user_action=args.think_all_actions_own,
                        Q_init=Q,
                        learning_rate=args.q_learning_rate,
                    )
                """

                _logs = None
                if needs_model:
                    _logs = inverse_softq.run_learning(
                        model=model,
                        sess=sess,
                        # train_tasks=train_aristotle_envs[:1],
                        rollouts=rollouts,
                        test_goal=test_goal,
                        test_act_labels=test_act_labels,
                        train_act_labels=train_act_labels,
                        n_iters=args.n_softq_train_iters,
                        train_frac=0.9,  # TODO: Change to 1
                        **learner_assumptions
                    )

                # Test
                #episode_seed = [run, ep]

                perf = compute_assisted_perf(
                    model=model,
                    sess=sess,
                    #test_act_labels=test_act_labels,
                    #train_act_labels=train_act_labels,
                    test_env=support_env_with_monitor,
                    policy=policy,
                    goal=test_goal,
                    #seed=episode_seed,
                    n_eval_rollouts=args.n_eval_rollouts,
                    policy_explore=True,
                    policy_update=True,
                    **learner_assumptions
                )

                unassisted_perf = None
                if args.n_eval_unassisted_rollouts is not None:
                    unassisted_support_env = get_support_env(
                        s='unassisted',
                        goal=test_goal,
                        test_act_labels=test_act_labels,
                        n_act_dim=n_act_dim,
                        test_env=test_env,
                        env_name=args.env_name,
                        start_pos=start_pos,
                        trajectory_distance=args.trajectory_distance,
                        dirname=dirname,
                    )
                    unassisted_support_env_with_monitor = Monitor(
                        unassisted_support_env,
                        directory=os.path.join(
                            dirname,
                            'unassisted',
                            str(run).zfill(3),
                            str(ep).zfill(3),
                        ),
                        force=True,
                        video_callable=lambda e: True if is_gridworld or utils.IS_LOCAL else False,
                        #video_callable=(lambda e: True) if is_gridworld else None,
                    )
                    unassisted_perf = compute_assisted_perf(
                        model=model,
                        sess=sess,
                        #test_act_labels=test_act_labels,
                        #train_act_labels=train_act_labels,
                        test_env=unassisted_support_env_with_monitor,
                        policy=policy,
                        goal=test_goal,
                        #seed=episode_seed,
                        n_eval_rollouts=args.n_eval_unassisted_rollouts,
                        policy_explore=False,
                        policy_update=False,
                    )
                    unassisted_support_env_with_monitor.close()
                    unassisted_support_env.close()

                new_rollouts = perf['rollouts']
                rollouts[task_idx] += new_rollouts[:args.n_learn_rollouts]
                if _logs is not None:
                    logs.append(_logs)
                evals.append(perf)
                evals_unassisted.append(unassisted_perf)
                if args.learner_policy == 'q':
                    learner_q_values.append(copy(policy.Q))

                support_env_with_monitor.close()

        support_env.close()
        policy.close()

        out_d = {
                'logs': logs,
                'evals': evals,
                'evals_unassisted': (
                    evals_unassisted
                    if args.n_eval_unassisted_rollouts is not None
                    else None
                ),
                'q_values': learner_q_values,
                'args': vars(args),
                'run': run,
                'support_details': support_env.get_support_details(),
        }
        with open(
                os.path.join(dirname, 'data{}.json'.format(str(run).zfill(3))),
                'w',
        ) as f:
            json.dump(out_d, f, cls=NumpyEncoder)
Example #20
            a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, mod):
        if k == key.LEFT and a[0] == -1.0: a[0] = 0
        if k == key.RIGHT and a[0] == +1.0: a[0] = 0
        if k == key.UP: a[1] = 0
        if k == key.DOWN: a[2] = 0

    env = CarRacingV1()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))
                #import matplotlib.pyplot as plt
                #plt.imshow(s)
Example #21
            a[0] = 0
        if k == key.D:     set_trace()
        if k == key.R:     env.reset()
        if k == key.Z:     env.change_zoom()
        if k == key.G:     env.switch_intersection_groups()
        if k == key.I:     env.switch_intersection_points()
        if k == key.X:     env.switch_xt_intersections()
        if k == key.E:     env.switch_end_of_track()
        if k == key.S:     env.switch_start_of_track()
        if k == key.T:     env.screenshot('./')
        if k == key.Q:     sys.exit()

    env.render()
    if record_video:
        PATH = os.path.abspath('env/') + '/data_samples/core_environments/'
        env = Monitor(env, PATH + 'car_racing_advanced', force=True)
    # env.key_press_fn = key_press
    # env.key_release_fn = key_release

    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    while True:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False

        while True:
            if discretize is not None:
                a_tmp = a[0]
            else:
Example #22
"""
Use this file to check that your implementation complies with our evaluation
interface.
"""

import gym
from gym.wrappers.monitor import Monitor
from challenge1 import get_model, get_policy

# 1. Learn the model f: s, a -> s', r
env = Monitor(gym.make('Pendulum-v0'),
              'training',
              video_callable=False,
              force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)
env.close()

# Your model will be tested on the quality of prediction
obs = env.reset()
act = env.action_space.sample()
nobs, rwd, _, _ = env.step(act)
nobs_pred, rwd_pred = model(obs, act)
print(f'truth = {nobs, rwd}\nmodel = {nobs_pred, rwd_pred}')
env.close()

# 2. Perform dynamic programming using the learned model
env = Monitor(gym.make('Pendulum-v0'), 'evaluation', force=True)
env.seed(31186490)
policy = get_policy(model, env.observation_space, env.action_space)
Example #23
        self.steps += 1
        if self.steps == self.max_length:
            done = True
        return ob, reward, done, info


if __name__ == '__main__':

    max_iterations = 80
    max_episodes = 100
    max_trajectory = 50

    task = MaxLength(WarpFrame(CollectEnv(goal_condition=lambda x: (x.colour == 'beige' and x.shape == 'square')
                                                                   or (x.colour == 'purple' and x.shape == 'circle'))),
                     max_trajectory)
    env = Monitor(task, './experiment_weighted_or/', video_callable=False, force=True)

    dqn_purple_circle = load('./models/purple_circle/model.dqn', task)  # entropy regularised functions
    dqn_beige_crate = load('./models/beige_crate/model.dqn', task)  # entropy regularised functions
    weights = np.arange(1/3, 3.01, 0.05)

    tally = {i: [] for i in range(len(weights))}

    for iter in range(max_iterations):
        for i, weight in enumerate(weights):
            collected_count = [0, 0]
            weight = 1
            dqn_composed = ComposedDQN([dqn_beige_crate, dqn_purple_circle], [weight, 1])
            for episode in range(max_episodes):
                if episode % 1000 == 0:
                    print(episode)
Example #24
import gym, time
import numpy as np
from getModel import getModelQube, getModelPendel
from gym.wrappers.monitor import Monitor
from sklearn.neural_network import MLPRegressor
from challenge1_template import get_model, get_policy
from scipy import spatial

env = Monitor(gym.make('Pendulum-v0'),
              'training',
              video_callable=False,
              force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)

max_state = env.observation_space.high
min_state = env.observation_space.low
max_action = env.action_space.high
min_action = env.action_space.low
discret_states = 100
discrete_actions = 4
discount_factor = 0.99
theta = 1


def discreizeSpace(min_state, max_state, discret_num):
    discrete_space = []
    for i in range(0, len(max_state)):
        min = min_state[i]
        max = max_state[i]
Example #25
def record_game(env):
    # TODO: Test this
    return Monitor(env, '/tmp/video-test', force=True)
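A minimal usage sketch for the helper above, assuming Monitor is imported in the same module and using CartPole-v0 purely as an illustrative environment:

import gym

env = record_game(gym.make('CartPole-v0'))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # closing the Monitor flushes the recorded video to /tmp/video-test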
Example #26
from model import Model
import numpy as np
import tensorflow as tf
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.wrappers.monitor import Monitor

# Test the model
env = gym.make("CartPole-v0")
env = Monitor(env, "videos", force=True)
model = Model(num_actions=env.action_space.n)

obs = env.reset()
action, value = model.action_value(obs[None, :])
print(action, value)

# Create the agent
class Agent():
    def __init__(self, model):
        self.params = { "value": 0.5,
                        "entropy": 0.0001,
                        "gamma": 0.99}

        self.model = model 
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(lr=0.0005),
            loss = [self._logits_loss, self._value_loss]
        )

    def test(self, env, render=True):
        obs, done, ep_reward = env.reset(), False, 0
Example #27
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        #read actions from file
        global env4list
        #toRender["rl"] = 1
        #fin = open(sys.argv[1],"r")
        #line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1

        #fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)

        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10
    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / (math.pi)
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / (math.pi)
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position,
                                                env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / (math.pi)
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55
    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):

        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position,
                                     [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _,_,env0done,_ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))

    toRender["random"] and env0.close()
    toRender["line"] and env1.close()
    toRender["circle"] and env2.close()
    toRender["cycloid"] and env3.close()
    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return
Example #28
class DQNAgent:
    def __init__(self, lr, momentum, alpha, gamma, target_update_frequency,
                 local_update_frequency, replay_start_size, queue_len,
                 batch_size):
        gym.logger.set_level(40)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = gym.make('LunarLander-v2')
        self.replay_buffer = ReplayBuffer(queue_len, self.device, alpha)

        self.local_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork.load_state_dict(self.local_qnetwork.state_dict())
        self.optimizer = optim.RMSprop(self.local_qnetwork.parameters(),
                                       lr=lr,
                                       momentum=momentum)

        self.gamma = gamma
        self.target_update_frequency = target_update_frequency
        self.local_update_frequency = local_update_frequency
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size

        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.episode_step = 0

    def agent_step(self, state, eps, beta):
        next_state, reward, done = self.env_step(state, eps)
        if len(self.replay_buffer.queue) < self.replay_start_size:
            return next_state, reward, None, done

        # Update the local q network every local_update_frequency steps
        loss = None
        if self.episode_step % self.local_update_frequency == 0:
            loss = self.qnetwork_step(beta)

        # Update the target q network every target_update_frequency steps
        if self.episode_step % self.target_update_frequency == 0:
            self.target_qnetwork.load_state_dict(
                self.local_qnetwork.state_dict())

        self.episode_step += 1
        return next_state, reward, loss, done

    def env_step(self, state, eps):
        action = self.policy(state, eps)
        next_state, reward, done, _ = self.env.step(action)

        self.replay_buffer.put([state, action, reward, next_state, done])
        return next_state, reward, done

    def qnetwork_step(self, beta):
        states, actions, rewards, next_states, dones, indices, is_weights = self.replay_buffer.batch_get(
            self.batch_size, self.state_size, beta)

        # Double DQN
        next_target_actions = torch.argmax(self.local_qnetwork(next_states),
                                           dim=1).unsqueeze(1)
        next_target_rewards = self.target_qnetwork(next_states).gather(
            1, next_target_actions)
        target_rewards = rewards + self.gamma * next_target_rewards * (1 - dones)
        local_rewards = self.local_qnetwork(states).gather(1, actions.long())

        self.optimizer.zero_grad()
        td_error = (local_rewards - target_rewards.detach())**2
        loss = torch.mean(is_weights.unsqueeze(1) * td_error)
        loss.backward()
        for param in self.local_qnetwork.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.replay_buffer.update_priorities(indices,
                                             td_error.data.cpu() + 0.0001)
        return loss.item()

    def policy(self, state, eps):
        if random.random() < eps:
            # Random action
            return self.env.action_space.sample()
        else:
            # Act according to local q network
            self.local_qnetwork.eval()
            with torch.no_grad():
                out = self.local_qnetwork(
                    torch.FloatTensor(state).to(
                        self.device).unsqueeze(0)).cpu()
            self.local_qnetwork.train()

            return torch.argmax(out).item()

    def reset(self, record):
        self.episode_step = 0

        if record:
            self.env = Monitor(gym.make('LunarLander-v2'),
                               "recordings",
                               video_callable=lambda episode_id: True,
                               force=True)
        else:
            self.env = gym.make('LunarLander-v2')

        return self.env.reset()
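A hypothetical training loop for the agent above; every hyperparameter value is an illustrative assumption rather than a setting from the original project:

agent = DQNAgent(lr=5e-4, momentum=0.95, alpha=0.6, gamma=0.99,
                 target_update_frequency=1000, local_update_frequency=4,
                 replay_start_size=1000, queue_len=100000, batch_size=64)
for episode in range(3):
    state = agent.reset(record=(episode == 2))  # record only the last episode
    done = False
    while not done:
        state, reward, loss, done = agent.agent_step(state, eps=0.1, beta=0.4)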
Example #29
File: Game.py  Project: eoinmca/AI_Driver
def record_game(env):
    # TODO: Test this
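    # VIDEO_PATH, os and Monitor are assumed to be imported/defined elsewhere in Game.py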
    print('Recording')
    if not os.path.exists(VIDEO_PATH):
        os.mkdir(VIDEO_PATH)
    return Monitor(env, VIDEO_PATH, force=True)
Example #30
import gym
from gym.wrappers.monitor import Monitor
from tf_rl.env.pybullet.env_list import ENVS

for key, env_name in ENVS.items():
    print(env_name)
    env = gym.make(env_name)
    env = Monitor(env=env, directory="./video/{}".format(key), force=True)

    state = env.reset()
    for t in range(100):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            break

    env.close()