Example #1
def main():

    tensorboard_log = "./log"

    env = Pinokio5()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)

    try:
        while True:
            #model.learn(total_timesteps=10000)
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)

            model.save(save_file)

            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
Example #2
def main():

    tensorboard_log = "./log"

    env = Pinokio3()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch)
        model = PPO(MlpPolicy, DummyVecEnv([lambda: env]), verbose=1,
                    policy_kwargs=policy_kwargs,  # pass the custom net_arch/activation
                    tensorboard_log=tensorboard_log)

    # https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html
    checkpoint_callback = CheckpointCallback(save_freq=10000,
                                             save_path='./checkpoints/',
                                             name_prefix='pinokio3')


    while True:
        model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name)

        model.save(save_file)
        print("saved")

        obs = env.reset()
        for i in range(20):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            print("action {} -> reward {}".format(env.decode_action(action), reward))
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Example #3
def trained_agent(episodes=256,
                  continuous=True,
                  load=None,
                  save_name="test",
                  ent_coef=0.00001,
                  total_timesteps=25000,
                  learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy',
                    env,
                    verbose=1,
                    ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log=f"./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
Example #4
File: rl_demo.py  Project: yonkshi/bulb
def main(args):
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn, = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Example #5
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn, = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Example #6
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)

    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not (os.path.exists(policy_save_dir)):
        os.makedirs(policy_save_dir)

    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #7
def main():

    env = Pinokio2()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        #model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)

        model.save(save_file)

        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Example #8
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    #Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Example #9
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
                ":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy,
                             vec_env,
                             policy_kwargs=policy_kwargs,
                             verbose=1,
                             learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(np.array(obs['board']).reshape(
            6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        else:
            return random.choice([
                col for col in range(conf.columns)
                if obs['board'][int(col)] == 0
            ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
            ":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name,
                              env=vec_env,
                              custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
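
CustomCNN and CustomCnnPolicy above are project-specific classes that the snippet does not include. As a rough sketch only, a features extractor compatible with the features_extractor_class / features_extractor_kwargs arguments could follow the standard Stable-Baselines3 pattern (an assumption about the missing class, not the project's actual implementation; the board observation would also need to be channels-first for a CNN policy):

import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    # Minimal CNN features extractor; features_dim matches
    # features_extractor_kwargs=dict(features_dim=256) above.
    def __init__(self, observation_space, features_dim=256):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size with one forward pass on a sample observation.
        with th.no_grad():
            sample = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations):
        return self.linear(self.cnn(observations))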
Example #10
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train
    }

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if TEST_OR_TRAIN == "train":
        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        if not (os.path.exists(policy_save_dir)):
            os.makedirs(policy_save_dir)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        # env = env_change_input(time_step=env_params['time_step'],
        #                        robot_class=env_params['robot_class'],
        #                        on_rack=env_params['on_rack'],
        #                        enable_self_collision=env_params['enable_self_collision'],
        #                        motor_control_mode=env_params['motor_control_mode'],
        #                        train_or_test=env_params['train_or_test'])
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir,
                                       'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #11
def main():
    env = gym.make(ENV_NAME)
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()

    env.close()
Example #12
def main(fitts_W, fitts_D, ocular_std, swapping_std, run, timesteps,
         logs_folder):
    # Create log dir
    lc_dir = f'./{logs_folder}/w{fitts_W}d{fitts_D}ocular{ocular_std}swapping{swapping_std}/'
    log_dir = f'{lc_dir}/run{run}/'
    os.makedirs(log_dir, exist_ok=True)

    # Instantiate the env
    env = Gaze(fitts_W=fitts_W,
               fitts_D=fitts_D,
               ocular_std=ocular_std,
               swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Train the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)
    model.learn(total_timesteps=int(timesteps))
    # save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # plot learning curve
    plot_results2(log_dir)

    plt.savefig(f'{lc_dir}learning_curve{run}.png')
    plt.close('all')

    ###########################################################################
    # Record Behaviour of the trained policy
    ###########################################################################
    # save the step data

    # Test the trained agent
    n_eps = 5000
    number_of_saccades = np.ndarray(shape=(n_eps, 1), dtype=np.float32)
    eps = 0
    while eps < n_eps:
        done = False
        step = 0
        obs = env.reset()
        while not done:
            step += 1
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            if done:
                number_of_saccades[eps] = step
                eps += 1
                break

    np.savetxt(f'{log_dir}num_saccades.csv', number_of_saccades, delimiter=',')
Example #13
def DRL_prediction(model: PPO, environment: StockTradingEnv) -> object:
    """make a prediction"""
    test_env, test_obs = environment.get_sb_env()
    account_memory = []
    actions_memory = []
    test_env.reset()
    for i in range(len(environment.df.index.unique())):
        action, _ = model.predict(test_obs, deterministic=True)
        test_obs, rewards, dones, info = test_env.step(action)
        if i == (len(environment.df.index.unique()) - 2):
            account_memory = test_env.env_method(
                method_name="save_asset_memory")
            actions_memory = test_env.env_method(
                method_name="save_action_memory")
        if dones[0]:
            print("hit end!")
            break
    return account_memory[0], actions_memory[0]
Example #14
File: agent.py  Project: tkxkd0159/torch
def train(is_learn=False, log=False):
    if is_learn:
        model = PPO("MlpPolicy", env, verbose=0)
        model.learn(total_timesteps=20000)
        model.save("ppo_stock")

    else:
        model = PPO.load("ppo_stock")


    obs = env.reset()
    for i in range(2000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if log:
            with open('./log/ppo.txt', 'a') as f:
                f.write(f'{env.env_method("log")[0]}\n')
        env.render()
Example #15
def just_bob():
    for i in [100000, 500000, 1000000, 5000000]:
        start = time.time()
        bob = PPO("CnnPolicy",
                  VectorizedClass(GetBobEnvClass(25), 6),
                  verbose=0).learn(i)
        end = time.time()
        print(
            f"For {i} we took {end-start} and got {evaluate(bob, 25, episodes=100)}"
        )
    exit()

    done = False
    env = GetBobEnvClass(25)()
    obs = env.reset()
    while not done:
        action = bob.predict(obs)
        obs, rew, done, _ = env.step(action[0])
        env.render()
Example #16
def main():
    # Initialize the environment
    env = WebotsStickEnv()
    check_env(env)

    # Train
    model = PPO('MlpPolicy', env, n_steps=2048, verbose=1)
    model.learn(total_timesteps=1e5)

    # Replay
    print('Training is finished, press `Y` for replay...')
    env.wait_keyboard()

    obs = env.reset()
    for t in range(100000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(obs)
        if done:
            obs = env.reset()
Example #17
class AgentDemoWrapper(gym.Wrapper):
    def __init__(self, env, agent_path=None, tempdir_path=None):
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if agent_path is not None:
            load_path = agent_path
            self.alg.set_parameters(load_path, exact_match=True)

        if tempdir_path is None:
            tempdir_path = 'temp'

        try:
            os.mkdir(tempdir_path)
        except:
            pass
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        super(AgentDemoWrapper, self).__init__(env)

    def reset(self):
        obs = self.env.reset()
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return obs, reward, done, info

    def generate_episode_gif(self, init_map):
        images = []
        done = False
        obs = self.env.manual_reset(init_map)
        im = room_to_rgb(obs)
        images.append(im)
        while not done:
            action, _ = self.alg.predict(obs, deterministic=True)
            obs, _, done, _ = self.env.step(action)
            im = room_to_rgb(obs)
            images.append(im)

        im_name = '{}/agent_episode.gif'.format(self.save_dir)
        imageio.mimsave(im_name, images, 'GIF', fps=2)
Example #18
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO('MlpPolicy', env, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e6))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
Example #19
def multiprocessing_example():
    # Multiprocessing: Unleashing the Power of Vectorized Environments

    def make_env(env_id, rank, seed=0):
        """
		Utility function for multiprocessed env.

		:param env_id: (str) the environment ID.
		:param num_env: (int) the number of environments you wish to have in subprocesses.
		:param seed: (int) the inital seed for RNG.
		:param rank: (int) index of the subprocess.
		"""
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use.
    # Create the vectorized environment.
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper which does exactly the previous steps for you.
    # You can choose between 'DummyVecEnv' (usually faster) and 'SubprocVecEnv'.
    #env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #20
def charlie():
    for i in [100000 // 6, 500000 // 6, 1000000 // 6, 5000000 // 6]:
        start = time.time()
        bob = PPO("CnnPolicy",
                  VectorizedClass(GetBobEnvClass(10), 6),
                  verbose=0,
                  n_steps=200)
        charli = PPO("MlpPolicy",
                     CharlieEnv(bob, t=200, maxsize=10),
                     verbose=0,
                     n_steps=1000).learn(i)
        end = time.time()
        print(
            f"For {i} we took {end-start} and got {evaluate(bob, 10, episodes=100)}"
        )
    exit()

    done = False
    env = GetBobEnvClass(25)()
    obs = env.reset()
    while not done:
        action = bob.predict(obs)
        obs, rew, done, _ = env.step(action[0])
        env.render()
Example #21
        n_cpu = 6
        batch_size = 64
        env = make_vec_env("highway-fast-v0",
                           n_envs=n_cpu,
                           vec_env_cls=SubprocVecEnv)
        model = PPO(
            "MlpPolicy",
            env,
            policy_kwargs=dict(net_arch=[dict(pi=[256, 256], vf=[256, 256])]),
            n_steps=batch_size * 12 // n_cpu,
            batch_size=batch_size,
            n_epochs=10,
            learning_rate=5e-4,
            gamma=0.8,
            verbose=2,
            tensorboard_log="highway_ppo/")
        # Train the agent
        model.learn(total_timesteps=int(2e4))
        # Save the agent
        model.save("highway_ppo/model")

    model = PPO.load("highway_ppo/model")
    env = gym.make("highway-fast-v0")
    for _ in range(5):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
Example #22
done = False
reward = 0
evasions = 0
evasion_history = {}

# Train the agent
agent = PPO("MlpPolicy", env, verbose=1)
agent.learn(total_timesteps=2500)


# Test the agent
for i in range(episode_count):
    ob = env.reset()
    sha256 = env.env.sha256
    while True:
        action, _states = agent.predict(ob)
        ob, rewards, done, ep_history = env.step(action)
        if done and rewards >= 10.0:
            evasions += 1
            evasion_history[sha256] = ep_history
            break

        elif done:
            break

# Output metrics/evaluation stuff
evasion_rate = (evasions / episode_count) * 100
mean_action_count = np.mean(env.get_episode_lengths())
print(f"{evasion_rate}% samples evaded model.")
print(f"Average of {mean_action_count} moves to evade model.")
Example #23
            raise ValueError(f'Unrecognized action {action}')

        self._state = np.clip(self._state, 0, self._grid_size - 1)
        done = bool(self._state == self._grid_size - 1)
        reward = 1 if done else 0
        return np.array([self._state]).astype(np.float32), reward, done, {}

    def reset(self):
        self._state = 0
        return np.array([self._state]).astype(np.float32)

    def render(self, mode='human'):
        pass


if __name__ == '__main__':
    check_env(GridWorld(10))
    env = make_vec_env(lambda: GridWorld(10), n_envs=1)

    model = PPO('MlpPolicy', env, verbose=1).learn(5000)

    state = env.reset()
    for _ in range(20):
        action, _ = model.predict(state, deterministic=True)
        # action = 0
        next_state, reward, done, info = env.step(action)
        print(f'{state} -> {action} -> {next_state}: {reward}')
        state = next_state
        if done:
            break
Example #24
    model = PPO.load('../model/ppo', env=env)
    result = {}

    mean_reward = []
    scores = []

    episodes = 1000
    with open("../result/PPO.txt", "w") as txtfile:
        for episode in range(1, episodes + 1):
            print(f"episode: {episode}")
            state = env.reset()
            done = False
            temp_result = {}
            score = 0
            while not done:
                action, _states = model.predict(state)
                n_state, reward, done, info = env.step(action)
                score += reward
            mean_reward.append(score)
            scores.append(info[0]['score'])

            temp = str(episode) + "," + str(score[0]) + "," + str(
                info[0]['score']) + "\n"
            txtfile.write(temp)

            mean = sum(mean_reward) / len(mean_reward)
            mean_score = sum(scores) / len(scores)

            print(f"The mean reward is {mean}")
            print(f"The mean score reward is {mean_score}")
            print(f"The max score is {max(scores)}")
Example #25
                                                    num_steps=num_gen_steps,
                                                    num_boxes=num_boxes,
                                                    second_player=False)
                _, state, _ = fix_room
            except:
                success = False
        for i in range(len(version_li)):
            version = version_li[i]
            load_path = '{}/agent_v{}.zip'.format(load_dir, version)
            agent.set_parameters(load_path, exact_match=True)
            # agent = agent_li[i]
            done = False
            obs = np.expand_dims(soko_env.env_method('manual_reset', state)[0],
                                 axis=0)
            while not done:
                action, _ = agent.predict(obs, deterministic=True)
                obs, _, done, info = soko_env.step(action)

            # solved
            if info[0]["all_boxes_on_target"]:
                num_solved_li[i] += 1
                if unique_solver_idx == -1:
                    unique_solver_idx = i
                else:
                    unique_solver_idx = -1

        if unique_solver_idx != -1:
            num_unique_solved_li[unique_solver_idx] += 1

    for i in range(len(version_li)):
        print('{} solved {}, uniquely solved {}'.format(
Example #26
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)

print(mean_reward)
print(std_reward)

render_env = base_env.copy().parallel_env()
render_env = ss.color_reduction_v0(render_env, mode='B')
render_env = ss.resize_v0(render_env, x_size=84, y_size=84)
render_env = ss.frame_stack_v1(render_env, 3)

obs_list = []
i = 0
render_env.reset()


while True:
    for agent in render_env.agent_iter():
        observation, _, done, _ = render_env.last()
        action = model.predict(observation, deterministic=True)[0] if not done else None

        render_env.step(action)
        i += 1
        if i % (len(render_env.possible_agents)) == 0:
            obs_list.append(np.transpose(render_env.render(mode='rgb_array'), axes=(1, 0, 2)))
    render_env.close()
    break

print('Writing gif')
write_gif(obs_list, 'kaz.gif', fps=15)
Example #27
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #28
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1)

    print(mean_reward)
    print(std_reward)

    # Maximum number of steps before reset, +1 because I'm scared of OBOE
    print("Starting rendering")
    num_steps = (max_time // delta_time) + 1

    obs = env.reset()

    if os.path.exists("temp"):
        shutil.rmtree("temp")

    os.mkdir("temp")
    # img = disp.grab()
    # img.save(f"temp/img0.jpg")

    img = env.render()
    for t in trange(num_steps):
        actions, _ = model.predict(obs, state=None, deterministic=False)
        obs, reward, done, info = env.step(actions)
        img = env.render()
        img.save(f"temp/img{t}.jpg")
    
    subprocess.run(["ffmpeg", "-y", "-framerate", "5", "-i", "temp/img%d.jpg", "output.mp4"])

    print("All done, cleaning up")
    shutil.rmtree("temp")
    env.close()
Example #29
import gym

from stable_baselines3 import PPO

env = gym.make("CartPole-v1")

model = PPO("MlpPolicy", env, verbose=1)
import ipdb
ipdb.set_trace()
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()

env.close()
Example #30
class ALGDemoWrapper(gym.Wrapper):
    def __init__(self, env, alg_path=None, alg_version=0, tempdir_path=None):
        self.alg = PPO('MlpPolicy', env, verbose=0)
        if alg_path is not None:
            load_path = alg_path + str(alg_version)
            self.alg.set_parameters(load_path, exact_match=True)

        if tempdir_path is None:
            tempdir_path = 'temp'

        try:
            os.mkdir(tempdir_path)
        except:
            pass
        self.save_dir = tempdir_path
        self.max_attempt = 1000
        self.version = alg_version
        super(ALGDemoWrapper, self).__init__(env)

    def reset(self):
        obs = self.env.reset()
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return obs, reward, done, info

    def generate_level(self):
        while True:
            done = False
            obs = self.env.reset()
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)

            if info['fail_type'] == -1:
                return obs

    def generate_episode_gif(self):
        attempt = 0
        while True:
            images = []
            done = False
            obs = self.env.reset()
            im = room_to_rgb(obs)
            images.append(im)
            while not done:
                action, _ = self.alg.predict(obs, deterministic=True)
                obs, _, done, info = self.env.step(action)
                im = room_to_rgb(obs)
                images.append(im)

            if info['train_result'] == 0:
                im_name = '{}/alg_episode_v{}.gif'.format(
                    self.save_dir, self.version)
                imageio.mimsave(im_name, images, 'GIF', fps=2)
                return True, obs

            attempt += 1
            if attempt >= self.max_attempt:
                print('Time out. Wasn\'t able to generate good map.')
                return False, None