Example #1
def main(env_id, render, num_process, lr_p, lr_v, gamma, tau, epsilon,
         batch_size, ppo_mini_batch_size, ppo_epochs, max_iter, eval_iter,
         save_iter, model_path, log_path, seed):
    base_dir = log_path + env_id + "/PPO_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    ppo = PPO(env_id=env_id,
              render=render,
              num_process=1,
              min_batch_size=batch_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              tau=tau,
              clip_epsilon=epsilon,
              ppo_epochs=ppo_epochs,
              ppo_mini_batch_size=ppo_mini_batch_size,
              seed=seed,
              model_path='trained_models')

    for i_iter in range(1, 6):
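        # Evaluation only: there is no ppo.learn() call here, so the loop just renders a few evaluation rollouts.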

        ppo.eval(i_iter, render=True)

        torch.cuda.empty_cache()
Example #2
class PPORunner(Runner):
    def __init__(self, env_name, algo_params, runner_params):
        super(PPORunner, self).__init__(env_name, 'PPO', algo_params, runner_params)

    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = PPO(n_state, n_action, self._algo_params)
        self._score = 0.0

    def _episode_sim(self, n_epi):
        s = self._env.reset()
        done = False
        while not done:
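            # Collect up to t_horizon transitions, then (when training) update the network before continuing the episode.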
            for t in range(self._algo.t_horizon):
                prob = self._algo.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = self._step_wrapper(self._env.step(a))
                
                if self._train:
                    self._algo.put_data((s, a, r/self._reward_scale, s_prime, prob[a].item(), done))
                    
                s = s_prime
                self._score += r

                if done:
                    return
                    
            if self._train:
                self._algo.train_net()
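
The examples on this page call update methods such as train_net(), ppo.update(...), or model.learn(...) without showing what those calls compute. As a point of reference, here is a minimal, generic sketch of the clipped surrogate loss at the core of a PPO update; the function name and signature below are illustrative and are not taken from any of these repositories.

# Illustrative sketch only; this is not the train_net() of the example above.
import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_epsilon=0.2):
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed from stored log-probabilities.
    ratio = torch.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    # PPO maximizes the minimum of the two surrogates, so the loss is its negation.
    return -torch.min(unclipped, clipped).mean()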
Example #3
def train(env, hyperparameters, actor_model, critic_model):
    """
		Trains the model.
		Parameters:
			env - the environment to train on
			hyperparameters - a dict of hyperparameters to use, defined in main
			actor_model - the actor model to load in if we want to continue training
			critic_model - the critic model to load in if we want to continue training
		Return:
			None
	"""
    print(f"Training", flush=True)

    # Create a model for PPO.
    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

    # Tries to load in an existing actor/critic model to continue training on
    if actor_model != '' and critic_model != '':
        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
        model.actor.load_state_dict(torch.load(actor_model))
        model.critic.load_state_dict(torch.load(critic_model))
        print(f"Successfully loaded.", flush=True)
    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
        print(
            f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!"
        )
        sys.exit(0)
    else:
        print(f"Training from scratch.", flush=True)

    # Train the PPO model with a specified total timesteps
    # NOTE: You can change the total timesteps here, I put a big number just because
    # you can kill the process whenever you feel like PPO is converging
    model.learn(total_timesteps=200_000_000)
Example #4
def main(config):
    # wandb.init(project='rl', config=config)
    # wandb.save(str(pathlib.Path(wandb.run.dir) / '*.t7'))
    # wandb.run.summary['step'] = 0

    trainer = PPO(**config)

    sampler = RaySampler(config['track'])
    replay = ReplayBuffer(config['max_frames'])

    for epoch in range(config['max_epoch'] + 1):
        # wandb.run.summary['epoch'] = epoch

        for rollout_batch in sampler.get_samples(trainer.get_policy(epoch),
                                                 **config):
            for rollout, _ in rollout_batch:
                for data in rollout:
                    replay.add(data)

        print([x.r[0] for x in rollout])
        metrics = trainer.train(replay)

        # wandb.log(metrics, step=wandb.run.summary['step'])

        if epoch % 50 == 0:
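            # NOTE: wandb.run.dir below requires the wandb.init call above to be uncommented.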
            torch.save(trainer.actor.state_dict(),
                       pathlib.Path(wandb.run.dir) / ('model_%03d.t7' % epoch))
Example #5
def train():
    env = gym.make(GAME).unwrapped
    all_ep_r = []
    memory = rpm(1000000)

    agent = PPO(state_space=S_DIM, action_space=A_DIM, max_episode_num=EP_MAX, episode_lens=EP_LEN,
                discount_factor=GAMMA, actor_learning_rate=A_LR, critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE, epochs=EPOCHS)

    # load weights
    # agent.load_weights(SAVE_INDEX)

    # run(env, agent)
    for i in range(EP_MAX):
        [steps, episode_r, c_time, aloss, closs] = execute_one_episode(env, agent, memory)
        print('Ep: %4d' % i, "|Ep_r: %i" % episode_r, '|aloss: %8.4f' % aloss, '|closs: %8.4f' % closs,
              '|steps: %4d' % steps, '|time: %6.4f' % c_time)

        if i == 0:
            all_ep_r.append(episode_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + episode_r * 0.1)

    # create_path('weights/' + SAVE_INDEX)
    agent.save_weights(SAVE_INDEX)

    plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')

    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()
Example #6
def main():
    #env = FourRoomsEnv(goal_pos=(12, 16))
    from gym_minigrid.envs import EmptyEnv5x5
    env = EmptyEnv5x5()
    #env = GridWorldMDP()
    torch.manual_seed(config.seed)
    print(config.agent)
    #env.seed(config.seed)
    env = MiniGridWrapper(env)
    model = PPO(action_space=env.action_space.n,
                observation_space=env.observation_space.shape[0],
                h_dim=config.h_dim)
    # dtm = datetime.now().strftime("%d-%H-%M-%S-%f")
    # writer = tb.SummaryWriter(log_dir=f"logs/{dtm}_as_ppo:{config.as_ppo}")
    for global_step in itertools.count():
        batch, info = gather_trajectories(env, model, config.horizon)
        config.tb.add_scalar("return",
                             info["env/returns"],
                             global_step=global_step)
        losses = model.train_net(batch)
        model.data.clear()
        for k, v in losses.items():
            config.tb.add_scalar(k, v, global_step=global_step)
        if global_step % config.save_interval == 0:
            log_dir = config.tb.add_object('model',
                                           model,
                                           global_step=global_step)
            # eval_policy(log_dir=log_dir)
        if (global_step * config.horizon) > config.max_steps:
            break

    env.close()
Example #7
def worker_policy(args, manager, config):
    init_logging_handler(args.log_dir, '_policy')
    agent = PPO(None, args, manager, config, 0, pre=True)

    best = float('inf')
    for e in range(args.epoch):
        agent.imitating(e)
        best = agent.imit_test(e, best)
Example #8
def train_ppo(df, df_dense, df_wide, df_fail, state_dim, action_dim, lr, betas,
              gamma, epochs, model_path):
    memory = Memory()
    n_latent_var = [128, 32]
    K_epochs = 4
    eps_clip = 0.2
    update_timestep = 2000
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    timestep = 0

    loss_file = open(model_path + 'loss.txt', 'a')

    for epoch in range(epochs):
        print("epoch start:" + str(epoch) + '\n')
        moving_loss = 0
        cnt = 0
        for index in range(df.shape[0]):
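            # Replay logged transitions row by row; ppo.update runs every update_timestep steps.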
            timestep += 1
            row = df.iloc[index]
            state_dense = df_dense[index]
            state_wide = df_wide[index]
            fail_state = df_fail[index]
            state = np.concatenate((state_dense, state_wide, fail_state))
            action = row['action']
            reward = row['reward']
            done = row['done']
            ppo.policy_old.act(state, action, reward, done, memory)

            if timestep % update_timestep == 0:
                loss = ppo.update(memory)
                memory.clear_memory()
                timestep = 0
                moving_loss += np.mean(loss)
                cnt += 1

        loss_file.write(
            str(epoch) + '-th round loss: ' +
            str(round(moving_loss / cnt, 4)) + '\n')
        loss_file.flush()
        torch.save(
            ppo.policy.action_layer.state_dict(),
            model_path + 'ppo_20191009_20191021_action_layer' + str(epoch) +
            '-th_epoch.pkl')
        torch.save(
            ppo.policy.value_layer.state_dict(),
            model_path + 'ppo_20191009_20191021_value_layer' + str(epoch) +
            '-th_epoch.pkl')

    gc.collect()
    loss_file.close()

    return ppo.policy.action_layer.cpu().eval()
Example #9
def main(env_id, dim_latent, render, num_process, lr_p, lr_v, gamma, tau,
         epsilon, batch_size, ppo_mini_batch_size, ppo_epochs, max_iter,
         eval_iter, save_iter, model_path, log_path, seed):
    base_dir = log_path + env_id + "/PPO_encoder_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    ppo = PPO(
        env_id=env_id,
        dim_latent=dim_latent,
        render=render,
        num_process=20,  #cpu_count(),
        min_batch_size=batch_size,
        lr_p=lr_p,
        lr_v=lr_v,
        gamma=gamma,
        tau=tau,
        clip_epsilon=epsilon,
        ppo_epochs=ppo_epochs,
        ppo_mini_batch_size=ppo_mini_batch_size,
        seed=seed)

    for i_iter in range(1, max_iter + 1):
        ppo.learn(writer, i_iter)

        if i_iter % eval_iter == 0:
            ppo.eval(i_iter, render=render)

        if i_iter % save_iter == 0:
            ppo.save(model_path)

            pickle.dump(
                ppo,
                open('{}/{}_ppo_encoder.p'.format(model_path, env_id), 'wb'))

        torch.cuda.empty_cache()
Example #10
def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                       episode_life=True,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape

    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input = tf.placeholder(tf.float32, [None, *input_shape])

            model = PPO(sess,
                        input,
                        models.nature_cnn(input),
                        actiontype.Discrete,
                        output_size,
                        learning_rate=lambda f: 2.5e-4 * (1 - f),
                        epochs=4,
                        minibatch_size=4,
                        gamma=0.99,
                        beta2=0.01,
                        name='Breakout_lr')
        train(sess,
              model,
              env_name,
              1e7,
              256,
              log_interval=5,
              num_envs=16,
              atari=True)
        #run_only(sess, model, env, render=True)
        env.close()
Example #11
    def __init__(self, observation_space, action_space):
        self.k = 10
        self.actions = HUMAN_ACTIONS
        self.action_space = gym.spaces.Discrete(NUM_ACTIONS)
        shape = observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0,
                                                high=1.0,
                                                shape=(1, shape[0], shape[1]),
                                                dtype=np.uint8)
        env_shape = self.observation_space.shape
        state_dim = np.prod(env_shape)
        self.state_dim = state_dim

        self.action_dim = self.action_space.n
        self.agent = PPO(
            self.state_dim * self.k,
            self.action_dim,
            n_latent_var=600,
            betas=(0.9, 0.999),
            lr=1e-4,
            K_epochs=8,
            gamma=0.99,
            eps_clip=0.2,
        )
        self.agent.policy.load_state_dict(
            torch.load("results/experiment_1/checkpoint_210_eps.pth",
                       map_location=torch.device(device)))
        self.framestack = None
Example #12
def maml_initialize(starting_policy, env_fn, n, n_inner, alpha_inner, **ppo_params):
    timesteps = ppo_params.get('update_interval') * max(n_inner, 1)
    ppo_params['lr'] = alpha_inner
    library = []
    gradients = []
    env = env_fn(seed=ppo_params.get('seed'))
    for i in range(n):
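        # Adapt a copy of the starting policy on a freshly randomized task and keep its weights and gradients.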
        env.randomize()
        env.reset()
        agent = PPO(env, **ppo_params)
        agent.policy.load_state_dict(copy_tensor(starting_policy))
        agent.learn(timesteps, track_higher_gradients=True)
        library.append(agent.policy.state_dict())
        gradients.append(get_gradients(agent.meta_policy.parameters(),
                                       agent.meta_policy.parameters(time=0)))
    return library, gradients
Example #13
    def initialize_rl_alg(self, args):
        hyperparams = {
            'optim_epochs': self.args.ppo_optim_epochs,
            'minibatch_size': self.args.ppo_minibatch_size,
            'gamma': self.args.gamma,
            'value_iters': self.args.ppo_value_iters,
            'clip_epsilon': self.args.ppo_clip,
            'entropy_coeff': self.args.entropy_coeff,
        }

        self.rl_alg = PPO(policy=self.policy,
                          policy_optimizer=self.policy_optimizer,
                          valuefn=self.valuefn,
                          value_optimizer=self.value_optimizer,
                          replay_buffer=self.replay_buffer,
                          **hyperparams)
Example #14
def create_model(sess, name):
    with tf.variable_scope(name):
        input = tf.placeholder(tf.float32, [None, 12])
        initializer = tf.orthogonal_initializer(np.sqrt(2)) #Orthogonal initializer
        network = add_dense(input, 32, activation=tf.nn.tanh, kernel_initializer=initializer, name="dense1")
        network = add_dense(network, 32, activation=tf.nn.tanh, kernel_initializer=initializer, name="dense2")
        return PPO(sess, input, network, actiontype.Continuous, 2, epochs=10, minibatch_size=32, gamma=0.99, beta2=0.00, epsilon=0.2,\
            learning_rate=lambda f : 3e-4*(1-f), name=name)
Example #15
def main():
    torch.set_default_tensor_type('torch.DoubleTensor')

    batchsz = 2048
    ppo = PPO(make_env, 10)

    # load model from checkpoint
    ppo.load()
    # comment this line to close the evaluation thread, to speed up the training process.
    ppo.render(2)

    for i in range(10000):

        ppo.update(batchsz)

        if i % 100 == 0 and i:
            ppo.save()
Example #16
def worker_estimator(args, manager, config, make_env):
    init_logging_handler(args.log_dir, '_estimator')
    agent = PPO(make_env, args, manager, config, args.process, pre_irl=True)
    agent.load(args.save_dir + '/best')

    best0, best1 = float('inf'), float('inf')
    for e in range(args.epoch):
        agent.train_irl(e, args.batchsz_traj)
        best0 = agent.test_irl(e, args.batchsz, best0)
        best1 = agent.imit_value(e, args.batchsz_traj, best1)
Example #17
def main():
    env = gym.make('CartPole-v1')
    model = PPO()
    saved_model = torch.load('models/ppo_model9500.pt')

    model.load_state_dict(saved_model)

    while True:
        score = 0
        s = env.reset()

        for i in range(200):
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)

            model.put_data((s, a, r / 100.0, s_prime, prob[a].item(), done))
            s = s_prime

            score += r
            env.render()
            if done:
                break

        print('score = {}'.format(score))
Example #18
def main(config):
    parser = get_parser()
    argv = sys.argv[1:]
    args, _ = parser.parse_known_args(argv)

    init_logging_handler(config.log_dir)
    logging.info(args)
    config = update_cfg(config, args)

    logging.info("Start initializing")
    irl_model = RewardModule(config).to(device=device)   # this is the reward model only, which will be fed to RewardEstimator.
    reward_agent = RewardEstimator(config=config, irl_model=irl_model)
    
    user_policy = ActorCriticDiscrete(config).to(device=device)
    user_policy = init_net(user_policy)
    user_ppo = PPO(config, user_policy)

    system_policy = ActorCriticContinuous(config).to(device=device)
    system_policy = init_net(system_policy)

    init_system_policy = ActorCriticContinuous(config).to(device=device)
    init_system_policy.load_state_dict(system_policy.state_dict())

    system_ppo = PPO(config, system_policy, init_policy=init_system_policy)

    # reward_true = RewardTruth(config).to(device=device)  # this is the ground truth which will not be updated once randomly initialized.
    reward_true = RewardTruthSampled(config).to(device)
    reward_true = init_net(reward_true)
    logging.info("Finish building module: reward agent, user ppo, system ppo")

    main_agent = InteractAgent(config=config,
                               user_agent=user_ppo,
                               user_reward=reward_agent,
                               system_agent=system_ppo,
                               reward_groundtruth=reward_true
                               )
    
    for e_id in range(config.master_epochs):
        main_agent.master_train(e_id)
    # for _ in range(3):
        # main_agent.system_train()
    # raise ValueError("stop here")
    logging.info("@@@@@@@@@@  Finished  @@@@@@@@@@@")
Example #19
    def __init__(self, traci, is_not_train=False):
        self.controlTLIds = traci.trafficlight.getIDList()  # tuple ('0',)
        self.controlTLIds = self.controlTLIds[0]  # string '0'
        self.phaseDefs = ['GrrrGrrr', 'rGrrrGrr', 'rrGrrrGr', 'rrrGrrrG']
        self.yelloPhases = ['yrrryrrr', 'ryrrryrr', 'rryrrryr', 'rrryrrry']
        action_mask = [1, 1, 1, 1]
        self.detectorIDs = traci.inductionloop.getIDList()
        self.controlLanes = get_laneID(self.detectorIDs)
        self.reset()
        state_size = len(self.state)
        self.learner = Learner(state_size, action_mask, is_not_train)
        return
Example #20
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1.Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2.Create actor, critic, EnvSampler() and PPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size, action_size, hidden_sizes=args.hidden_sizes)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    ppo = PPO(actor, 
              critic, 
              clip=args.clip, 
              gamma=args.gamma, 
              tau=args.tau, 
              target_kl=args.target_kl, 
              device=device,
              pi_steps_per_update=args.pi_steps_per_update,
              value_steps_per_update=args.value_steps_per_update,
              pi_lr=args.pi_lr,
              v_lr=args.value_lr)

    # 3.Start training.
    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes+1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = ppo.update(*samples)
        yield episode*args.batch_size, episode_reward, actor_loss, value_loss
Example #21
def train():
    g_exit = GracefulExit()
    timestamp = datetime.datetime.utcnow().strftime(TIMESTAMP_FORMAT)
    logger = Logger(ENV_NAME, timestamp)
    env = gym.make(ENV_NAME)
    dim_obs = env.observation_space.shape[0] + 1
    dim_act = env.action_space.shape[0]
    scaler = VecScaler(dim_obs)
    rec_dir = os.path.join(REC_DIR, ENV_NAME, timestamp)
    env = gym.wrappers.Monitor(env, rec_dir, force=True)
    agent = PPO(dim_obs, dim_act, GAMMA, LAMBDA, CLIP_RANGE, LR_POLICY,
                LR_VALUE_F, logger)
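    # A short warm-up batch, presumably to initialize the observation scaler before training.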
    run_batch(env, agent.policy, 5, scaler)
    episode = 0
    while episode < NUM_EPISODES:
        batch_size = min(MAX_BATCH, NUM_EPISODES - episode)
        trajectories, steps, mean_return = run_batch(env, agent.policy, batch_size, scaler)
        episode += batch_size
        logger.log({'_time': datetime.datetime.utcnow().strftime(TIMESTAMP_FORMAT),
                    '_episode': episode,
                    'steps': steps,
                    '_mean_return': mean_return})
        agent.update(trajectories)
        logger.write()
        if g_exit.exit:
            break
    agent.close()
    logger.close()
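
Many of the constructors above take both a discount factor (gamma / GAMMA) and a GAE coefficient (tau, LAMBDA, or gae). The sketch below shows how generalized advantage estimation is typically computed over a single rollout; the helper name and argument layout are assumptions for illustration and do not come from any of these repositories.

# Illustrative sketch only; not the advantage code of any example on this page.
import numpy as np

def compute_gae(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    # Backward recursion: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    #                      A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
        next_value = values[t]
    # Value-function targets are the advantages plus the value baseline.
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns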
Example #22
def main(_):

    tf.Session().__enter__()
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    def make_env(seed):
        def _make_env():
            env = gym.make(FLAGS.env)
            env.seed(seed)
            env.allow_early_resets = True
            env = gym_wrapper.StackObs(env)

            return env

        return _make_env

    try:
        env = gym_wrapper.Workers(
            [make_env(_) for _ in [random.randint(0, 1000)] * FLAGS.nenvs])

        ppo = PPO(env,
                  nsteps=FLAGS.nsteps,
                  learning_rate=FLAGS.lr,
                  clip_range=FLAGS.cr,
                  max_steps=FLAGS.max_steps,
                  mb_size=FLAGS.mb_size,
                  opteps=FLAGS.opteps,
                  gae=FLAGS.gae,
                  gamma=FLAGS.gamma,
                  vf_coef=FLAGS.vf_coef,
                  ent_coef=FLAGS.ent_coef,
                  normalize_observations=FLAGS.normalize_obs)
        ppo.run()
        env.close()

    except KeyboardInterrupt:
        env.close()
Example #23
def main():
    # parse args
    args = option.args

    # worker device
    if args.backend == 'cpu':
        args.worker_device = "/cpu:0"
    else:
        gpu_id = args.index % args.gpu_count
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        args.worker_device = "/gpu:0"

    # start session
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=tf.GPUOptions(allow_growth=True),
    )
    sess = tf.Session(config=config)
    sess.__enter__()

    # create env
    env = gym.make(args.env)
    env = wrap_train(env)

    # create ppo
    #with tf.device(args.worker_device):
    ppo = PPO(env.observation_space,
              env.action_space,
              cnn_model_func,
              clip_param=0.2,
              entcoeff=0.01)

    # create worker
    if args.mode == 'train':
        worker = TrainWorker(env,
                             ppo,
                             args.render,
                             args.index == 0,
                             train_data_size=256,
                             optimize_size=64,
                             optimize_epochs=4,
                             gamma=0.99,
                             lambda_=0.95,
                             max_steps=1e6)

    else:
        pass

    # start worker
    worker()
Example #24
def rank_policies(memory: Memory, library, **ppo_params):
    agent = PPO(None, **ppo_params)
    # pylint: disable=not-callable
    returns = torch.tensor(memory.returns).float().to(DEVICE).detach()
    # returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    states = torch.tensor(memory.states).float().to(DEVICE).detach()
    actions = torch.tensor(memory.actions).float().to(DEVICE).detach()
    vals = []
    for params in library:
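        # Score each stored policy by the return-weighted probability it assigns to the logged actions.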
        agent.policy.load_state_dict(params)
        logp, _, _ = agent.policy.evaluate(states, actions)
        p = torch.exp(logp)
        vals.append(torch.sum(p * returns).item())
    return np.argsort(vals)[::-1], np.asarray(vals)
Example #25
def train(env, hyperparameters, actor_model, critic_model, datapath, exp_name):
    print(f"Training", flush=True)

    # Create a model for PPO.
    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

    # Tries to load in an existing actor/critic model to continue training on
    if actor_model != '' and critic_model != '':
        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
        model.actor.load_state_dict(torch.load(datapath + "/" + actor_model))
        model.critic.load_state_dict(torch.load(datapath + "/" + critic_model))
        print(f"Successfully loaded.", flush=True)
    elif actor_model != '' or critic_model != '':
        print(
            f"Error: Specify both actor/critic models or none at all to avoid accidental override"
        )
        sys.exit(0)
    else:
        print(f"Training from scratch.", flush=True)
        ymd_time = time.strftime("%m-%d-%H-%M_")
        relpath = ''.join([ymd_time, exp_name])
        datapath = osp.join(datapath, relpath)
        if not os.path.exists(datapath): os.makedirs(datapath)
    model.learn(total_timesteps=1_000_000, logpath=datapath)
Example #26
def create_policy(policy_type='rand', board_size=8, seed=0, search_depth=1):
    if policy_type == 'rand':
        policy = simple_policies.RandomPolicy(seed=seed)
    elif policy_type == 'greedy':
        policy = simple_policies.GreedyPolicy()
    elif policy_type == 'maximin':
        policy = simple_policies.MaxiMinPolicy(search_depth)
    elif policy_type == 'human':
        policy = simple_policies.HumanPolicy(board_size)
    elif policy_type == 'dqn':
        policy = DQN('dqn', board_size)
    elif policy_type == 'ppo':
        policy = PPO('ppo', board_size)
    else:
        raise ValueError('Unknown policy_type: {}'.format(policy_type))

    return policy
Example #27
def run_ppo(agent: PPO, render: bool = True):
    env = gym.make("CartPole-v1")
    draw = env.render if render else lambda:...

    # Train forever.
    while True:
        next_state = env.reset()
        reward = 0
        done = False
        while True:
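            # train_step consumes the previous reward/done and returns the next action to take.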
            action = agent.train_step(state=next_state,
                                      reward=reward,
                                      episode_ended=done)
            if done:
                break
            next_state, reward, done, info = env.step(action)
            draw()
Example #28
def get_algorithm(*argv, **kwargs):

    if args.algorithm == 'pg':
        return PG(*argv, **kwargs)
    if args.algorithm == 'ddpg':
        return DDPG(*argv, **kwargs)
    if args.algorithm == 'td3':
        return TD3(*argv, **kwargs)
    if args.algorithm == 'rbi':
        return RBI(*argv, **kwargs)
    if args.algorithm == 'drbi':
        return DRBI(*argv, **kwargs)
    if args.algorithm == 'ppo':
        return PPO(*argv, **kwargs)
    if args.algorithm == 'sacq':
        return SACQ(*argv, **kwargs)
    if args.algorithm == 'sspg':
        return SSPG(*argv, **kwargs)
    raise NotImplementedError
Example #29
def main(algo):
    seed = 7
    path = 'model_checkpoints/ppo.ckpt'

    # Load the ENV
    # env = UnityEnv(env_file='Environments/Reacher_Linux_one/Reacher.x86_64',no_graphics=True)
    env = UnityEnv(env_file='Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=True)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    config = Config(algo)

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        device2 = torch.device("cuda:1")
    agent = PPO(action_size, state_size, seed, device, config)
    #     try:
    #     except:
    #         device = torch.device("cuda:0")
    # else:
    #     device = torch.device('cpu')
    # try:
    #     agent_a = PPO(action_size,state_size,seed,device,config)
    #     agent_b = PPO(action_size,state_size,seed,device2,config)
    #     print('Double GPU')
    # except:
    #     print('Single GPU')
    #     agent_a = PPO(action_size,state_size,seed,device,config)
    #     agent_b = PPO(action_size,state_size,seed,device,config)

    train_ppo(env, agent, EPISODES, path)
Example #30
def main():
    env = gym.make('AntBulletEnv-v0')
    output_size = env.action_space.shape[0]
    with tf.Session() as sess:
        name = 'ant_5m'
        with tf.variable_scope(name):
            input = tf.placeholder(tf.float32,
                                   [None, env.observation_space.shape[0]])
            initializer = tf.orthogonal_initializer(
                np.sqrt(2))  #Orthogonal initializer
            network = add_dense(input,
                                64,
                                activation=tf.nn.tanh,
                                kernel_initializer=initializer,
                                name="dense1")
            network = add_dense(network,
                                64,
                                activation=tf.nn.tanh,
                                kernel_initializer=initializer,
                                name="dense2")

            model = PPO(sess,
                        input,
                        network,
                        actiontype.Continuous,
                        output_size,
                        epochs=10,
                        minibatch_size=32,
                        gamma=0.99,
                        beta2=0.000,
                        epsilon=0.2,
                        learning_rate=lambda f: 3e-4 * (1 - f),
                        name=name)
        train(sess,
              model,
              'AntBulletEnv-v0',
              1000000,
              2048,
              num_envs=16,
              log_interval=5)
        run_only(sess, model, env)
        env.close()