Example #1
    def __init__(self, env, gamma):
        self.env = env
        self.env.seed(543)
        torch.manual_seed(543)
        self.policy_model = PolicyModel()
        self.optimizer = optim.Adam(self.policy_model.parameters(), lr=0.009)
        self.gamma = gamma
        self.eps = np.finfo(np.float32).eps.item()
        self.loss_list = []
        self.ep_no_list = []
Example #2
    def __init__(self):
        """
            init
        """
        self.parse_args = self._init_parser()
        self.bl_decay = self.parse_args.bl_decay
        self.log_dir = self.parse_args.log_dir
        self.early_stop = self.parse_args.early_stop
        self.data_path = self.parse_args.data_path
        self.num_models = self.parse_args.num_models
        self.batch_size = self.parse_args.batch_size
        self.chunk_size = self.parse_args.chunk_size

        self._init_dir_path()
        self.model = PolicyModel(self.parse_args)
        algo_hyperparas = {'lr': self.parse_args.learning_rate}
        self.algorithm = ReinforcePolicyGradient(self.model,
                                                 hyperparas=algo_hyperparas)
        self.autodl_agent = AutoDLAgent(self.algorithm, self.parse_args)
        self.total_reward = 0
Example #3
#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap: env = env.unwrapped
a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_high = env.action_space.high[0]
s_dim = env.observation_space.shape[0]

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

#Use a small fixed log-std so the Gaussian policy acts nearly deterministically while playing
logstd = np.zeros((1, a_dim), dtype=np.float32)
logstd.fill(-6.0)
Example #4
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
return_ph = tf.placeholder(tf.float32, [None], name="return")
lr_ph = tf.placeholder(tf.float32, [])
clip_ph = tf.placeholder(tf.float32, [])


#Create the model
#----------------------------
config = tf.ConfigProto(
	allow_soft_placement=True,
	intra_op_parallelism_threads=n_env,
	inter_op_parallelism_threads=n_env
)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, "policy")


#Loss
#----------------------------
neg_logprob = policy.distrib.neg_logp(action_ph)

v_pred = policy.value
v_pred_clip = old_v_pred_ph + tf.clip_by_value(v_pred - old_v_pred_ph, -clip_ph, clip_ph)
v_loss1 = tf.square(v_pred - return_ph)
v_loss2 = tf.square(v_pred_clip - return_ph)
v_loss = 0.5 * tf.reduce_mean(tf.maximum(v_loss1, v_loss2))

ratio = tf.exp(old_neg_logprob_ph - neg_logprob)
pg_loss1 = -adv_ph * ratio
pg_loss2 = -adv_ph * tf.clip_by_value(ratio, 1.0 - clip_ph, 1.0 + clip_ph)
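
#Completion sketch (not in the original example)
#----------------------------
#Combine the clipped surrogate with the value loss above and build a training op.
#`total_loss` and `train_op` are illustrative names; the plain sum is an assumption.
pg_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))
total_loss = pg_loss + v_loss
train_op = tf.train.AdamOptimizer(learning_rate=lr_ph).minimize(total_loss)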
Example #5
if "FIRE" in env.unwrapped.get_action_meanings():
    env = env_wrapper.FireResetEnv(env)

env = env_wrapper.WarpFrame(env)
env = env_wrapper.FrameStack(env, n_stack)

a_dim = env.action_space.n
img_height, img_width, c_dim = env.observation_space.shape

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim, a_dim)

#Start training
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")
Example #6
#Create multiple environments
#----------------------------
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.n
img_height, img_width, c_dim = env.ob_space.shape
runner = MultiEnvRunner(env, img_height, img_width, c_dim, n_step, n_stack,
                        gamma)

#Create the model
#----------------------------
config = tf.ConfigProto(intra_op_parallelism_threads=n_env,
                        inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim * n_stack, a_dim)

#Placeholders
#----------------------------
action_ph = tf.placeholder(tf.int32, [None], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None],
                                    name="discounted_return")
lr_ph = tf.placeholder(tf.float32, [])

#Loss
#----------------------------
nll_loss = -policy.cat_dist.log_prob(action_ph)
pg_loss = tf.reduce_mean(adv_ph * nll_loss)
value_loss = tf.reduce_mean(
    tf.squared_difference(tf.squeeze(policy.value), discount_return_ph) / 2.0)
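
#Training op (sketch, not in the original example)
#----------------------------
#One way to combine the losses above into an update; the RMSProp optimizer,
#the 0.5 value coefficient, and `train_op` are assumptions.
total_loss = pg_loss + 0.5 * value_loss
train_op = tf.train.RMSPropOptimizer(learning_rate=lr_ph, decay=0.99).minimize(total_loss)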
Example #7
env_id = args.env
save_dir = "./save_" + env_id

#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap: env = env.unwrapped
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

for it in range(100):
    ob = env.reset()
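    #Play loop sketch (not in the original example). `choose_action` is a
    #hypothetical PolicyModel method; the real interface may differ.
    total_reward = 0.0
    while True:
        env.render()
        action = policy.choose_action(ob)  #hypothetical method name
        ob, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print("Episode {}: reward = {:.2f}".format(it, total_reward))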
Example #8
env = MultiEnv([make_env(i, env_id=env_id, unwrap=args.unwrap) for i in range(n_env)])
s_dim = env.ob_space.shape[0]
a_dim = env.ac_space.n
runner = MultiEnvRunner(env, s_dim, a_dim, n_step, gamma, lamb)


#Create the model
#----------------------------
config = tf.ConfigProto(
	allow_soft_placement=True,
	intra_op_parallelism_threads=n_env,
	inter_op_parallelism_threads=n_env
)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, "policy")
dis = DiscriminatorModel(sess, s_dim, a_dim, name="discriminator")


#Placeholders
#----------------------------
#action_ph:          (mb_size)
#old_neg_logprob_ph: (mb_size)
#old_v_pred_ph:      (mb_size)
#adv_ph:             (mb_size)
#return_ph:          (mb_size)
action_ph = tf.placeholder(tf.int32, [None], name="action")
old_neg_logprob_ph = tf.placeholder(tf.float32, [None], name="old_negative_log_prob")
old_v_pred_ph = tf.placeholder(tf.float32, [None], name="old_value_pred")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
return_ph = tf.placeholder(tf.float32, [None], name="return")
Example #9
class ReinforceCartPole:
    def __init__(self, env, gamma):
        self.env = env
        self.env.seed(543)
        torch.manual_seed(543)
        self.policy_model = PolicyModel()
        self.optimizer = optim.Adam(self.policy_model.parameters(), lr=0.009)
        self.gamma = gamma
        self.eps = np.finfo(np.float32).eps.item()
        self.loss_list = []
        self.ep_no_list = []

    def get_action(self, state):
        state_torch = torch.from_numpy(state).float().unsqueeze(0)

        probs = self.policy_model(state_torch)
        first_action_probability = probs[0][0]
        random_no = random.random()
        if random_no <= first_action_probability:
            action = torch.tensor(0, dtype=torch.long)
        else:
            action = torch.tensor(1, dtype=torch.long)

        m = Categorical(probs)
        # print(action)
        log_prob = m.log_prob(action)
        return action, log_prob

    def get_returns(self, episode_rewards):
        # Compute discounted returns backwards through the episode, then
        # normalize them for numerical stability.
        return_sum = 0.0
        returns = []
        for r in reversed(episode_rewards):
            return_sum = r + (self.gamma * return_sum)
            returns.append(return_sum)

        returns = torch.tensor(list(reversed(returns)))
        returns = (returns - returns.mean()) / (returns.std() + self.eps)

        return returns

    def optimize(self, episode_log_probs, episode_rewards):
        # REINFORCE update: minimize -log_prob(action) * discounted return,
        # summed over the episode.
        returns = self.get_returns(episode_rewards)
        policy_loss = []

        for logp, ret in zip(episode_log_probs, returns):
            policy_loss.append(-logp * ret)

        self.optimizer.zero_grad()
        loss = torch.cat(policy_loss).sum()
        self.loss_list.append(loss.item())
        loss.backward()
        self.optimizer.step()

    def train(self,
              no_episodes,
              limit=4000,
              rendering=False,
              max_steps=500000):
        running_reward = 10.0
        plot_rewards = []
        plot_episode_nos = []
        plot_mean_rewards = []
        plot_mr_epno = []
        for ep in range(1, no_episodes):
            state = self.env.reset()
            episode_rewards = []
            episode_log_probs = []
            ep_reward = 0.0
            for s in range(max_steps):

                action, log_prob = self.get_action(state)
                episode_log_probs.append(log_prob)
                state, r, done, _ = self.env.step(action.item())
                if rendering:
                    self.env.render()
                episode_rewards.append(r)
                ep_reward += r
                # print(next_state)
                # print(r)
                # print('\n')
                if done:
                    # time.sleep(0.5)
                    break

            # Exponential moving average of the episode reward, used as the stopping criterion.
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            plot_rewards.append(ep_reward)
            plot_episode_nos.append(ep)
            if ep > 100:
                plot_mean_rewards.append(
                    sum(plot_rewards[-100:]) / len(plot_rewards[-100:]))
                plot_mr_epno.append(ep)
            self.optimize(episode_log_probs, episode_rewards)
            if ep % 25 == 0:
                print(
                    'Episode Number: {}\t Latest Reward: {:.2f}\tAverage Running reward: {:.2f}'
                    .format(ep, ep_reward, running_reward))
            #if running_reward > self.env.spec.reward_threshold:
            if running_reward > limit:

                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(
                          running_reward, s))
                break

        self.plot(plot_rewards, plot_mean_rewards, plot_mr_epno,
                  plot_episode_nos)

        self.env.close()

    def plot(self, plot_rewards, plot_mean_rewards, plot_mr_epno,
             plot_episode_nos):
        plt.plot(plot_episode_nos, plot_rewards)
        plt.plot(plot_mr_epno, plot_mean_rewards)
        plt.ylabel('Running Rewards')
        plt.xlabel('Episode No')
        plt.legend(['Reward', '100 Episode Mean Reward'], loc='upper left')
        plt.savefig('RewardsVsEpisodeNo.png')

        plt.clf()
        plt.plot(plot_episode_nos, self.loss_list)
        plt.ylabel('Training Loss')
        plt.xlabel('Episode No')
        # plt.legend(['AgentScore'], loc='upper left')
        plt.savefig('LossVsEpisodeNo.png')
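
# Usage sketch (not part of the original example): how the class above might be
# driven, assuming gym is imported alongside torch and numpy. "CartPole-v1",
# gamma, and the episode limit are assumptions.
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    agent = ReinforceCartPole(env, gamma=0.99)
    agent.train(no_episodes=2000, limit=475)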
Example #10
#Create the environment
#----------------------------
env = gym.make(env_id)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_high = env.action_space.high[0]


#Create the model
#----------------------------
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, name="policy")
dis = DiscriminatorModel(sess, s_dim, a_dim, name="discriminator")


#Placeholders
#----------------------------
#action_ph: (mb_size, a_dim)
action_ph = tf.placeholder(tf.float32, [None, a_dim], name="action")
lr_ph = tf.placeholder(tf.float32, [])


#Loss
#----------------------------
loss = tf.reduce_mean(policy.distrib.neg_logp(action_ph))
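
#Training op (sketch, not in the original example)
#----------------------------
#A possible optimizer step for the negative-log-likelihood loss above;
#Adam and `train_op` are assumptions.
train_op = tf.train.AdamOptimizer(learning_rate=lr_ph).minimize(loss)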