Example #1
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.Session, tf.global_variables_initializer)

# The Actor and Critic classes are assumed to be defined elsewhere in the project.


class ActorCritic(object):
    def __init__(self, env):
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())

    def get_action(self, state, episode_percentage):
        # Sometimes pick a random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        # Quadratic decay: epsilon falls from 1.0 at the start of training to 0.0 at the end
        epsilon = -1 * (episode_percentage**2) + 1
        return epsilon

    def update(self, state, action, reward, new_state):
        td_error = self.critic.learn(
            state, reward,
            new_state)  # gradient = grad[r + gamma * V(s_) - V(s)]
        self.actor.learn(
            state, action,
            td_error)  # true_gradient = grad[logPi(s,a) * td_error]

    def get_name(self):
        return 'ActorCritic'
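The Actor and Critic classes used above are not shown on this page. As a rough sketch of the interface the example relies on, the critic is a small state-value network whose learn() method performs one TD(0) update and returns the TD error that is then fed to the actor. A minimal TensorFlow 1.x version might look like the following; the hidden-layer size, the GAMMA constant and the single-sample placeholders are assumptions, not taken from the original project.

import numpy as np
import tensorflow as tf

GAMMA = 0.9  # assumed discount factor


class Critic(object):
    """State-value critic: learn() does one TD(0) update and returns the TD error."""

    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, "reward")

        l1 = tf.layers.dense(self.s, 20, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)

        # TD error: r + gamma * V(s') - V(s)
        self.td_error = self.r + GAMMA * self.v_ - self.v
        loss = tf.square(self.td_error)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    def learn(self, s, r, s_):
        # s and s_ are assumed to be flat numpy feature vectors
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error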
Example #2
import random

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

# The Actor and Critic classes are assumed to be defined elsewhere in the project.


class ActorCriticExperienceReplay(object):
    def __init__(self, env):
        self.MEMORY_SIZE = 200
        self.BATCH_SIZE = 10

        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())

        self.replay_memory = []

    def get_action(self, state, episode_percentage):
        # Sometimes pick a random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        # Quadratic decay: epsilon falls from 1.0 at the start of training to 0.0 at the end
        return -1 * (episode_percentage**2) + 1

    def update(self, state, action, reward, new_state):
        td_error = self.critic.learn(
            state, reward,
            new_state)  # gradient = grad[r + gamma * V(s_) - V(s)]
        self.actor.learn(
            state, action,
            td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        # Add to replay memory, capped at MEMORY_SIZE entries
        self.replay_memory.append((state, action, reward, new_state))
        if len(self.replay_memory) > self.MEMORY_SIZE:
            self.replay_memory.pop(0)

        # Learn from replayed memories
        if np.random.random() < 0.5 and len(
                self.replay_memory) > self.BATCH_SIZE:
            minibatch = random.sample(self.replay_memory, self.BATCH_SIZE)
            for (batch_state, batch_action, batch_reward,
                 batch_new_state) in minibatch:
                td_error = self.critic.learn(batch_state, batch_reward,
                                             batch_new_state)
                self.actor.learn(batch_state, batch_action, td_error)

    def get_name(self):
        return 'ActorCritic_ExperienceReplay'
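Neither example shows how the agent object is driven. A minimal, hypothetical training loop, assuming an OpenAI Gym environment with a one-dimensional continuous action space (the environment name and the episode/step counts below are placeholders), could look like this:

import gym
import numpy as np

env = gym.make('Pendulum-v0')             # placeholder environment
agent = ActorCriticExperienceReplay(env)  # or ActorCritic(env)

n_episodes = 100
for episode in range(n_episodes):
    state = env.reset()
    for step in range(200):
        # episode_percentage drives the epsilon decay in get_exploration_prob()
        action = agent.get_action(state, episode / float(n_episodes))
        # ensure the action is array-shaped before handing it to Gym
        new_state, reward, done, _ = env.step(np.atleast_1d(action))
        agent.update(state, action, reward, new_state)
        state = new_state
        if done:
            break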
Example #3
    def train(self, max_episode=10, max_path_length=200, verbose=0):
        # Assumes numpy (np), os and TensorFlow 1.x (tf) are imported at module
        # level, and that Actor, Critic and bcolors are defined elsewhere.
        env = self.env
        avg_reward_sum = 0.

        for e in range(max_episode):
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            f_episode = "episode_{0}.csv".format(e)
            os.system("rm -rf {0}".format(f_episode))

            print(observation[0].shape, observation[1].shape)

            # Note: the session, actor and critic are rebuilt and re-initialised
            # at the start of every episode.
            sess = tf.Session()

            actor = Actor(sess, n_actions=self.env.action_space.n)

            # We need a good teacher, so the critic should learn faster than the actor.
            critic = Critic(sess, n_actions=self.env.action_space.n)

            sess.run(tf.global_variables_initializer())

            while not game_over:

                action, aprob = actor.choose_action(observation)

                inputs.append(observation)
                predicteds.append(aprob)

                y = np.zeros([self.env.action_space.n])
                y[action] = 1.
                outputs.append(y)

                observation_, reward, actual_reward, game_over, info = self.env._step(
                    action)
                reward_sum += float(actual_reward)

                rewards.append(float(reward))

                # After env.step
                td_error = critic.learn(
                    observation, reward_sum,
                    observation_)  # gradient = grad[r + gamma * V(s_) - V(s)]
                actor.learn(
                    observation, action,
                    td_error)  # true_gradient = grad[logPi(s,a) * td_error]

                # check memory for RNN model
                if len(inputs) > self.max_memory:
                    del inputs[0]
                    del outputs[0]
                    del predicteds[0]
                    del rewards[0]

                if verbose > 0:
                    action_name = env.actions[action]
                    probs = "\t".join(
                        "%s:%.2f" % (l, i)
                        for l, i in zip(env.actions, aprob.tolist()))
                    if action_name == "LONG" or action_name == "SHORT":
                        color = bcolors.FAIL if action_name == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + action_name + bcolors.ENDC,
                               reward_sum, info["cum"]) + probs)
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], action_name, reward_sum,
                                info["cum"]) + probs, f_episode))

                avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
                toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                    e, info["code"],
                    (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                    ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                    avg_reward_sum)
                print(toPrint)
                if self.history_filename is not None:
                    os.system("echo %s >> %s" %
                              (toPrint, self.history_filename))

                dim = len(inputs[0])
                inputs_ = [[] for i in range(dim)]
                for obs in inputs:
                    for i, block in enumerate(obs):
                        inputs_[i].append(block[0])
                inputs_ = [np.array(inputs_[i]) for i in range(dim)]

                outputs_ = np.vstack(outputs)
                predicteds_ = np.vstack(predicteds)
                rewards_ = np.vstack(rewards)

                print("shape: ", np.shape(rewards))

                print("fit model input.shape %s, output.shape %s" %
                      ([inputs_[i].shape
                        for i in range(len(inputs_))], outputs_.shape))

                np.set_printoptions(linewidth=200, suppress=True)
                print("currentTargetIndex:", env.currentTargetIndex)
Example #4
actor = Actor(s_dim=N_S, a_dim=N_A, learning_rate=0.01, sess=sess)
critic = Critic(s_dim=N_S, learning_rate=0.05, reward_decay=0.9, sess=sess)

sess.run(tf.global_variables_initializer())

for i_episode in range(3000):

    s = env.reset()
    t = 0
    track_r = []

    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)

        if done:
            r = -20
        track_r.append(r)

        td_error = critic.learn(s, r, s_)
        actor.learn(s, a, td_error)

        s = s_
        t += 1

        if done or t >= 1000:
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():