Example #1
    def __init__(
            self,
            n_actions,
            n_features,
            sess,
            agent_id,
            num_training,
            learning_rate=0.01,
            reward_decay=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            save_model_freq=100,
            max_epsilon=1,
            min_epsilon=0,
            load_model=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.sess = sess
        self.agent_id = agent_id
        self.num_training = num_training
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.save_model_freq = save_model_freq
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon = self.max_epsilon
        self.load_model = load_model

        # total learning step
        self.learn_step_counter = 0
        self.episode_rew_agent = 0
        self.episode_rew_all = 0
        self.episode = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = Memory(capacity=memory_size) #np.zeros((self.memory_size, n_features * 2 + 2))
        # consist of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.cost_his = []
        if self.load_model:
            saver = tf.train.Saver(max_to_keep=100000000)
            model_load_steps = 420000
            model_file_load = os.path.join("models/", "agent_No_" + str(self.agent_id) + "/",
                                           str(model_load_steps) + "_" + "model_segment_training/", "8m")
            saver.restore(self.sess, model_file_load)
            print("model trained for %s steps of agent %s have been loaded"%(model_load_steps, self.agent_id))
        else:
            self.sess, self.saver, self.summary_placeholders, self.update_ops, self.summary_op, self.summary_writer, self.summary_vars = self.init_sess()
Example #2
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99

    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
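    # linear epsilon decay: drop epsilon by (start - end) / max_epi after each episode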
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index,
                          reward, state_prime, done)
            state = state_prime

            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer,
                               total_steps)

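            # periodically sync the target network with the policy network (hard copy every 2000 steps)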
            if total_steps % 2000 == 0:
                update_target(policy_net, target_net)
Example #3
    def __init__(self, sess, env, FLAGS, rl_mode):

        self.FLAGS = FLAGS
        self.rl_mode = rl_mode
        self.p_dic = getattr(conf.dic.path_dic, self.FLAGS.env_name)
        self.s_dim = env.observation_space.shape[0]
        self.a_dim = env.action_space.shape[0]
        self.sess = sess
        self._build_graph()
        self.state_translate = env.observation_space.low
        self.state_scale = env.observation_space.high - env.observation_space.low + 1e-5
        self.action_translate = env.action_space.low
        self.action_scale = env.action_space.high - env.action_space.low + 1e-5

        if self.rl_mode:
            self.memory = Memory(self.FLAGS.replayBuffer_size,
                                 dims=2 * self.s_dim + self.a_dim + 1)
Example #4
def main():
    policy_net = DQN(num_channels=num_channels,
                     num_actions=19).to(device=device)
    target_net = DQN(num_channels=num_channels,
                     num_actions=19).to(device=device)
    target_net.load_state_dict(policy_net.state_dict())
    memory = Memory(50000)
    optimizer = optim.Adam(policy_net.parameters(),
                           lr=learning_rate,
                           weight_decay=1e-5)
    print("pre_train start")
    model_name = 'pre_trained_dqn'
    pre_train(env_name, memory, policy_net, target_net, optimizer)
    print("pre_train finished")
def train_cnn(env, policy, train_policy, args):
    """Collect rollouts with the given policy and use them to train the CNN encoder.

    Args:
       env: environment the transitions are collected from
       policy: policy used to select actions during the rollouts
       train_policy: trainer object whose train_cnn() updates the encoder
       args: experiment configuration (buffer size, seed, paths, logging, ...)
    """
    size = args.size
    obs_shape = (args.history_length, size, size)
    action_shape = (args.action_dim, )
    memory = Memory((84, 84, 3), int(args.buffer_size), args.device)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    total_timesteps = 0
    done_counter = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    t0 = time.time()
    pathname = str(args.locexp) + "/" + str(args.env_name)
    pathname += "_batch_size_" + str(args.batch_size) + "_lr_encoder_" + str(
        args.lr_encoder)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    for i_episode in range(int(args.episodes)):
        obs = env.reset()
        done = False
        episode_reward = 0
        for step in range(args.max_episode_steps):
            action = policy.select_action(np.array(obs))
            new_obs, reward, done, image = env.step(action)
            memory.add(image)
            episode_reward += reward
            # frame = cv2.imwrite("im{}.png".format(step), np.array(image))
            done_bool = 0 if step + 1 == args.max_episode_steps else float(
                done)
            total_timesteps += 1
            obs = new_obs
            if step == 49:
                done = True
            if done:
                memory.create_states(replay_buffer)
                if total_timesteps != 0:
                    if step < 50:
                        done_counter.append(1)
                    else:
                        done_counter.append(0)
                goals = sum(done_counter)
                scores_window.append(episode_reward)
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, i_episode)
                text += "Episode steps {} ".format(step)
                text += "Goal last 100 ep : {} ".format(goals)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))
                print(text)
                break
            if total_timesteps > args.start_opt:
                train_policy.train_cnn(replay_buffer, policy, writer)
Example #6
def main():
    ''' Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)
    ''' Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()
    ''' Create placeholders 
    '''
    # Placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32, \
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32, \
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')

    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')
    ''' A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)
    ''' Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM,
                             ACTION_DIM,
                             HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR,
                             HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)
    ''' Create the target actor network inside target_actor scope and calculate 
    the target actions. Apply stop_gradient to the target actions so that 
    their gradient is not computed at any point in time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM,
                                    ACTION_DIM,
                                    HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR,
                                    HIDDEN_3_ACTOR,
                                    trainable=True)

        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)
    ''' Create the critic network inside the critic variable scope. Get the 
    Q-values of given actions and Q-values of actions suggested by the actor 
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM,
                               ACTION_DIM,
                               HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC,
                               HIDDEN_3_CRITIC,
                               trainable=True)

        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)
    ''' Create the target critic network inside the target_critic variable 
    scope. Calculate the target Q-values and apply stop_gradient to it.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM,
                                      ACTION_DIM,
                                      HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC,
                                      HIDDEN_3_CRITIC,
                                      trainable=True)

        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)
    ''' Collect
    - the trainable variables of the actor network (its weights),
    - the weights of the target actor network,
    - the trainable variables of the critic network (its weights),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')

    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')

    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')

    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')
    ''' Get the operators for updating the target networks. The 
    update_target_networks function defined in utils returns a list of operators 
    to be run from a tf session in order to update the target networks using 
    soft update.
    '''
    update_targets_op = update_target_networks(TAU, \
        target_actor_vars, actor_vars, target_critic_vars, \
            critic_vars)
    ''' Create the tf operation to train the critic network:
    - calculate TD-target 
    - calculate TD-Error = TD-target - q_values_of_given_actions
    - calculate Critic network's loss (Mean Squared Error of TD-Errors)
    - add L2 weight regularization to the critic loss
    - create a tf operation to train the critic network
    '''
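    # TD target: targets = r + is_not_terminal * GAMMA * Q_target(s', mu_target(s'))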
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
            target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)
    ''' Create a tf operation to train the actor networks
    - Calculate the Actor network's loss
    - Create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(actor_loss,
                                                           var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):

            action = sess.run(actions, feed_dict={ \
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add Ornstein-Uhlenbeck-style exploration noise (mean-reverting toward EXPLORATION_MU)
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)

            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and replay_memory.size() >= \
                MINI_BATCH_SIZE :
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray( \
                            [elem[0] for elem in batch]),
                        action_placeholder: np.asarray( \
                            [elem[1] for elem in batch]),
                        reward_placeholder: np.asarray( \
                            [elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray( \
                            [elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray( \
                            [elem[4] for elem in batch]),
                        is_training_placeholder: True
                })

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
Example #7
    env = gym.make(ENV_NAME)
    env.seed(1)
    env = env.unwrapped

    # Get state and action dimension
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Initialize actor, critic and target networks
    actor = ActorNetwork(action_dim=action_dim)
    critic = CriticNetwork()
    target_mu = TargetNetMu(actor)
    target_q = TargetNetQ(critic)

    # Initialize buffer
    memory = Memory(capacity=buffer_size, dims=2 * state_dim + action_dim + 1)

    # Total loss for critic
    total_critic_loss = 0
    total_transition_trained_on = 0

    # Outer iteration
    for m in range(M):

        # Receive initial observation
        s = env.reset()
        explore_variance = 2  # initial exploration variance

        s = nd.array(s).reshape((1, -1))
        # print(s)
Example #8
class DDPG:
    def __init__(self, sess, env, FLAGS, rl_mode):

        self.FLAGS = FLAGS
        self.rl_mode = rl_mode
        self.p_dic = getattr(conf.dic.path_dic, self.FLAGS.env_name)
        self.s_dim = env.observation_space.shape[0]
        self.a_dim = env.action_space.shape[0]
        self.sess = sess
        self._build_graph()
        self.state_translate = env.observation_space.low
        self.state_scale = env.observation_space.high - env.observation_space.low + 1e-5
        self.action_translate = env.action_space.low
        self.action_scale = env.action_space.high - env.action_space.low + 1e-5

        if self.rl_mode:
            self.memory = Memory(self.FLAGS.replayBuffer_size,
                                 dims=2 * self.s_dim + self.a_dim + 1)

    def _build_graph(self):
        self._placeholders()
        self._actor_critic()
        self._loss_train_op()
        self.score = tf.Variable(0.,
                                 trainable=False,
                                 dtype=tf.float32,
                                 name='score')
        self.score_summary = tf.summary.scalar('score', self.score)
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(self.p_dic.get('agent_log_dir'))
        self.writer.add_graph(self.sess.graph)
        self.saver = tf.train.Saver(max_to_keep=50,
                                    keep_checkpoint_every_n_hours=1)

    def _placeholders(self):
        with tf.name_scope('inputs'):
            self.current_state = tf.placeholder(tf.float32,
                                                shape=[None, self.s_dim],
                                                name='s')
            self.reward = tf.placeholder(tf.float32, [None, 1], name='r')
            self.next_state = tf.placeholder(tf.float32,
                                             shape=[None, self.s_dim],
                                             name='s_')
            self.is_training = tf.placeholder(tf.bool, name='is_training')

    def _actor_critic(self):
        self.actor = build_actor(self.current_state, self.a_dim,
                                 self.is_training)
        self.actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            'Actor')
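        # soft target update via EMA: with decay = 1 - tau, theta_target <- tau * theta + (1 - tau) * theta_target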
        actor_ema = tf.train.ExponentialMovingAverage(decay=1 - self.FLAGS.tau)
        self.update_targetActor = actor_ema.apply(self.actor_vars)
        self.targetActor = build_actor(self.next_state,
                                       self.a_dim,
                                       False,
                                       reuse=True,
                                       getter=get_getter(actor_ema))

        self.critic = build_critic(self.current_state, self.actor,
                                   self.is_training)
        self.critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             'Critic')
        critic_ema = tf.train.ExponentialMovingAverage(decay=1 -
                                                       self.FLAGS.tau)
        self.update_targetCritic = critic_ema.apply(self.critic_vars)
        self.targetCritic = build_critic(self.next_state,
                                         self.targetActor,
                                         False,
                                         reuse=True,
                                         getter=get_getter(critic_ema))

    def _loss_train_op(self):
        max_grad = 2
        with tf.variable_scope('target_q'):
            self.target_q = self.reward + self.FLAGS.gamma * self.targetCritic
        with tf.variable_scope('TD_error'):
            self.critic_loss = tf.squared_difference(self.target_q,
                                                     self.critic)
        with tf.variable_scope('critic_grads'):
            self.critic_grads = tf.gradients(ys=self.critic_loss,
                                             xs=self.critic_vars)
            for ix, grad in enumerate(self.critic_grads):
                self.critic_grads[ix] = grad / self.FLAGS.batch_size
        with tf.variable_scope('C_train'):
            critic_optimizer = tf.train.AdamOptimizer(self.FLAGS.critic_lr,
                                                      epsilon=1e-5)
            self.train_critic = critic_optimizer.apply_gradients(
                zip(self.critic_grads, self.critic_vars))
        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.critic, self.actor)[0]
        with tf.variable_scope('actor_grads'):
            self.actor_grads = tf.gradients(ys=self.actor,
                                            xs=self.actor_vars,
                                            grad_ys=self.a_grads)
            for ix, grad in enumerate(self.actor_grads):
                self.actor_grads[ix] = tf.clip_by_norm(
                    grad / self.FLAGS.batch_size, max_grad)
        with tf.variable_scope('A_train'):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
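                # a negative learning rate turns Adam's descent into ascent on Q(s, a), i.e. the deterministic policy gradient update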
                actor_optimizer = tf.train.AdamOptimizer(-self.FLAGS.actor_lr,
                                                         epsilon=1e-5)
                self.train_actor = actor_optimizer.apply_gradients(
                    zip(self.actor_grads, self.actor_vars))

    def choose_action(self, state):

        return self.sess.run(self.actor,
                             feed_dict={
                                 self.current_state: state,
                                 self.is_training: False
                             })

    def train(self, episode=None, ep_reward=None):
        batch_memory = self.memory.sample(self.FLAGS.batch_size)
        batch_s = batch_memory[:, :self.s_dim]
        batch_a = batch_memory[:, self.s_dim:self.s_dim + self.a_dim]
        batch_r = batch_memory[:, -self.s_dim - 1:-self.s_dim]
        batch_s_ = batch_memory[:, -self.s_dim:]

        if episode is None:
            critic_feed_dict = {
                self.current_state: batch_s,
                self.actor: batch_a,
                self.reward: batch_r,
                self.next_state: batch_s_,
                self.is_training: True
            }
            self.sess.run([self.train_critic, self.update_targetCritic],
                          feed_dict=critic_feed_dict)
            actor_feed_dict = {
                self.current_state: batch_s,
                self.next_state: batch_s_,
                self.is_training: True
            }
            self.sess.run([self.train_actor, self.update_targetActor],
                          feed_dict=actor_feed_dict)
        else:
            update_score = self.score.assign(
                tf.convert_to_tensor(ep_reward, dtype=tf.float32))
            with tf.control_dependencies([update_score]):
                merged_score = tf.summary.merge([self.score_summary])
            self.critic_summary = tf.summary.merge_all(scope='Critic')
            self.actor_summary = tf.summary.merge_all(scope='Actor')
            critic_feed_dict = {
                self.current_state: batch_s,
                self.actor: batch_a,
                self.reward: batch_r,
                self.next_state: batch_s_,
                self.is_training: True
            }
            _, _, critic = self.sess.run([
                self.train_critic, self.update_targetCritic,
                self.critic_summary
            ],
                                         feed_dict=critic_feed_dict)
            self.writer.add_summary(critic, episode)
            actor_feed_dict = {
                self.current_state: batch_s,
                self.next_state: batch_s_,
                self.is_training: True
            }
            merged = tf.summary.merge([merged_score, self.actor_summary])
            _, _, actor = self.sess.run(
                [self.train_actor, self.update_targetActor, merged],
                feed_dict=actor_feed_dict)
            self.writer.add_summary(actor, episode)

            self.saver.save(
                self.sess,
                self.p_dic.get('agent_log_dir') + '/' +
                datetime.datetime.now().strftime('%y%m%d-%H:%M:%S') + '_EP' +
                str(episode) + '.ckpt')

    def perceive(self,
                 state,
                 action,
                 reward,
                 next_state,
                 episode=None,
                 ep_reward=None):
        self.memory.store_transition(state, action, reward, next_state)
        if self.memory.pointer > self.FLAGS.replayBuffer_size:
            self.train(episode, ep_reward)

    def load(self):
        self.saver.restore(
            self.sess,
            tf.train.latest_checkpoint(self.p_dic.get('agent_log_dir')))

    def act(self, obs):
        actor_feed_dict = {self.current_state: obs, self.is_training: False}
        action = self.sess.run(self.actor, feed_dict=actor_feed_dict)
        act_low = np.array(
            [16., 16., 16., 16., 7.36, 16., 16., 16., 16., 6.57],
            dtype=np.float32)
        act_high = np.array(
            [30., 30., 30., 30., 7.36, 30., 30., 30., 30., 6.57],
            dtype=np.float32)
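        # act_low / act_high are the environment's per-dimension action bounds; note the raw actor output is returned here without clipping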
        return action
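
Example #8's train() slices each sampled row by column, which assumes that Memory lays every transition out as one flat row [s, a, r, s_] of length 2 * s_dim + a_dim + 1 (the same layout is implied in Examples #3 and #7). The Memory class itself is not shown on this page; the following is only a minimal sketch of such a ring buffer, written for illustration and not taken from the examples' source:

import numpy as np

class Memory:
    def __init__(self, capacity, dims):
        # pre-allocate one flat row per transition: [s, a, r, s_]
        self.capacity = capacity
        self.data = np.zeros((capacity, dims), dtype=np.float32)
        self.pointer = 0  # total number of transitions stored so far

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.capacity  # overwrite the oldest entry once full
        self.data[index, :] = transition
        self.pointer += 1

    def sample(self, batch_size):
        # uniform sampling from the filled part of the buffer
        upper = min(self.pointer, self.capacity)
        indices = np.random.randint(0, upper, size=batch_size)
        return self.data[indices, :]

With this layout, batch_memory[:, :s_dim] is the state, the next a_dim columns are the action, column -s_dim - 1 is the reward, and the last s_dim columns are the next state, exactly as sliced in DDPG.train() above.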
Example #9
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            sess,
            agent_id,
            num_training,
            learning_rate=0.01,
            reward_decay=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            save_model_freq=100,
            max_epsilon=1,
            min_epsilon=0,
            load_model=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.sess = sess
        self.agent_id = agent_id
        self.num_training = num_training
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.save_model_freq = save_model_freq
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon = self.max_epsilon
        self.load_model = load_model

        # total learning step
        self.learn_step_counter = 0
        self.episode_rew_agent = 0
        self.episode_rew_all = 0
        self.episode = 0

        # initialize zero memory [s, a, r, s_]
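        # Memory here is a prioritized replay buffer: sample() returns tree indices, a batch and importance-sampling weights, and batch_update() refreshes priorities from the absolute TD errors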
        self.memory = Memory(capacity=memory_size) #np.zeros((self.memory_size, n_features * 2 + 2))
        # consist of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
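        # op list that hard-copies every eval-net parameter into the matching target-net parameter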
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.cost_his = []
        if self.load_model:
            saver = tf.train.Saver(max_to_keep=100000000)
            model_load_steps = 420000
            model_file_load = os.path.join("models/", "agent_No_" + str(self.agent_id) + "/",
                                           str(model_load_steps) + "_" + "model_segment_training/", "8m")
            saver.restore(self.sess, model_file_load)
            print("model trained for %s steps of agent %s have been loaded"%(model_load_steps, self.agent_id))
        else:
            self.sess, self.saver, self.summary_placeholders, self.update_ops, self.summary_op, self.summary_writer, self.summary_vars = self.init_sess()

        # finish the initialization work for the network computation
    def init_sess(self):
        # Summary for tensorboard
        summary_placeholders, update_ops, summary_op, summary_vars = self.setup_summary()
        fileWritePath = os.path.join("logs/", "agent_No_" + str(self.agent_id) + "/")
        summary_writer = tf.summary.FileWriter(fileWritePath, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # Load the file if the saved file exists

        saver = tf.train.Saver(max_to_keep=100000000)

        return self.sess, saver, summary_placeholders, update_ops, summary_op, summary_writer, summary_vars

    def _build_net(self):
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('eval_net'):
            # c_names (collection names) are the collections used to store variables; the two hidden layers are 256 x 256
            c_names, n_l1, n_l2, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 256, 256, \
                tf.contrib.layers.xavier_initializer(), tf.contrib.layers.xavier_initializer()
                # tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)

            # third layer. collections is used later when assign to target net
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, self.n_actions], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l2, w3) + b3

        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1)
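            # weight each sample's squared TD error by its importance-sampling weight to correct the prioritized-replay bias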
            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
                       #tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))

        with tf.variable_scope('train'):
            # self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
            self._train_op = tf.train.AdamOptimizer(self.lr, epsilon=1e-02).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')    # input
        with tf.variable_scope('target_net'):
            # c_names(collections_names) are the collections to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)

            # third layer. collections is used later when assign to target net
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, self.n_actions], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l2, w3) + b3

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        self.memory.store(transition)
        # if not hasattr(self, 'memory_counter'):
        #     self.memory_counter = 0
        #
        # transition = np.hstack((s, [a, r], s_))  # flatten horizontally, so the transition becomes a single row
        #
        # # replace the old memory with new memory
        # index = self.memory_counter % self.memory_size
        # self.memory[index, :] = transition
        #
        # self.memory_counter += 1

    def choose_action(self, observation):
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]
        if not self.load_model:
            if np.random.uniform() < self.epsilon:
                # forward feed the observation and get q value for every actions
                action = np.random.randint(0, self.n_actions)
            else:
                actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
                action = np.argmax(actions_value)
        else:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        return action

    def learn(self):
        # check to replace target parameters
        # if(self.memory_counter < self.batch_size):
        #     return
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            # print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        # if self.memory_counter > self.memory_size:
        #     sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        # else:
        #     sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        # batch_memory = self.memory[sample_index, :]
        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],  # newest params
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]     # for every row in the batch, column n_features + 1 is exactly the reward

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        """
        For example in this batch I have 2 samples and 3 actions:
        q_eval =
        [[1, 2, 3],
         [4, 5, 6]]

        q_target = q_eval =
        [[1, 2, 3],
         [4, 5, 6]]

        Then overwrite q_target with the real target value for the action that was actually taken (w.r.t. q_eval's action).
        For example:
            in sample 0 we took action 0, and its target value is -1;
            in sample 1 we took action 2, and its target value is -2:
        q_target =
        [[-1, 2, 3],
         [4, 5, -2]]

        So the (q_target - q_eval) becomes:
        [[(-1)-(1), 0, 0],
         [0, 0, (-2)-(6)]]

        We then backpropagate this error for the corresponding action back through the network,
        leaving the other actions with error = 0 because we did not choose them.
        """
        _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
                                                 feed_dict={self.s: batch_memory[:, :self.n_features],
                                                            self.q_target: q_target,
                                                            self.ISWeights: ISWeights})
        self.memory.batch_update(tree_idx, abs_errors)
        # train eval network
        # _, self.cost = self.sess.run([self._train_op, self.loss],
        #                              feed_dict={self.s: batch_memory[:, :self.n_features],
        #                                         self.q_target: q_target})

        self.cost_his.append(self.cost)

        self.learn_step_counter += 1

        self.plotting()

        # Decreasing epsilon
        if self.epsilon > self.min_epsilon:
            self.epsilon -= self.max_epsilon/self.num_training
        else:
            self.epsilon = self.min_epsilon


        if (self.learn_step_counter % self.save_model_freq == 0):
            model_file_save = os.path.join("models/", "agent_No_"+str(self.agent_id)+"/", str(self.learn_step_counter) + "_" + "model_segment_training/", "8m")
            dirname = os.path.dirname(model_file_save)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            self.saver.save(self.sess, model_file_save)
            print("Model trained for %s times is saved"%self.learn_step_counter)

            # save data of replay buffer
            obj = self.memory
            filename = 'buffer_agent'+str(self.agent_id)+'.txt'
            with open(filename, 'wb') as file:
                pickle.dump(obj, file)

    def setup_summary(self):
        cost = tf.Variable(0.)
        eps_rew_agent = tf.Variable(0.)
        eps_rew_all = tf.Variable(0.)

        tf.summary.scalar("cost", cost)
        tf.summary.scalar("eps_rew_agent", eps_rew_agent)
        tf.summary.scalar("eps_rew_all", eps_rew_all)
        summary_vars = [cost, eps_rew_agent, eps_rew_all]

        summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]

        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()

        return summary_placeholders, update_ops, summary_op, summary_vars

    def plotting(self):
        tensorboard_info = [self.cost, self.episode_rew_agent, self.episode_rew_all]
        vars_plot = []
        for i in range(len(tensorboard_info)):
            vars_plot.append(self.sess.run(self.update_ops[i], feed_dict={self.summary_placeholders[i]: float(tensorboard_info[i])}))

        summary_1 = tf.Summary(value=[tf.Summary.Value(tag="cost", simple_value=vars_plot[0])])
        summary_2 = tf.Summary(value=[tf.Summary.Value(tag="eps_rew_agent", simple_value=vars_plot[1])])
        summary_3 = tf.Summary(value=[tf.Summary.Value(tag="eps_rew_all", simple_value=vars_plot[2])])

        self.summary_writer.add_summary(summary_1, self.learn_step_counter)
        self.summary_writer.add_summary(summary_2, self.episode)
        self.summary_writer.add_summary(summary_3, self.episode)

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

    def get_episode_reward(self, eps_r_agent, eps_r_all, episode):
        self.episode_rew_agent = eps_r_agent
        self.episode_rew_all = eps_r_all
        self.episode = episode
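
Example #9 (and Example #1, which shows the same constructor) relies on a Memory class with a prioritized-experience-replay interface: store(transition), sample(batch_size) returning (tree indices, a batch of transitions, importance-sampling weights), and batch_update(indices, abs_errors). That class is not included on this page and most likely wraps a SumTree; the sketch below is only an illustration of the same interface using plain arrays:

import numpy as np

class PrioritizedMemory:
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity = capacity
        self.alpha = alpha    # how strongly priorities skew the sampling
        self.beta = beta      # strength of the importance-sampling correction
        self.eps = eps        # keeps every priority strictly positive
        self.data = []        # stored transitions (flat rows)
        self.priorities = []  # one priority per stored transition
        self.pos = 0

    def store(self, transition):
        # new transitions get the current maximum priority so they are replayed at least once
        max_p = max(self.priorities) if self.priorities else 1.0
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = transition
            self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        batch = np.vstack([self.data[i] for i in idx])
        # importance-sampling weights, normalized so the largest weight is 1
        weights = (len(self.data) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        return idx, batch, weights.reshape(-1, 1)

    def batch_update(self, idx, abs_errors):
        # new priority = |TD error| + eps
        for i, err in zip(idx, abs_errors):
            self.priorities[i] = float(err) + self.eps

learn() in Example #9 feeds the returned weights into the ISWeights placeholder and writes the new absolute TD errors back with batch_update.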
Example #10
def train():
    gamma = 0.99
    episodes = 100
    batch_size = 128
    max_time_steps = 200
    episode_reward = 0
    reward_history = []
    env = gym.make('Pendulum-v0')
    obs_old = env.reset()
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    agent_policy = Policy(env, "policy")
    agent_critic = Value(env, "value")
    # agent_policy_t = Policy(env,"policy_t")
    # agent_critic_t = Value(env,"value_t")

    # initial rollouts to gather data
    for i in range(10000):
        action = env.action_space.sample()
        obs, rew, done, _ = env.step(action)
        episode_reward += rew

        memory.append(obs_old, action, rew, obs, done)
        obs_old = obs
        if done:
            # reward_history.append(episode_reward)
            episode_reward = 0
            env.reset()
    episode_reward = 0
    time_t = 200
    tf.summary.scalar("episode_time_steps", time_t)
    tf.summary.scalar("episode_reward", episode_reward)
    merged = tf.summary.merge_all()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    mark = np.zeros([8, 1])
    time_split = np.zeros_like(mark)
    with tf.Session() as sess:
        sess1 = tf_debug.TensorBoardDebugWrapperSession(sess, "Vader:6007")
        # tf_debug.LocalCLIDebugWrapperSession(sess)
        from datetime import datetime
        now = datetime.now()
        train_writer = tf.summary.FileWriter(
            './train/' + now.strftime("%Y%m%d-%H%M%S") + '/', sess.graph)
        # train_writer = tf.summary.FileWriter('.' + '/train', sess.graph)

        agent_critic.create_target(0.1)
        agent_policy.create_target(0.1)
        # agent_policy_t.create_target_capacity(agent_policy.get_trainable_parameters(),0.6)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        sess1.run(agent_policy.get_trainable_parameters())
        # agent_critic.set_trainable_parameters(agent_critic.get_trainable_parameters(), 0)
        # agent_policy.set_trainable_parameters(agent_policy.get_trainable_parameters(), 0)
        for i in range(episodes):
            print('running episode:', i)
            t = 0
            done = 0

            while t < max_time_steps:
                # print(t)
                start = Time.time()
                action = agent_policy.predict(
                    obs_old)  # + agent_policy.noise(0,1/episode_reward)
                mark[0] = Time.time() - start
                # print('mark1:'+str(mark1))
                action = action.reshape(-1)
                # if action>0.5:
                #     action = 1
                # else:
                #     action = -0
                # action = env.action_space.sample()
                obs, rew, done, info = env.step(action)
                # env.render()
                episode_reward += rew
                memory.append(obs_old, action, rew, obs, done)

                # print('mark2:' + str(mark2))
                if done or t == max_time_steps - 1:
                    time_t = t
                    reward_history.append(episode_reward)
                    episode_reward = 0
                    env.reset()
                obs_old = obs
                t += 1

            for steps in range(50):
                batch = memory.sample(batch_size)
                obs_batch = batch['obs0']
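                # normalize the sampled observations: subtract the per-dimension mean, then divide by the per-dimension variance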
                obs_batch -= np.mean(obs_batch, 0)
                obs_batch = obs_batch / np.var(obs_batch, 0)
                agent_policy.act_as_target = True
                action_batch_predict = agent_policy.predict(obs_batch)[0]
                agent_policy.act_as_target = False
                agent_critic.act_as_target = True
                value_batch = agent_critic.predict(obs_batch,
                                                   action_batch_predict)[0]
                # print(value_batch[0])
                agent_critic.act_as_target = False
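                # TD-style target from the target networks: y = r + gamma * Q_target(obs, mu_target(obs)), computed here on the sampled obs0 batch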
                y = np.array(batch['rewards']) + gamma * np.array(
                    value_batch)  #.reshape(-1,batch_size)
                agent_critic.update_value(obs=obs_batch,
                                          action=action_batch_predict,
                                          target=y)
                q_grad = np.array(
                    agent_critic.get_q_gradient(action_batch_predict,
                                                obs_batch)).reshape(
                                                    -1,
                                                    env.action_space.shape[0])
                agent_policy.optimize_policy(q_grad, obs_batch)
                parm = agent_critic.get_all_parameters()
                value = np.array(sess.run(agent_critic.get_all_parameters()))
                agent_critic.update_target()
                agent_policy.update_target()
                value = np.array(sess.run(agent_critic.get_all_parameters()))

            # print(time_split)
            print(reward_history[-1])
            summary = sess.run(merged)
            train_writer.add_summary(summary, i)
            time = 200
            episode_reward = 0