Example #1
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                          hyperparams.batch_size, random_seed)

        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )
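
Every example on this page constructs an OUNoise object whose class is not shown. Below is a minimal illustrative sketch of an Ornstein-Uhlenbeck noise process compatible with the (size, seed, mu, theta, sigma) style constructor used in Example #1; the defaults and method names are assumptions for illustration, not code from any of the quoted projects.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (illustrative sketch; defaults are assumed)."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state to the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the state drifts back toward mu.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state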
Example #2
    def __init__(self, state_size, action_size, seed):
        self.gradient_clipping = True
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.config = Config()
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.config.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.config.LR_CRITIC,
            weight_decay=self.config.WEIGHT_DECAY)

        self.noise = OUNoise(action_size, seed)

        self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE,
                                   self.config.BATCH_SIZE, seed, device)
        self.step_count = 0
Example #3
    def __init__(self, state_size, action_size, agent_id):
        """Initialize a DDPGAgent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size)
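
Example #3 calls self.hard_update to make the target networks start from the same weights as the local networks, but the method itself is not shown. A plausible PyTorch implementation (an assumption, not the project's code) is:

    def hard_update(self, local_model, target_model):
        # Copy every local parameter into the corresponding target parameter.
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)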
Example #4
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)
Example #5
    def __init__(self, env, state_dim=None):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # along with their target networks
        if state_dim:
            self.state_dim = state_dim
            print(self.state_dim)
        else:
            self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        # Flag to signal save
        self.not_saved = True

        #For normalisation
        self.state_mean = 0
        self.state_std = 1
        self.target_mean = 0
        self.target_std = 1
Example #6
File: ddpg.py Project: inikiforovski/DDPG
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        # self.action_dim = env.action_space.shape[0]
        self.state_dim = env.state_size
        self.action_dim = env.action_size
        self.action_bound = (env.action_high - env.action_low) / 2

        print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
              'action_bound: ', self.action_bound)

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.action_bound)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.action_bound)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #7
    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):

        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)
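
Example #7 calls self.build_summaries() to create TensorBoard summary ops, but the helper is not included in the snippet. A common TF1-style sketch (an assumption, not the project's actual code) looks like:

    def build_summaries(self):
        # Track episode reward and max Q-value as scalar summaries for TensorBoard.
        episode_reward = tf.Variable(0.0)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.0)
        tf.summary.scalar("max_q_value", episode_max_q)
        summary_vars = [episode_reward, episode_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars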
Example #8
    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()
Example #9
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
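
Example #9 keeps tau "for soft update of target parameters", but the update itself is not shown. With the Keras-style models used here, a soft update is typically a per-layer weight blend like the sketch below (an assumption about the project's missing method):

    def soft_update(self, local_model, target_model):
        # target_weights <- tau * local_weights + (1 - tau) * target_weights
        local_weights = local_model.model.get_weights()
        target_weights = target_model.model.get_weights()
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.model.set_weights(new_weights)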
Example #10
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')
Example #11
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
Example #12
    def __init__(self, shape_in, num_output, accele_range, angle_range):
        self.input_shape = shape_in
        self.out_shape = num_output
        self.learning_rate_a = LEARNING_RATE_ACTOR
        self.learning_rate_c = LEARNING_RATE_CRITIC
        self.memory = deque(maxlen=MAX_MEMORY_LEN)
        self.train_start = 200
        self.batch_size = 64
        self.gamma = 0.9
        self.sigma_fixed = 2
        self.channel = CHANNEL
        self.critic_input_action_shape = 1
        self.angle_range = angle_range
        self.accele_range = accele_range
        self.actor_model = self.actor_net_builder()
        self.critic_model = self.critic_net_build()
        self.actor_target_model = self.actor_net_builder()
        self.critic_target_model = self.critic_net_build()
        self.OUnoise = OUNoise(2)
        # self.actor_target_model.trainable = False
        # self.critic_target_model.trainable = False

        self.actor_history = []
        self.critic_history = []
        self.reward_history = []
        self.weight_hard_update()
Example #13
    def __init__(self, env):
        self.action_dim = env.action_space.shape[0]
        self.state_dim = env.observation_space.shape[0]
        self.h1_dim = 400
        self.h2_dim = 300

        self.actor_learning_rate = 1e-4
        self.critic_learning_rate = 1e-3

        self.gamma = 0.99

        # Ornstein-Uhlenbeck noise parameters
        self.noise_theta = 0.15
        self.noise_sigma = 0.20
        self.ou = OUNoise(self.action_dim,
                          theta=self.noise_theta,
                          sigma=self.noise_sigma)

        self.replay_buffer_size = 1000000
        self.replay_buffer = deque(maxlen=self.replay_buffer_size)
        self.replay_start_size = 1000

        self.batch_size = 64

        self.target_update_rate = 0.001
        self.total_parameters = 0
        self.global_steps = 0
        self.reg_param = 0.01
Example #14
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # will add noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)
Example #15
File: ddpg.py Project: ivychill/ltr
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")
Example #16
    def reset(self):
        # Called at the start of every episode.
        # Initialize the variables we use.

        # Uncomment the block below to move the robot back to the origin at every episode.
        self.robot_tf = Transformation()
        self.joint1_tf = Transformation()
        self.link1_tf = Transformation(translation=(self.link1_len, 0))
        self.joint2_tf = Transformation()
        self.link2_tf = Transformation(translation=(self.link2_len, 0))
        self.link1_tf_global = self.robot_tf * self.joint1_tf * self.link1_tf
        self.link2_tf_global = self.link1_tf_global * self.joint2_tf * self.link2_tf

        self.step_count = 0.00

        # Generate the target point
        self.target_tf = Transformation(translation=(
            random.randrange(-self.env_boundary, self.env_boundary),
            random.randrange(-self.env_boundary, self.env_boundary)))
        self.ou = OUNoise(dt=self.dt, theta=0.1, sigma=0.2)

        self.done = False
        self.t = 0
        self.buffer = []  # Buffer for visualization; cleared whenever the episode is reset.

        # Unlike the step function, reset returns only the initial state.
        return self._get_state()
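
Example #16 constructs OUNoise with an explicit time step (dt=self.dt), which suggests an Euler-Maruyama discretization of the OU stochastic differential equation. A sketch of such a sample step (assumed; the project's OUNoise is not shown, and mu is assumed to be a NumPy array):

    def sample(self):
        # Euler-Maruyama step of dX = theta * (mu - X) * dt + sigma * sqrt(dt) * dW
        dx = (self.theta * (self.mu - self.state) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(self.mu.shape))
        self.state = self.state + dx
        return self.state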
Example #17
File: main.py Project: mmarklar/ddpg-aigym
def main():
    experiment = 'model-builder-v0'  #specify environments here
    env = gym.make(experiment)
    #steps= env.spec.timestep_limit #steps per episode
    steps = 20
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environmet (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #18
 def learn(self, total_timesteps, callback):
     ou_scale = 1.0  # initial scaling factor
     ou_decay = 0.9995  # decay of the scaling factor ou_scale
     ou_mu = 0.0  # asymptotic mean of the noise
     ou_theta = 0.15  # magnitude of the drift term
     ou_sigma = 0.20  # magnitude of the diffusion term
     # this slowly decreases to 0
     # create the noise process
     noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)
     # create the replay buffer
     buffer = ReplayBuffer(seed=self.seed, action_size=self.action_size, buffer_size=self.buffer_size,
                           batch_size=self.batch_size, device=self.device)
     self.t_step = 0
     episode = 0
     while self.t_step < total_timesteps:
         callback.on_start_episode(episode)
         episode_scores = np.zeros(self.env.num_agents)
         states, _, _ = self.env.reset()
         scores = np.zeros(2)
         while True:
             states = np.reshape(states, (1, 48))  # reshape so we can feed both agents states to each agent
             # split into the states into the parts observed by each agent
             states_0 = states[0, :24].reshape((1, 24))
             states_1 = states[0, 24:].reshape((1, 24))
             # generate noise
             noise = ou_scale * noise_process.get_noise().reshape((1, 4))
             # split the noise into the parts for each agent
             noise_0 = noise[0, :2].reshape((1, 2))
             noise_1 = noise[0, 2:].reshape((1, 2))
             # determine actions for the unity agents from the current state, using noise for exploration
             actions_0 = self.player_policy.act(states_0, use_target=False, add_noise=True, noise_value=noise_0)\
                 .detach().cpu().numpy()
             actions_1 = self.opponent_policy.act(states_1, use_target=False, add_noise=True, noise_value=noise_1)\
                 .detach().cpu().numpy()
             actions = np.vstack((actions_0.flatten(), actions_1.flatten()))
             # take the action in the environment
             next_states, rewards, dones, info = self.env.step(actions)
             # store (S, A, R, S') info in the replay buffer (memory)
             buffer.add(states.flatten(), actions.flatten(), rewards, next_states.flatten(), dones)
             episode_scores += rewards
             states = next_states
             self.t_step += 1
             """
             Policy learning
             """
             ## train the agents if we have enough replays in the buffer
             if len(buffer) >= self.batch_size:
                 self.player_policy.learn(buffer.sample(), self.opponent_policy)
                 self.opponent_policy.learn(buffer.sample(), self.player_policy)
             if np.any(dones):
                 break
         if not callback.on_step(np.max(episode_scores), self.t_step):
             break
         # decrease the scaling factor of the noise
         ou_scale *= ou_decay
         episode += 1
Example #19
    def __init__(self, model, env, sess, num_episodes, direction):
        self.model = model
        self.sess = sess
        self.direction = direction
        self.env = env
        self.num_episodes = num_episodes
        self.episode_start = 0
        self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
        self.noise_decay = 0.2
        self.epsilon = EPSILON
        self.epsilon_decay = nth_root(self.num_episodes, 0.001 / self.epsilon)
        self.count_exp_replay = 0
        self.tau = TAU
        self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
        self.actions_grads_ph = tf.placeholder(
            tf.float32, shape=((None, ) + self.env.action_space.shape))

        #train operation
        self.actor_train_ops = self.model.Actor.train_step(
            self.actions_grads_ph)
        self.critic_train_ops = self.model.Critic.train_step(self.target_Q_ph)

        #update operation
        self.update_critic_target = self.model.update_target_network(
            self.model.Critic.network_params,
            self.model.Critic_target.network_params, self.tau)
        self.update_actor_target = self.model.update_target_network(
            self.model.Actor.network_params,
            self.model.Actor_target.network_params, self.tau)
        sess.run(tf.initialize_all_variables())
        #for testing only
        self.sess.run(
            self.model.update_target_network(
                self.model.Critic.network_params,
                self.model.Critic_target.network_params))
        self.sess.run(
            self.model.update_target_network(
                self.model.Actor.network_params,
                self.model.Actor_target.network_params))

        # reward summary for tensorboard
        self.tf_reward = tf.Variable(0.0,
                                     trainable=False,
                                     name='reward_summary')
        self.tf_reward_summary = tf.summary.scalar("Reward by episode",
                                                   self.tf_reward)

        # time
        self.tf_time = tf.Variable(0.0,
                                   trainable=False,
                                   name='Time_per_episode')
        self.tf_time_summary = tf.summary.scalar("Time per episode",
                                                 self.tf_time)

        # writer
        self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
Example #20
def main():
    experiment= 'InvertedPendulum-v1' #specify environments here
    env= gym.make(experiment)
    steps= env.spec.timestep_limit #steps per episode    
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]    
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])
      
    
    for i in xrange(episodes):
        print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environmet (optional)            
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            print "Action at step", t ," :",action,"\n"
            
            observation,reward,done,info=env.step(action)
            
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n\n'
                break
    total_reward+=reward_per_episode            
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #21
    def __init__(self, env, state_size, action_size):
        self.env = env
        self.replay_memory = deque()
        self.actor_network = actor_network.ActorNetwork(
            state_size, action_size)
        self.critic_network = critic_network.CriticNetwork(
            state_size, action_size)

        self.ou_noise = OUNoise(action_size)

        self.time_step = 0
Example #22
 def __init__(self, env, args):
     self.direction = args.direction
     self.env = env
     self.num_episodes = args.episodes
     self.episode_start = 0
     self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
     self.noise_decay = args.noise_decay
     self.count_exp_replay = 0
     self.train_iteration = 0
     self.tau = args.TAU
     self.tools = Tools()
Example #23
File: main.py Project: wenjiebit/FCMADRL
def main():
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                 CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environmet (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation,reward,done,info=env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
    total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
Example #24
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high


        # Set the learning rate suggested by paper:  https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # initialize targets model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta,
                   self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000

        self.batch_size = 64

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01 # for soft update of target parameters

        # Indicate if we want to learn (or use to predict without learn)
        self.set_train(train)
Example #25
def main():
    env = Env(19997)
    steps= 10000
    num_states = 59
    num_actions = 3

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter=0
    reward_per_episode = 0    
    total_reward=0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')
      
    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done =False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            
            for i in range(num_actions):
                if action[i] > 1.0:
                    action[i] = 1.0
                if action[i] < -1.0:
                    action[i] = -1.0

            observation,reward,done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('Episode',i,'Steps: ',t,'Episode Reward:',reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
    total_reward+=reward_per_episode            
Example #26
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #27
    def __init__(self):
        super(FirstAgent, self).__init__()

        # actor models
        self.actor_local = None
        self.actor_target = None

        # critic models
        self.critic_local = None
        self.critic_target = None

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #28
    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #29
    def __init__(self, env, device):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        self.device=device
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        
        self.actor_network = ActorNetwork(self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #30
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())
Example #31
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001  # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None
Example #32
	def create_networks_and_training_method(self,state_dim,action_dim):

		theta_p = networks.theta_p(state_dim,action_dim)
		theta_q = networks.theta_q(state_dim,action_dim)
		target_theta_p,target_update_p = self.exponential_moving_averages(theta_p,TAU)
		target_theta_q,target_update_q = self.exponential_moving_averages(theta_q,TAU)

		self.state = tf.placeholder(tf.float32,[None,state_dim],'state')
		self.action_test = networks.policy_network(self.state,theta_p)

		# Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
		self.exploration = OUNoise(action_dim)
		noise = self.exploration.noise()
		self.action_exploration = self.action_test + noise

		q = networks.q_network(self.state,self.action_test,theta_q)
		# policy optimization
		mean_q = tf.reduce_mean(q)
		weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p])  
		loss_p = -mean_q + weight_decay_p

		optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE)
		grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p)
		optimize_p = optim_p.apply_gradients(grads_and_vars_p)
		with tf.control_dependencies([optimize_p]):
			self.train_p = tf.group(target_update_p)

		# q optimization
		self.action_train = tf.placeholder(tf.float32,[None,action_dim],'action_train')
		self.reward = tf.placeholder(tf.float32,[None],'reward')
		self.next_state = tf.placeholder(tf.float32,[None,state_dim],'next_state')
		self.done = tf.placeholder(tf.bool,[None],'done')

		q_train = networks.q_network(self.state,self.action_train,theta_q)
		next_action = networks.policy_network(self.next_state,theta=target_theta_p)
		next_q = networks.q_network(self.next_state,next_action,theta=target_theta_q)
		q_target = tf.stop_gradient(tf.select(self.done,self.reward,self.reward + GAMMA * next_q))

		# q loss
		q_error = tf.reduce_mean(tf.square(q_target - q_train))
		weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q])
		loss_q = q_error + weight_decay_q

		optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE)
		grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q)
		optimize_q = optim_q.apply_gradients(grads_and_vars_q)
		with tf.control_dependencies([optimize_q]):
			self.train_q = tf.group(target_update_q)

		tf.scalar_summary("loss_q",loss_q)
		tf.scalar_summary("loss_p",loss_p)
		tf.scalar_summary("q_mean",mean_q)
		global merged_summary_op
		merged_summary_op = tf.merge_all_summaries()
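
Example #32 builds its target parameters through self.exponential_moving_averages(theta, TAU), which is not shown. In TF1 this is usually done with tf.train.ExponentialMovingAverage, roughly as in the sketch below (an assumed implementation, not the project's code):

	def exponential_moving_averages(self, theta, tau):
		# Maintain target parameters as an exponential moving average of theta.
		ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
		update = ema.apply(theta)  # op that refreshes the averages
		averages = [ema.average(var) for var in theta]  # the target parameters
		return averages, update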
Example #33
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return
Example #34
File: ddpg.py Project: Ivehui/DDPG
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
Example #35
File: ddpg.py Project: ChampionZP/DDPG
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #36
    def __init__(self, env, args):
        self.action_dim = env.action_space.shape[0]
        self.state_dim = env.observation_space.shape[0]

        self.actor_lr = args.a_lr
        self.critic_lr = args.c_lr

        self.gamma = args.gamma

        # Ornstein-Uhlenbeck noise parameters
        self.ou = OUNoise(
            self.action_dim, theta=args.noise_theta, sigma=args.noise_sigma)

        self.replay_buffer = deque(maxlen=args.buffer_size)
        self.replay_start_size = args.replay_start_size

        self.batch_size = args.batch_size

        self.target_update_rate = args.target_update_rate
        self.total_parameters = 0
        self.global_steps = 0
        self.reg_param = args.reg_param
Example #37
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim == 1, the action is a number rather than an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
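
Note on the TD target used in train() above: the y_batch loop implements y_i = r_i + GAMMA * Q'(s'_i, mu'(s'_i)), dropping the bootstrap term on terminal transitions. A minimal vectorized NumPy sketch of the same computation (the array names here are illustrative, not taken from the code above):

import numpy as np

def td_targets(reward_batch, target_q_batch, done_batch, gamma=0.99):
    # reward_batch: (N,), target_q_batch: (N, 1) from the target critic, done_batch: (N,) bools
    q_next = np.asarray(target_q_batch, dtype=np.float32).reshape(-1)
    mask = 1.0 - np.asarray(done_batch, dtype=np.float32)  # zero out bootstrapping at episode ends
    y = np.asarray(reward_batch, dtype=np.float32) + gamma * mask * q_next
    return y.reshape(-1, 1)  # same (BATCH_SIZE, 1) shape the critic training expects
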
Example #38
0
File: ddpg.py Project: Tomakko/neurobotics
    def __init__(self):

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action, the previous action, and the previous state used when storing experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object that keeps actions within the action bounds
            self.action_bounds = [[0.3, 0.3],
                                  [-0.3, -0.3]]
            self.grad_inv = GradInverter(self.action_bounds)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(os.path.expanduser('~')+'/tensorboard_data')

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(self.session.graph, self.session, BATCH_SIZE)

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())
                self.critic_network.restore_pretrained_weights(FILTER_LOAD_PATH)
                self.actor_network.restore_pretrained_weights(FILTER_LOAD_PATH)

            threads = tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been built, add it to the summary writer
            self.summary_writer.add_graph(self.graph)
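
The OUNoise(action_dim, MU, THETA, SIGMA) objects used throughout these examples are not shown in this listing. A common minimal implementation of the Ornstein-Uhlenbeck process looks roughly like the sketch below; the parameter defaults are illustrative, and the projects above pass their own MU, THETA and SIGMA:

import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
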
Example #39
0
class DDPG(object):

    def __init__(self, env, args):
        self.action_dim = env.action_space.shape[0]
        self.state_dim = env.observation_space.shape[0]

        self.actor_lr = args.a_lr
        self.critic_lr = args.c_lr

        self.gamma = args.gamma

        # Ornstein-Uhlenbeck noise parameters
        self.ou = OUNoise(
            self.action_dim, theta=args.noise_theta, sigma=args.noise_sigma)

        self.replay_buffer = deque(maxlen=args.buffer_size)
        self.replay_start_size = args.replay_start_size

        self.batch_size = args.batch_size

        self.target_update_rate = args.target_update_rate
        self.total_parameters = 0
        self.global_steps = 0
        self.reg_param = args.reg_param

    def construct_model(self, gpu):
        if gpu == -1:  # use CPU
            device = '/cpu:0'
            sess_config = tf.ConfigProto()
        else:  # use GPU
            device = '/gpu:' + str(gpu)
            sess_config = tf.ConfigProto(
                log_device_placement=True, allow_soft_placement=True)
            sess_config.gpu_options.allow_growth = True

        self.sess = tf.Session(config=sess_config)

        with tf.device(device):
            # output action, q_value and gradients of q_val w.r.t. action
            with tf.name_scope('predict_actions'):
                self.states = tf.placeholder(
                    tf.float32, [None, self.state_dim], name='states')
                self.action = tf.placeholder(
                    tf.float32, [None, self.action_dim], name='action')
                self.is_training = tf.placeholder(tf.bool, name='is_training')

                self.action_outputs, self.actor_params = self._build_actor(
                    self.states, scope='actor_net', bn=True)
                self.value_outputs, self.critic_params = self._build_critic(
                    self.states, self.action, scope='critic_net', bn=False)
                self.action_gradients = tf.gradients(
                    self.value_outputs, self.action)[0]

            # estimate target_q for the critic update
            with tf.name_scope('estimate_target_q'):
                self.next_states = tf.placeholder(
                    tf.float32, [None, self.state_dim], name='next_states')
                self.mask = tf.placeholder(tf.float32, [None], name='mask')
                self.rewards = tf.placeholder(tf.float32, [None], name='rewards')

                # target actor network
                self.t_action_outputs, self.t_actor_params = self._build_actor(
                    self.next_states, scope='t_actor_net', bn=True,
                    trainable=False)
                # target critic network
                self.t_value_outputs, self.t_critic_params = self._build_critic(
                    self.next_states, self.t_action_outputs, bn=False,
                    scope='t_critic_net', trainable=False)

                self.target_q = self.rewards + self.gamma * \
                    (self.t_value_outputs[:, 0] * self.mask)

            with tf.name_scope('compute_gradients'):
                self.actor_opt = tf.train.AdamOptimizer(self.actor_lr)
                self.critic_opt = tf.train.AdamOptimizer(self.critic_lr)

                # critic gradients
                td_error = self.target_q - self.value_outputs[:, 0]
                critic_mse = tf.reduce_mean(tf.square(td_error))
                critic_reg = tf.reduce_sum(
                    [tf.nn.l2_loss(v) for v in self.critic_params])
                critic_loss = critic_mse + self.reg_param * critic_reg
                self.critic_gradients = \
                    self.critic_opt.compute_gradients(
                        critic_loss, self.critic_params)
                # actor gradients
                self.q_action_grads = tf.placeholder(
                    tf.float32, [None, self.action_dim], name='q_action_grads')
                actor_gradients = tf.gradients(
                    self.action_outputs, self.actor_params,
                    -self.q_action_grads)
                self.actor_gradients = zip(actor_gradients, self.actor_params)
                # apply gradient to update model
                self.train_actor = self.actor_opt.apply_gradients(
                    self.actor_gradients)
                self.train_critic = self.critic_opt.apply_gradients(
                    self.critic_gradients)

            with tf.name_scope('update_target_networks'):
                # batch norm parameters should not be included when updating!
                target_networks_update = []

                for v_source, v_target in zip(
                        self.actor_params, self.t_actor_params):
                    update_op = v_target.assign_sub(
                        0.001 * (v_target - v_source))
                    target_networks_update.append(update_op)

                for v_source, v_target in zip(
                        self.critic_params, self.t_critic_params):
                    update_op = v_target.assign_sub(
                        0.01 * (v_target - v_source))
                    target_networks_update.append(update_op)

                self.target_networks_update = tf.group(*target_networks_update)

            with tf.name_scope('total_numbers_of_parameters'):
                for v in tf.trainable_variables():
                    shape = v.get_shape()
                    param_num = 1
                    for d in shape:
                        param_num *= d.value
                    print(v.name, ' ', shape, ' param nums: ', param_num)
                    self.total_parameters += param_num
                print('Total nums of parameters: ', self.total_parameters)

    def sample_action(self, states, noise):
        # is_training is supposed to be False when sampling actions.
        action = self.sess.run(
            self.action_outputs,
            feed_dict={self.states: states, self.is_training: False})
        ou_noise = self.ou.noise() if noise else 0

        return action + ou_noise

    def store_experience(self, s, a, r, next_s, done):
        self.replay_buffer.append([s, a[0], r, next_s, done])
        self.global_steps += 1

    def update_model(self):

        if len(self.replay_buffer) < self.replay_start_size:
            return

        # get batch
        batch = random.sample(self.replay_buffer, self.batch_size)
        s, _a, r, next_s, done = np.vstack(batch).T.tolist()
        mask = ~np.array(done)

        # compute a = u(s)
        a = self.sess.run(self.action_outputs, {
            self.states: s,
            self.is_training: True
        })
        # gradients of q_value w.r.t action a
        dq_da = self.sess.run(self.action_gradients, {
            self.states: s,
            self.action: a,
            self.is_training: True
        })
        # train
        self.sess.run([self.train_actor, self.train_critic], {
            # train_actor feed
            self.states: s,
            self.is_training: True,
            self.q_action_grads: dq_da,
            # train_critic feed
            self.next_states: next_s,
            self.action: _a,
            self.mask: mask,
            self.rewards: r
        })
        # update target network
        self.sess.run(self.target_networks_update)

    def _build_actor(self, states, scope, bn=False, trainable=True):
        h1_dim = 400
        h2_dim = 300
        init = tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_IN', uniform=True)

        with tf.variable_scope(scope):
            if bn:
                states = self.batch_norm(
                    states, self.is_training, tf.identity,
                    scope='actor_bn_states', trainable=trainable)
            h1 = tcl.fully_connected(
                states, h1_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='actor_h1')

            if bn:
                h1 = self.batch_norm(
                    h1, self.is_training, tf.nn.relu, scope='actor_bn_h1',
                    trainable=trainable)
            else:
                h1 = tf.nn.relu(h1)

            h2 = tcl.fully_connected(
                h1, h2_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='actor_h2')
            if bn:
                h2 = self.batch_norm(
                    h2, self.is_training, tf.nn.relu, scope='actor_bn_h2',
                    trainable=trainable)
            else:
                h2 = tf.nn.relu(h2)

            # use tanh to bound the action
            a = tcl.fully_connected(
                h2, self.action_dim, activation_fn=tf.nn.tanh,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-4, 3e-4),
                trainable=trainable, scope='actor_out')

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)

        return a, params

    def _build_critic(self, states, action, scope, bn=False, trainable=True):
        h1_dim = 400
        h2_dim = 300
        init = tf.contrib.layers.variance_scaling_initializer(
            factor=1.0, mode='FAN_IN', uniform=True)
        with tf.variable_scope(scope):
            if bn:
                states = self.batch_norm(
                    states, self.is_training, tf.identity,
                    scope='critic_bn_state', trainable=trainable)
            h1 = tcl.fully_connected(
                states, h1_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable, scope='critic_h1')
            if bn:
                h1 = self.batch_norm(
                    h1, self.is_training, tf.nn.relu, scope='critic_bn_h1',
                    trainable=trainable)
            else:
                h1 = tf.nn.relu(h1)

            # the action skips the first layer and is concatenated into the second
            h1 = tf.concat([h1, action], 1)

            h2 = tcl.fully_connected(
                h1, h2_dim, activation_fn=None, weights_initializer=init,
                biases_initializer=init, trainable=trainable,
                scope='critic_h2')

            if bn:
                h2 = self.batch_norm(
                    h2, self.is_training, tf.nn.relu, scope='critic_bn_h2',
                    trainable=trainable)
            else:
                h2 = tf.nn.relu(h2)

            q = tcl.fully_connected(
                h2, 1, activation_fn=None,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-4, 3e-4),
                trainable=trainable, scope='critic_out')

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
        return q, params

    def batch_norm(self, x, is_training, activation_fn, scope, trainable=True):
        # switch between the training and inference batch-norm branches; the inference branch reuses the scope
        return tf.cond(
            is_training,
            lambda: tf.contrib.layers.batch_norm(
                x,
                activation_fn=activation_fn,
                center=True,
                scale=True,
                updates_collections=None,
                is_training=True,
                reuse=None,
                scope=scope,
                decay=0.9,
                epsilon=1e-5,
                trainable=trainable),
            lambda: tf.contrib.layers.batch_norm(
                x,
                activation_fn=activation_fn,
                center=True,
                scale=True,
                updates_collections=None,
                is_training=False,
                reuse=True,  # scope must be given so the variables can be reused
                scope=scope,
                decay=0.9,
                epsilon=1e-5,
                trainable=trainable))
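
The update_target_networks block in the example above performs a soft (Polyak) update, theta_target <- theta_target + tau * (theta - theta_target), with tau = 0.001 for the actor and 0.01 for the critic. For reference, the same rule written against plain NumPy arrays (names are illustrative):

import numpy as np

def soft_update(target_params, source_params, tau):
    # mirrors v_target.assign_sub(tau * (v_target - v_source)) above
    for target, source in zip(target_params, source_params):
        target -= tau * (target - source)  # in-place update of each target parameter array

# usage sketch
actor_params = [np.random.randn(4, 2), np.random.randn(2)]
actor_target_params = [np.zeros((4, 2)), np.zeros(2)]
soft_update(actor_target_params, actor_params, tau=0.001)
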
Example #40
0
    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action, the previous action, and the previous state used when storing experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object that keeps actions within the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been built, add it to the summary writer
            self.summary_writer.add_graph(self.graph)
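
The GradInverter used in this example (and in Example #41 below) implements the gradient-inverting scheme for bounded action spaces: a gradient that would push an action toward a bound is scaled down by how much room remains before that bound. The projects above build this as TensorFlow ops; a rough NumPy sketch of the idea (the function and argument names are assumptions, and the real class may differ):

import numpy as np

def invert_gradients(grads, actions, a_min, a_max):
    # Scale dQ/da so the resulting actor update keeps actions inside [a_min, a_max].
    grads = np.array(grads, dtype=np.float64)
    actions = np.asarray(actions, dtype=np.float64)
    width = np.asarray(a_max, dtype=np.float64) - np.asarray(a_min, dtype=np.float64)
    increasing = grads > 0  # gradient wants to push the action up
    grads[increasing] *= ((a_max - actions) / width)[increasing]
    grads[~increasing] *= ((actions - a_min) / width)[~increasing]
    return grads

Here a_min and a_max would hold the per-dimension bounds that A0_BOUNDS and A1_BOUNDS describe above.
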
Example #41
0
class DDPG:

    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action, the previous action, and the previous state used when storing experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object that keeps actions within the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been built, add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, let's train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale the noise linearly down to 0 by training step MAX_NOISE_STEP
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise()
            # if action value lies outside of action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0]/self.action[0])
            if self.action[1] < A1_BOUNDS[0] or self.action[1] > A1_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0]/self.action[1])

        # Live Q-value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save the old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = int(30 * q_value[0][0] + 30)  # int so it can be used for string repetition below
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print '[' + stroke_pos * string + '|' + (60-stroke_pos) * string + ']', "Q: ", q_value[0][0], \
            "\tt: ", self.training_step
Example #42
0
File: main.py Project: zxqzhang/ddpg-aigym
def main():
    experiment= 'InvertedPendulum-v1'
    env= gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        observation = env.reset()
    
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            #env.render()

            x = observation
            #select action using actor network model (reshape the observation into a batch of one state)
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            print 'Agent.Action :',action

            observation,reward,done,info = env.step(action)
            #add (s_t, a_t, r_t, s_t+1) to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()            
            
            reward_per_episode+=reward
            
            counter+=1
            #check if episode ends:
            if done:
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n'
                break
        # accumulate each episode's reward so the final average is correct
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
Example #43
0
File: ddpg.py Project: ChampionZP/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # reshape in case action_dim == 1 and the actions were stored as scalars
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Once the buffer holds more than REPLAY_START_SIZE transitions, start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
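
The ReplayBuffer class referenced in this and several earlier examples (add, get_batch, count / size) is not included in the listing. A minimal deque-based version consistent with those calls might look like the following; the real class in these projects may differ in details:

import random
from collections import deque

class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # uniform sampling; callers unpack (s, a, r, s', done) tuples
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)

    def size(self):
        return len(self.buffer)
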
Example #44
0
File: ddpg.py Project: Ivehui/DDPG
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self,observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch,[BATCH_SIZE,1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch)
        for i in range(0,BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high)

    def set_feedback(self,observation,action,reward,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append((self.state,action,reward,next_state,done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Once more than REPLAY_START_SIZE transitions have been stored, start training
        if self.time_step >  REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Example #45
0
class DDPG:
	def __init__(self, env):
		self.name = 'DDPG' # name for uploading results
		self.environment = env
		
		state_dim = env.observation_space.shape[0]
		action_dim = env.action_space.shape[0]
		# Initialize time step
		self.time_step = 0
		# initialize replay buffer
		self.replay_buffer = deque()
		# initialize networks
		self.create_networks_and_training_method(state_dim,action_dim)

		self.sess = tf.InteractiveSession()
		self.sess.run(tf.initialize_all_variables())

		# loading networks
		self.saver = tf.train.Saver()
		checkpoint = tf.train.get_checkpoint_state("saved_networks")
		if checkpoint and checkpoint.model_checkpoint_path:
				self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
				print "Successfully loaded:", checkpoint.model_checkpoint_path
		else:
				print "Could not find old network weights"

		global summary_writer
		summary_writer = tf.train.SummaryWriter('~/logs',graph=self.sess.graph)
	
	def create_networks_and_training_method(self,state_dim,action_dim):

		theta_p = networks.theta_p(state_dim,action_dim)
		theta_q = networks.theta_q(state_dim,action_dim)
		target_theta_p,target_update_p = self.exponential_moving_averages(theta_p,TAU)
		target_theta_q,target_update_q = self.exponential_moving_averages(theta_q,TAU)

		self.state = tf.placeholder(tf.float32,[None,state_dim],'state')
		self.action_test = networks.policy_network(self.state,theta_p)

		# Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
		self.exploration = OUNoise(action_dim)
		noise = self.exploration.noise()
		self.action_exploration = self.action_test + noise

		q = networks.q_network(self.state,self.action_test,theta_q)
		# policy optimization
		mean_q = tf.reduce_mean(q)
		weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p])  
		loss_p = -mean_q + weight_decay_p

		optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE)
		grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p)
		optimize_p = optim_p.apply_gradients(grads_and_vars_p)
		with tf.control_dependencies([optimize_p]):
			self.train_p = tf.group(target_update_p)

		# q optimization
		self.action_train = tf.placeholder(tf.float32,[None,action_dim],'action_train')
		self.reward = tf.placeholder(tf.float32,[None],'reward')
		self.next_state = tf.placeholder(tf.float32,[None,state_dim],'next_state')
		self.done = tf.placeholder(tf.bool,[None],'done')

		q_train = networks.q_network(self.state,self.action_train,theta_q)
		next_action = networks.policy_network(self.next_state,theta=target_theta_p)
		next_q = networks.q_network(self.next_state,next_action,theta=target_theta_q)
		q_target = tf.stop_gradient(tf.select(self.done,self.reward,self.reward + GAMMA * next_q))

		# q loss
		q_error = tf.reduce_mean(tf.square(q_target - q_train))
		weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q])
		loss_q = q_error + weight_decay_q

		optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE)
		grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q)
		optimize_q = optim_q.apply_gradients(grads_and_vars_q)
		with tf.control_dependencies([optimize_q]):
			self.train_q = tf.group(target_update_q)

		tf.scalar_summary("loss_q",loss_q)
		tf.scalar_summary("loss_p",loss_p)
		tf.scalar_summary("q_mean",mean_q)
		global merged_summary_op
		merged_summary_op = tf.merge_all_summaries()

	def train(self):
		#print "train step",self.time_step
		# Sample a random minibatch of N transitions from replay buffer
		minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
		state_batch = [data[0] for data in minibatch]
		action_batch = [data[1] for data in minibatch]
		reward_batch = [data[2] for data in minibatch]
		next_state_batch = [data[3] for data in minibatch]
		done_batch = [data[4] for data in minibatch]

		_,_,summary_str = self.sess.run([self.train_p,self.train_q,merged_summary_op],feed_dict={
			self.state:state_batch,
			self.action_train:action_batch,
			self.reward:reward_batch,
			self.next_state:next_state_batch,
			self.done:done_batch
			})

		summary_writer.add_summary(summary_str,self.time_step)

		# save the network every 1000 iterations
		if self.time_step % 1000 == 0:
			self.saver.save(self.sess, 'saved_networks/' + 'network' + '-ddpg', global_step = self.time_step)

	def noise_action(self,state):
		# Select action a_t according to the current policy and exploration noise
		action = self.sess.run(self.action_exploration,feed_dict={
			self.state:[state]
			})[0]
		return np.clip(action,self.environment.action_space.low,self.environment.action_space.high)

	def action(self,state):
		action = self.sess.run(self.action_test,feed_dict={
			self.state:[state]
			})[0]
		return np.clip(action,self.environment.action_space.low,self.environment.action_space.high)

	def perceive(self,state,action,reward,next_state,done):
		# Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
		self.replay_buffer.append((state,action,reward,next_state,done))
		# Update time step
		self.time_step += 1

		# Limit the replay buffer size
		if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
			self.replay_buffer.popleft()

		# Once more than REPLAY_START_SIZE transitions have been stored, start training
		if self.time_step >  REPLAY_START_SIZE:
			self.train()

		# Re-initialize the random process when an episode ends
		if done:
			self.exploration.reset()

	# soft target updates via exponential moving averages (decay = 1 - tau)
	def exponential_moving_averages(self,theta, tau=0.001):
		ema = tf.train.ExponentialMovingAverage(decay=1 - tau)
		update = ema.apply(theta)  # also creates shadow vars
		averages = [ema.average(x) for x in theta]
		return averages, update
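
A closing note on exponential_moving_averages() above: tf.train.ExponentialMovingAverage with decay = 1 - TAU maintains shadow <- decay * shadow + (1 - decay) * theta, which is algebraically the same soft target update theta' <- theta' + tau * (theta - theta') used in the other examples. A quick numeric check (illustrative only):

import numpy as np

tau = 0.001
theta = np.array([1.0, -2.0, 0.5])
shadow = np.zeros_like(theta)

ema_form = (1 - tau) * shadow + tau * theta       # EMA with decay = 1 - tau
polyak_form = shadow + tau * (theta - shadow)     # soft target update
assert np.allclose(ema_form, polyak_form)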