Example #1
    def __init__(self):
        super(FirstAgent, self).__init__()

        # actor models
        self.actor_local = None
        self.actor_target = None

        # critic models
        self.critic_local = None
        self.critic_target = None

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
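Examples #1 and #14 build the noise as OUNoise(action_size, mu, theta, sigma) but never show the class itself. A minimal sketch of such an Ornstein-Uhlenbeck process is given below; the sample() method name and the implicit unit time step are assumptions, not taken from any of the projects above.

import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch): x <- x + theta*(mu - x) + sigma*N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process from its long-run mean at the beginning of an episode.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Mean-reverting drift plus Gaussian diffusion (dt is implicitly 1).
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state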
Example #2
    def __init__(self, state_size, action_size, agent_id):
        """Initialize a DDPGAgent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): identifier for this agent
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.agent_id = agent_id

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Make sure that the target-local model pairs are initialized to the
        # same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        self.noise = OUNoise(action_size)
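Example #2 calls self.hard_update(...) to copy the freshly initialized local weights into the target networks, but the helper itself is not shown. A common PyTorch implementation, together with the tau-weighted soft update used later in training, might look like the following sketch (the bodies are assumptions; only the name hard_update comes from the snippet).

    def hard_update(self, local_model, target_model):
        # Copy every parameter of the local network into the target network.
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        # Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)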
Example #3
    def __init__(self, state_size, action_size, seed):
        self.gradient_clipping = True
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.config = Config()
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.config.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.config.LR_CRITIC,
            weight_decay=self.config.WEIGHT_DECAY)

        self.noise = OUNoise(action_size, seed)

        self.memory = ReplayBuffer(action_size, self.config.BUFFER_SIZE,
                                   self.config.BATCH_SIZE, seed, device)
        self.step_count = 0
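Example #3 (like several others) constructs ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) without showing the buffer class. A minimal deque-based sketch with that constructor signature is below; it assumes (state, action, reward, next_state, done) tuples and is not the project's actual implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        # action_size is kept only to match the constructor calls above.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a minibatch and move it to the training device.
        experiences = random.sample(self.memory, k=self.batch_size)

        def to_tensor(rows):
            return torch.from_numpy(np.vstack(rows)).float().to(self.device)

        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = to_tensor([float(e.done) for e in experiences])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)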
Example #4
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)
Example #5
File: ddpg.py Project: inikiforovski/DDPG
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        # self.action_dim = env.action_space.shape[0]
        self.state_dim = env.state_size
        self.action_dim = env.action_size
        self.action_bound = (env.action_high - env.action_low) / 2

        print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
              'action_bound: ', self.action_bound)

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.action_bound)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.action_bound)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
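Example #5 derives action_bound as half of the action range. A typical way such a bound is used at action-selection time is to rescale a tanh-squashed actor output and add the exploration noise before clipping; the sketch below assumes method names such as actor_network.action() and noise() that may differ from the project.

    def noisy_action(self, state):
        # Assumes `import numpy as np` and an actor output squashed into [-1, 1] by tanh.
        action = self.actor_network.action(state)
        center = (self.environment.action_high + self.environment.action_low) / 2
        # Rescale to the environment's range and perturb with OU noise for exploration.
        action = center + self.action_bound * action + self.exploration_noise.noise()
        return np.clip(action, self.environment.action_low,
                       self.environment.action_high)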
Example #6
    def __init__(self, shape_in, num_output, accele_range, angle_range):
        self.input_shape = shape_in
        self.out_shape = num_output
        self.learning_rate_a = LEARNING_RATE_ACTOR
        self.learning_rate_c = LEARNING_RATE_CRITIC
        self.memory = deque(maxlen=MAX_MEMORY_LEN)
        self.train_start = 200
        self.batch_size = 64
        self.gamma = 0.9
        self.sigma_fixed = 2
        self.channel = CHANNEL
        self.critic_input_action_shape = 1
        self.angle_range = angle_range
        self.accele_range = accele_range
        self.actor_model = self.actor_net_builder()
        self.critic_model = self.critic_net_build()
        self.actor_target_model = self.actor_net_builder()
        self.critic_target_model = self.critic_net_build()
        self.OUnoise = OUNoise(2)
        # self.actor_target_model.trainable = False
        # self.critic_target_model.trainable = False

        self.actor_history = []
        self.critic_history = []
        self.reward_history = []
        self.weight_hard_update()
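Example #6 ends with self.weight_hard_update() to synchronize the Keras target models. With get_weights()/set_weights() this is usually a one-line copy per model; the body below is an assumption consistent with the attribute names used in the snippet.

    def weight_hard_update(self):
        # Copy the online networks' weights into the target networks (Keras API).
        self.actor_target_model.set_weights(self.actor_model.get_weights())
        self.critic_target_model.set_weights(self.critic_model.get_weights())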
Example #7
    def reset(self):
        # Called at the start of every episode.
        # Reset the variables used during an episode.

        # To move the robot back to the origin at every episode, uncomment the lines below.
        self.robot_tf = Transformation()
        self.joint1_tf = Transformation()
        self.link1_tf = Transformation(translation=(self.link1_len, 0))
        self.joint2_tf = Transformation()
        self.link2_tf = Transformation(translation=(self.link2_len, 0))
        self.link1_tf_global = self.robot_tf * self.joint1_tf * self.link1_tf
        self.link2_tf_global = self.link1_tf_global * self.joint2_tf * self.link2_tf

        self.step_count = 0.00

        # Generate the target point
        self.target_tf = Transformation(translation=(
            random.randrange(-self.env_boundary, self.env_boundary),
            random.randrange(-self.env_boundary, self.env_boundary)))
        self.ou = OUNoise(dt=self.dt, theta=0.1, sigma=0.2)

        self.done = False
        self.t = 0
        self.buffer = []  # Buffer for visualization; cleared every time the episode is reset.

        # Unlike the step() function, reset() returns only the initial state.
        return self._get_state()
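Example #7 passes an explicit time step to OUNoise(dt=..., theta=..., sigma=...). A time-discretized variant of the process scales the drift by dt and the diffusion by sqrt(dt) (Euler-Maruyama); the sketch below, including the default size of 1 and the sample() name, is an assumption rather than the project's class.

import numpy as np


class OUNoise:
    """Time-step-aware Ornstein-Uhlenbeck process (sketch)."""

    def __init__(self, size=1, mu=0.0, dt=0.01, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.dt = dt
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def sample(self):
        # Euler-Maruyama step: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*dW
        dx = self.theta * (self.mu - self.state) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state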
Example #8
    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()
Example #9
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
Example #10
    def __init__(self, env):
        self.action_dim = env.action_space.shape[0]
        self.state_dim = env.observation_space.shape[0]
        self.h1_dim = 400
        self.h2_dim = 300

        self.actor_learning_rate = 1e-4
        self.critic_learning_rate = 1e-3

        self.gamma = 0.99

        # Ornstein-Uhlenbeck noise parameters
        self.noise_theta = 0.15
        self.noise_sigma = 0.20
        self.ou = OUNoise(self.action_dim,
                          theta=self.noise_theta,
                          sigma=self.noise_sigma)

        self.replay_buffer_size = 1000000
        self.replay_buffer = deque(maxlen=self.replay_buffer_size)
        self.replay_start_size = 1000

        self.batch_size = 64

        self.target_update_rate = 0.001
        self.total_parameters = 0
        self.global_steps = 0
        self.reg_param = 0.01
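Example #10 keeps the replay buffer as a plain deque and only starts learning once replay_start_size transitions have been collected. A typical sampling step under those assumptions is sketched below; the method name and the tuple layout are illustrative, not taken from the project.

    def sample_minibatch(self):
        # Assumes `import random` and `import numpy as np`, and that each entry
        # in self.replay_buffer is a (state, action, reward, next_state, done) tuple.
        if len(self.replay_buffer) < self.replay_start_size:
            return None
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones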
Example #11
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size,
                                          hyperparams.batch_size, random_seed)

        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )
Example #12
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')
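Example #12 copies the global parameters into each worker's networks with update_target_graph. In TF1-style graph code this helper usually gathers the trainable variables of both scopes and returns the corresponding assign ops; the sketch below is that common pattern, not necessarily the project's exact helper.

import tensorflow as tf  # TF 1.x graph-mode API assumed, matching the snippets above


def update_target_graph(from_scope, to_scope):
    # Collect the trainable variables of the two scopes and build ops that copy
    # the source values into the destination variables.
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]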
Example #13
    def __init__(self, env, state_dim=None):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # along with their target networks
        if state_dim:
            self.state_dim = state_dim
            print(self.state_dim)
        else:
            self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        # Flag to signal save
        self.not_saved = True

        #For normalisation
        self.state_mean = 0
        self.state_std = 1
        self.target_mean = 0
        self.target_std = 1
Example #14
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
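Example #14 wires the OU noise and replay memory into a Keras-based agent; when acting, such an agent typically queries actor_local.model and adds a noise sample before clipping to the task's action range. A hedged sketch follows (noise.sample() and the predict interface are assumptions):

    def act(self, state):
        # Assumes `import numpy as np`; predict the greedy action for a single
        # state and perturb it with OU noise for exploration.
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(np.clip(action + self.noise.sample(),
                            self.action_low, self.action_high))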
Example #15
    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):

        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)
Example #16
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 n_hidden_units=128,
                 n_layers=3):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=3e-4,
                                     weight_decay=0.0001)

        # will add noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)
Example #17
File: ddpg.py Project: ivychill/ltr
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")
Example #18
File: main.py Project: mmarklar/ddpg-aigym
def main():
    experiment = 'model-builder-v0'  #specify environments here
    env = gym.make(experiment)
    #steps= env.spec.timestep_limit #steps per episode
    steps = 20
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor networks and the replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode  # accumulate inside the episode loop so the average below is meaningful
    print("Average reward per episode {}".format(total_reward / episodes))
Example #19
    def learn(self, total_timesteps, callback):
        ou_scale = 1.0  # initial scaling factor
        ou_decay = 0.9995  # decay of the scaling factor ou_scale
        ou_mu = 0.0  # asymptotic mean of the noise
        ou_theta = 0.15  # magnitude of the drift term
        ou_sigma = 0.20  # magnitude of the diffusion term
        # this slowly decreases to 0
        # create the noise process
        noise_process = OUNoise(self.action_size, ou_mu, ou_theta, ou_sigma)
        # create the replay buffer
        buffer = ReplayBuffer(seed=self.seed, action_size=self.action_size, buffer_size=self.buffer_size,
                              batch_size=self.batch_size, device=self.device)
        self.t_step = 0
        episode = 0
        while self.t_step < total_timesteps:
            callback.on_start_episode(episode)
            episode_scores = np.zeros(self.env.num_agents)
            states, _, _ = self.env.reset()
            scores = np.zeros(2)
            while True:
                states = np.reshape(states, (1, 48))  # reshape so we can feed both agents' states to each agent
                # split the states into the parts observed by each agent
                states_0 = states[0, :24].reshape((1, 24))
                states_1 = states[0, 24:].reshape((1, 24))
                # generate noise
                noise = ou_scale * noise_process.get_noise().reshape((1, 4))
                # split the noise into the parts for each agent
                noise_0 = noise[0, :2].reshape((1, 2))
                noise_1 = noise[0, 2:].reshape((1, 2))
                # determine actions for the unity agents from current state, using noise for exploration
                actions_0 = self.player_policy.act(states_0, use_target=False, add_noise=True, noise_value=noise_0)\
                    .detach().cpu().numpy()
                actions_1 = self.opponent_policy.act(states_1, use_target=False, add_noise=True, noise_value=noise_1)\
                    .detach().cpu().numpy()
                actions = np.vstack((actions_0.flatten(), actions_1.flatten()))
                # take the action in the environment
                next_states, rewards, dones, info = self.env.step(actions)
                # store (S, A, R, S') info in the replay buffer (memory)
                buffer.add(states.flatten(), actions.flatten(), rewards, next_states.flatten(), dones)
                episode_scores += rewards
                states = next_states
                self.t_step += 1
                """
                Policy learning
                """
                ## train the agents if we have enough replays in the buffer
                if len(buffer) >= self.batch_size:
                    self.player_policy.learn(buffer.sample(), self.opponent_policy)
                    self.opponent_policy.learn(buffer.sample(), self.player_policy)
                if np.any(dones):
                    break
            if not callback.on_step(np.max(episode_scores), self.t_step):
                break
            # decrease the scaling factor of the noise
            ou_scale *= ou_decay
            episode += 1
Example #20
    def __init__(self, model, env, sess, num_episodes, direction):
        self.model = model
        self.sess = sess
        self.direction = direction
        self.env = env
        self.num_episodes = num_episodes
        self.episode_start = 0
        self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
        self.noise_decay = 0.2
        self.epsilon = EPSILON
        self.epsilon_decay = nth_root(self.num_episodes, 0.001 / self.epsilon)
        self.count_exp_replay = 0
        self.tau = TAU
        self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
        self.actions_grads_ph = tf.placeholder(
            tf.float32, shape=((None, ) + self.env.action_space.shape))

        #train operation
        self.actor_train_ops = self.model.Actor.train_step(
            self.actions_grads_ph)
        self.critic_train_ops = self.model.Critic.train_step(self.target_Q_ph)

        #update operation
        self.update_critic_target = self.model.update_target_network(
            self.model.Critic.network_params,
            self.model.Critic_target.network_params, self.tau)
        self.update_actor_target = self.model.update_target_network(
            self.model.Actor.network_params,
            self.model.Actor_target.network_params, self.tau)
        sess.run(tf.initialize_all_variables())
        #for testing only
        self.sess.run(
            self.model.update_target_network(
                self.model.Critic.network_params,
                self.model.Critic_target.network_params))
        self.sess.run(
            self.model.update_target_network(
                self.model.Actor.network_params,
                self.model.Actor_target.network_params))

        # reward summary for tensorboard
        self.tf_reward = tf.Variable(0.0,
                                     trainable=False,
                                     name='reward_summary')
        self.tf_reward_summary = tf.summary.scalar("Reward by episode",
                                                   self.tf_reward)

        # time
        self.tf_time = tf.Variable(0.0,
                                   trainable=False,
                                   name='Time_per_episode')
        self.tf_time_summary = tf.summary.scalar("Time per episode",
                                                 self.tf_time)

        # writer
        self.writer = tf.summary.FileWriter('./graphs', self.sess.graph)
Example #21
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
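Example #21 keeps separate OU processes for the linear and the angular command. At action-selection time the two samples are typically added to the corresponding actor outputs and clipped to each command's range; in the sketch below both the actor call and the clipping ranges are assumptions.

    def noise_action(self, state):
        # Assumes `import numpy as np` and an actor that returns [linear, angular].
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0] + self.linear_noise.noise()[0], 0.0, 1.0)
        action[1] = np.clip(action[1] + self.angular_noise.noise()[0], -1.0, 1.0)
        return action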
Example #22
File: main.py Project: wenjiebit/FCMADRL
def main():
    # Randomly initialize critic, actor, target critic, target actor networks and the replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                 CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation,reward,done,info=env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode  # accumulate inside the episode loop so the average below is meaningful
    print "Average reward per episode {}".format(total_reward / episodes)
Example #23
    def __init__(self, env, state_size, action_size):
        self.env = env
        self.replay_memory = deque()
        self.actor_network = actor_network.ActorNetwork(
            state_size, action_size)
        self.critic_network = critic_network.CriticNetwork(
            state_size, action_size)

        self.ou_noise = OUNoise(action_size)

        self.time_step = 0
Example #24
    def __init__(self, env, args):
        self.direction = args.direction
        self.env = env
        self.num_episodes = args.episodes
        self.episode_start = 0
        self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
        self.noise_decay = args.noise_decay
        self.count_exp_replay = 0
        self.train_iteration = 0
        self.tau = args.TAU
        self.tools = Tools()
Example #25
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high


        # Set the learning rate suggested by paper:  https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta,
                             self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000

        self.batch_size = 64

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01 # for soft update of target parameters

        # Indicate if we want to learn (or use to predict without learn)
        self.set_train(train)
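Example #25 stores tau for the soft update of the target parameters. With the Keras-wrapped Actor/Critic objects used here (each exposing a .model attribute), the update is usually a weighted average of the weight lists; the method name soft_update below is an assumption, not the project's code.

    def soft_update(self, local_model, target_model):
        # Blend each weight tensor: w_target <- tau * w_local + (1 - tau) * w_target
        local_weights = local_model.model.get_weights()
        target_weights = target_model.model.get_weights()
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.model.set_weights(new_weights)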
Example #26
def main():
    env = Env(19997)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor networks and the replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')
      
    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            
            # Clip each action component to [-1, 1]; use a separate index so the
            # episode counter `i` from the outer loop is not overwritten.
            for k in range(num_actions):
                if action[k] > 1.0:
                    action[k] = 1.0
                if action[k] < -1.0:
                    action[k] = -1.0

            observation,reward,done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('Episode',i,'Steps: ',t,'Episode Reward:',reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
    total_reward += reward_per_episode
Example #27
    def __init__(self, sess, data_fname, replay=False):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        if replay:
            self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE,
                                              data_fname)
        self.summary_str2 = None

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #28
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())
Example #29
    def __init__(self, env, device):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.device = device
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        
        self.actor_network = ActorNetwork(self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Example #30
    def __init__(self, state_space, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.sess = tf.Session()

        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_space = state_space
        self.action_dim = action_dim  # 1

        self.ac_network = ActorCriticNetwork(self.sess, self.state_space,
                                             self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)