Example #1
    def __init__(self, state_size=24, action_size=2, random_seed=0):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        #Actor network
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        #Noise process
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process

        #Replay memory
        self.memory = ReplayBuffer(
            self.action_size, BUFFER_SIZE, MINI_BATCH,
            random_seed)  #define experience replay buffer object
Example #2
    def __init__(
            self,
            task,
            actor_params={},
            critic_params={},
            noise_params={},
            replay_memory_params={},
            algo_params={}
            ):

        # Default Params
        default_actor_params = {'lr': .001}
        default_critic_params = {'lr': .001}
        default_noise_params = {'mu': 0, 'theta': .15, 'sigma': .2}
        default_replay_memory_params = {'buffer_size': 100000, 'batch_size': 64}
        default_algo_params = {'gamma': .99, 'tau': .1}

        # Final Params
        final_actor_params = {**default_actor_params, **actor_params}
        final_critic_params = {**default_critic_params, **critic_params}
        final_noise_params = {**default_noise_params, **noise_params}
        final_replay_memory_params = {**default_replay_memory_params, **replay_memory_params}
        final_algo_params = {**default_algo_params, **algo_params}

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, final_critic_params)
        self.critic_target = Critic(self.state_size, self.action_size, final_critic_params)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(
                self.action_size,
                final_noise_params['mu'],
                final_noise_params['theta'],
                final_noise_params['sigma']
                )

        # Replay memory
        self.batch_size = final_replay_memory_params['batch_size']
        self.memory = ReplayBuffer(
                final_replay_memory_params['buffer_size'],
                final_replay_memory_params['batch_size']
                )

        # Algorithm parameters
        self.gamma = final_algo_params['gamma']  # discount factor
        self.tau = final_algo_params['tau']      # for soft update of target parameters
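The `{**defaults, **overrides}` pattern used above is plain dict unpacking: later keys win, so any parameter the caller passes replaces the default while the rest keep their default values. A minimal standalone illustration (the values are made up, not taken from the example):

default_algo_params = {'gamma': .99, 'tau': .1}
caller_algo_params = {'tau': .01}  # caller overrides only tau

final_algo_params = {**default_algo_params, **caller_algo_params}
assert final_algo_params == {'gamma': .99, 'tau': .01}  # gamma kept, tau replaced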
Example #3
    def begin_play(self):
        self.actor = self.uobject.get_owner()
        self.VehicleMovement = self.actor.VehicleMovement
        self.replay_buffer = ReplayBuffer(max_size=50000)
        ue.log('Begin Play on TorchActor class')
        ue.log(torch.cuda.is_available())
        ue.log(dir(self.uobject))

        self.policy = TD3(lr, state_dim, action_dim, max_action)
        self.gen_target()

        self.last_state = []
        self.last_reward = 0
        self.last_action = None
        self.last_done = False
        self.frame = int(random.random() * 100)
        self.start_pos = self.uobject.get_actor_location()

        # self.actor.AutoDrive = True
        self.policy.load(directory, loadfilename)
        self.filename = "{}_{}".format(filename, self.frame)

        self.episode = 0

        self.writer = SummaryWriter(os.path.join(directory, filename))

        self.ep_frame = 0
        self.ep_reward = 0

        self.ep_reward_avg_BEST = 0
Example #4
    def __init__(self, config, file_prefix=None):

        self.buffer_size = config.hyperparameters.buffer_size
        self.batch_size = config.hyperparameters.batch_size
        self.update_frequency = config.hyperparameters.update_frequency
        self.gamma = config.hyperparameters.gamma
        self.number_of_agents = config.environment.number_of_agents
        self.noise_weight = config.hyperparameters.noise_start
        self.noise_decay = config.hyperparameters.noise_decay
        self.memory = ReplayBuffer(config)
        self.t = 0

        self.agents = [
            DDPGAgent(index, config) for index in range(self.number_of_agents)
        ]

        if file_prefix:
            for i, to_load in enumerate(self.agents):
                f"{os.getcwd()}/models/by_score/{file_prefix}_actor_{i}.weights"
                actor_file = torch.load(
                    f"{os.getcwd()}/models/by_score/{file_prefix}_actor_{i}.weights",
                    map_location='cpu')
                critic_file = torch.load(
                    f"{os.getcwd()}/models/by_score/{file_prefix}_critic_{i}.weights",
                    map_location='cpu')
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {file_prefix}')
Example #5
File: agent.py  Project: timtody/curious
 def init_td3(self):
     self.policy = TD3(
         self.state_dim,
         self.action_dim,
         self.cnf.td3.max_action,
     )
     self.buffer = ReplayBuffer(self.state_dim, self.action_dim)
Example #6
class Agent():
    # needs functions init, choose_action, store_transition
    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions, alpha, beta,
                batch_size=100, max_size=1000000, mu=0, sigma=0.1, clip=0.5):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.alpha = alpha
        self.beta = beta
        self.clip = clip

        self.batch_size = batch_size
        self.max_size = max_size
        # Gaussian exploration noise, sampled fresh for each action
        self.noise = lambda: np.random.normal(mu, sigma, size=n_actions)

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, n_actions, 'actor_net')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, n_actions, 'critic_net')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, n_actions, 'target_critic')
        self.memory = ReplayBuffer(self.max_size, input_dims, n_actions, batch_size=self.batch_size)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        # clamp the noisy action to the [-clip, clip] range
        mu_prime = T.clamp(mu_prime, -self.clip, self.clip)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
Example #7
    def __init__(self,
                 state_dim,
                 n_actions,
                 gamma=0.99,
                 lmbda=1.0,
                 eps=1e-3,
                 itr_target_update=1e1,
                 device="cuda"):
        """Train a Q-net (using double Q trick) on (state, action, reward, state, action) pairs. This is thus on 'on policy' Q-net

        Args:
            state_dim (tuple): Shape of obseravtion input
            n_actions (int): length of our discrete action spacce
            gamma (float, optional): discount factor. Defaults to 0.99.
            lmbda (float, optional): Random Network Distrillation weight in A_strat. Defaults to 1.0.
            eps (float, optional): Min value for A_strat. To avoid potential divide-by-zero errors in training with A-strat weights. Defaults to 1e-3.
            itr_target_update (int, optional): Number of SARSA updates after which we update the target network. Defaults to 1e1.
            device (str, optional): Whether we use GPU or CPU. Defaults to "cuda".
        """
        self.q_net = Net(state_dim, n_actions).to(device)
        self.q_net_opt = optim.Adam(self.q_net.parameters(), lr=0.001)
        self.target_q_net = Net(state_dim, n_actions).to(device)

        self.itr_target_update = itr_target_update
        self.lmbda = lmbda
        self.count = 0
        self.gamma = gamma
        self.eps = eps
        self.device = device
        self.loss_func = nn.MSELoss()
        self.memory = ReplayBuffer(1e4, 64)
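The docstring above describes the training scheme, but the snippet stops at construction. Below is a hedged sketch of what one SARSA-style update against the target network could look like for this agent; the method name sarsa_update and the batch layout (tensors s, a, r, s_next, a_next, done already on self.device) are assumptions, torch is assumed imported as elsewhere in the file, and the double-Q/RND weighting mentioned in the docstring is omitted.

    def sarsa_update(self, s, a, r, s_next, a_next, done):
        # TD target from the frozen target network, evaluated at the *actual* next action (SARSA).
        with torch.no_grad():
            q_next = self.target_q_net(s_next).gather(1, a_next.unsqueeze(1)).squeeze(1)
            td_target = r + self.gamma * q_next * (1.0 - done)

        q_pred = self.q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
        loss = self.loss_func(q_pred, td_target)

        self.q_net_opt.zero_grad()
        loss.backward()
        self.q_net_opt.step()

        # Refresh the target network every itr_target_update updates.
        self.count += 1
        if self.count % int(self.itr_target_update) == 0:
            self.target_q_net.load_state_dict(self.q_net.state_dict())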
Example #8
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
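Several of these agents keep tau "for soft update of target parameters", but the helper itself is never shown. A minimal sketch of the usual Polyak-averaging step for Keras-style models like the ones above (the function name soft_update and its placement are assumptions):

def soft_update(local_model, target_model, tau):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    new_weights = [tau * local_w + (1.0 - tau) * target_w
                   for local_w, target_w in zip(local_model.get_weights(),
                                                target_model.get_weights())]
    target_model.set_weights(new_weights)

With tau=1 this reduces to the hard copy done via set_weights(get_weights()) at construction time.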
Example #9
    def __init__(self, action_size, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # Create the multi agent as a list of ddpg agents
        self.maddpg_agents = [AgentDDPG(24, 2, 0), AgentDDPG(24, 2, 0)]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.count = 0
        self.update_every = 1
        self.batch_size = 128
        self.agent_number = len(self.maddpg_agents)
        self.t_step = 0
        # Initialize the Replay Memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.action_size = action_size
        self.total_reward = np.zeros((1, 2))

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
Example #10
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(
            feature_shape=self.env.features_shape,
            a_num=self.env.num_actions,
            lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 -
                                                     self.parms.tau)
Example #11
    def __init__(self, k_level, H, state_dim, action_dim, render, threshold,
                 action_bounds, action_offset, state_bounds, state_offset, lr):

        # adding lowest level
        self.HAC = [
            DDPG(state_dim, action_dim, action_bounds, action_offset, lr, H)
        ]
        self.replay_buffer = [ReplayBuffer()]

        # adding remaining levels
        for _ in range(k_level - 1):
            self.HAC.append(
                DDPG(state_dim, state_dim, state_bounds, state_offset, lr, H))
            self.replay_buffer.append(ReplayBuffer())

        # set some parameters
        self.k_level = k_level
        self.H = H
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.threshold = threshold
        self.render = render

        # logging parameters
        self.goals = [None] * self.k_level
        self.reward = 0
        self.timestep = 0
Example #12
File: agent.py  Project: xieyiping/drl
    def __init__(self, 
                 params: Parameters, 
                 obs_shapes,
                 a_shapes,
                 a_bounds,
                 a_shape_index):
        self.index = a_shape_index
        self.parms = params

        self.buffer = ReplayBuffer(params.replay_size)
        
        # Critic
        self.critic = DDPGValueNet(feature_shapes=obs_shapes,
                                   a_shapes=a_shapes, lr=params.lr_c,
                                   n_agent=params.n_agent)
        self.target_critic = DDPGValueNet(feature_shapes=obs_shapes,
                                          a_shapes=a_shapes, lr=params.lr_c,
                                          n_agent=params.n_agent)

        self._copy_para(self.critic.model, self.target_critic.model)

        # Actor
        self.actor = CtsPolicy(action_bound=a_bounds,
                               s_shape=obs_shapes[a_shape_index],
                               a_shape=a_shapes[a_shape_index], 
                               lr=params.lr_a)

        self.target_actor = CtsPolicy(action_bound=a_bounds,
                                      s_shape=obs_shapes[a_shape_index],
                                      a_shape=a_shapes[a_shape_index],
                                      lr=params.lr_a)

        self._copy_para(self.actor, self.target_actor)
Example #13
    def __init__(self, env_id, alpha, beta, input_dims, tau, env,
            gamma=0.99, update_actor_interval=2, warmup=1000,
            n_actions=2, max_size=1000000, layer1_size=256,
            layer2_size=256, batch_size=256, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                        layer2_size, n_actions=n_actions, name=env_id+'_actor')

        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                        layer2_size, n_actions=n_actions, name=env_id+'_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                        layer2_size, n_actions=n_actions, name=env_id+'_critic_2')

        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                    layer2_size, n_actions=n_actions, name=env_id+'_target_actor')
        self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                layer2_size, n_actions=n_actions, name=env_id+'_target_critic_1')
        self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                layer2_size, n_actions=n_actions, name=env_id+'_target_critic_2')

        self.noise = noise
        self.update_network_parameters(tau=1)
Example #14
    def begin_play(self):
        self.actor = self.uobject.get_owner()
        self.VehicleMovement = self.actor.VehicleMovement
        self.replay_buffer = ReplayBuffer(max_size=50000)
        ue.log('Begin Play on TorchActor class')
        ue.log(torch.cuda.is_available())

        self.policy = TD3(lr, state_dim, action_dim, max_action)
        self.gen_target()

        self.last_state = []
        self.last_reward = 0
        self.last_action = None
        self.last_done = False
        self.frame = int(random.random() * 100)
        self.start_pos = self.uobject.get_actor_location()

        self.policy.load(directory, loadfilename)

        self.episode = 0

        self.ep_frame = 0
        self.ep_reward = 0

        self.policy = master.policy

        self.boredom = 0.8

        print("MASTER")
        print(master)

        self.my_id = master.get_id()
        self.actor.TextRender.call('SetText {}'.format(self.my_id))
Example #15
    def __init__(self,
                 state_size,
                 action_size,
                 policy_network,
                 value_network,
                 n_agents,
                 device,
                 use_gae=True):

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.device = device

        self.policy_network = policy_network(
            state_size=state_size, action_size=action_size).to(device)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(),
                                           lr=LR)

        self.value_network = value_network(state_size=state_size,
                                           action_size=1).to(device)
        self.value_optimizer = optim.Adam(self.value_network.parameters(),
                                          lr=LR)
        self.epsilon = EPSILON
        self.beta = BETA

        self.reset_memory()
        self.buffer = ReplayBuffer(128, 64)
        self.use_gae = use_gae
Example #16
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 update_actor_interval=2,
                 n_actions=2,
                 warmup=1000,
                 max_size=1000000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=100,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        #self.max_action = n_actions
        #self.min_action = 0

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0  # how often to call the learning function on the actor network
        self.time_step = 0  # handles countdown to end of warmup
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  n_actions, 'actor_net')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions, 'critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions, 'critic_2')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         n_actions,
                                         name='target_actor')
        self.target_critic_1 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions,
                                             name='target_critic_1')
        self.target_critic_2 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions,
                                             name='target_critic_2')

        self.noise = noise
        self.update_network_parameters(
            tau=1)  # sets the target network parameters to original
Example #17
class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma

        self.sess = tf.Session()

        self.actor = Actor(self.sess,
                           self.s,
                           self.s_,
                           action_dim,
                           action_bound,
                           tau,
                           lr_a,
                           f1_units=300)
        self.critic = Critic(self.sess,
                             lr_c,
                             self.s,
                             self.s_,
                             self.actor.a,
                             self.actor.a_,
                             self.target,
                             tau,
                             gamma,
                             state_dim,
                             action_dim,
                             f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        a = self.actor.choose_action(s)
        var = self.noise()
        a = a + var
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, _ = self.memory.sample(self.batch_size)

        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        target_critic = br + self.gamma * q_
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
Example #18
 def __init__(self, state_manager : StateManager, actor : Actor):
     self.state_manager = state_manager
     #pool of active nodes
     self.node_pool = None
     #ANET
     self.actor = actor
     #buffer
     self.replay_buffer = ReplayBuffer(self.state_manager.tree_distribution_converter)
Example #19
 def __init__(self, state_size, action_size, num_agents, random_seed=0):
     in_critic = num_agents * state_size
     self.agents = [
         DDPG_agent(state_size, in_critic, action_size, num_agents,
                    random_seed) for i in range(num_agents)
     ]
     self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
     self.num_agents = num_agents
Example #20
 def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
     self.env = env
     self.learning_rate = learning_rate
     self.gamma = gamma
     self.replay_buffer = ReplayBuffer(buffer_size)
     self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
     self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
     self.dqn_loss = torch.nn.MSELoss()
Example #21
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

		self.learn_step_counter = 0
		self.replace_cnt = replace_cnt
		self.q_eval = ConvDQN(env.env_shape, env.no_of_actions)
		self.q_target = ConvDQN(env.env_shape, env.no_of_actions)
Example #22
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		if self.network == "SimpleConvDQN":
			self.model = ConvDQN(env.env_shape, env.no_of_actions)
		elif self.network == "LinearDQN":
			self.model = LinearDQN(env.env_shape, env.no_of_actions)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)
Example #23
    def __init__(self, obs_dim, action_dim, *args, **kwargs):
        # Initialize arguments
        hidden_dims_actor = tuple(kwargs.get("hidden_dims_actor",
                                             (256, 256)))
        hidden_dims_critic = tuple(kwargs.get("hidden_dims_critic",
                                              (256, 256)))
        hidden_dims_model = tuple(kwargs.get("hidden_dims_model",
                                             (256, 256)))

        self.gamma = 0.99
        self.tau = 0.005
        self.delay = 2
        lr_actor = 0.001
        lr_critic = 0.001
        lr_model = 0.0001
        self.step_random = 500 # How many random actions to take before using actor for action selection
        self.update_every_n_steps = 51 # How often to update model, actor and critics
        self.update_steps = 200 # How many gradient updates to perform, per model,  when updating
        self.time = time.time()

        # Initialize actor
        self.actor = Actor(obs_dim, hidden_dims_actor,  action_dim)
        self.actor_target = copy.deepcopy(self.actor)
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(),
                                                lr=lr_actor)
        for par in self.actor_target.parameters():
            par.requires_grad = False

        # Initialize 2 critics
        self.critics = []
        self.critics_target = []
        self.optimizer_critics = []
        for k in range(2):
            critic = Critic(obs_dim + action_dim, hidden_dims_critic)
            self.critics.append(critic)
            self.critics_target.append(copy.deepcopy(critic))
            self.optimizer_critics.append(torch.optim.Adam(critic.parameters(),
                                                           lr=lr_critic))

            for par in self.critics_target[k].parameters():
                par.requires_grad = False

        # Initialize models
        self.models = []
        self.optimizer_models = []
        for k in range(25):
            model = Model(obs_dim + action_dim, hidden_dims_model, obs_dim)
            self.models.append(model)
            self.optimizer_models.append(torch.optim.Adam(model.parameters(),
                                                          lr=lr_model))

        # Setup Replay Buffer
        self.buffer = ReplayBuffer()
        self.o_old = None
        self.a_old = None

        self.step_i = 0
Example #24
class TorchDriverMaster:
    tester="hello"

    # this is called on game start
    def begin_play(self):
        global master
        master = self
        self.replay_buffer = ReplayBuffer(max_size=50000)
        ue.log('Begin Play on TorchActor class')
        ue.log("Has CUDA: {}".format(torch.cuda.is_available()))

        self.policy = TD3(lr, state_dim, action_dim, max_action)

        self.frame = 0

        self.policy.load(directory, loadfilename)

        self.episode = 0
        self.worker_id = 0

        self.writer = SummaryWriter(os.path.join(directory, filename))



    def get_next_ep(self):
        self.episode += 1
        return self.episode
    def get_id(self):
        retid = self.worker_id
        self.worker_id += 1
        return retid
    def write_data(self,ep_reward, ep_reward_avg):
        real_ep = self.episode
        self.writer.add_scalar('ep_reward',
                               ep_reward,
                               real_ep)
        self.writer.add_scalar('ep_avg_reward',
                               ep_reward_avg,
                               real_ep)
        print("finished ep {}, avgscore: {}".format(real_ep, ep_reward_avg))
        self.episode += 1
    def transfer_buffer(self, buffer):
        self.replay_buffer.mergein(buffer)
        print("buffer merged, length: {}".format(self.replay_buffer.size))

    def tick(self, delta_time):
        self.frame += 1

        if self.replay_buffer.size:
            al, c1l, c2l, prl = self.policy.update(self.replay_buffer, 1, batch_size, gamma, polyak, policy_noise,
                                                   noise_clip, policy_delay)
            if self.frame % 60 == 0:
                print("aloss:{}".format(al))

            if self.frame % 600 == 0:
                self.policy.save(directory, filename)
Example #25
    def __init__(self,
                 multi_step_env: MultiStepEnv = None,
                 gamma: float = None,
                 eps_max: float = None,
                 eps_min: float = None,
                 eps_decay_steps: int = None,
                 replay_min_size: int = None,
                 replay_max_size: int = None,
                 target_update_freq: int = None,
                 steps_per_update: int = None,
                 train_batch_size: int = None,
                 enable_rgb: bool = None,
                 model_save_file: str = None,
                 optim_l2_reg_coeff: float = None,
                 optim_lr: float = None,
                 eval_freq: int = None):

        self.env = multi_step_env
        self.gamma = gamma
        self.eps_max = eps_max
        self.eps_min = eps_min
        self.eps_decay_steps = eps_decay_steps
        self.replay_min_size = replay_min_size
        self.target_update_freq = target_update_freq
        self.train_batch_size = train_batch_size
        self.steps_per_update = steps_per_update
        self.model_save_file = model_save_file
        self.optim_lr = optim_lr
        self.optim_l2_reg_coeff = optim_l2_reg_coeff
        self.eval_freq = eval_freq

        self.replay_memory = ReplayBuffer(capacity=replay_max_size)
        self.n_steps = 0

        if enable_rgb:
            self.q_train = Q(self.env.frame_stack_size * 3, self.env.height,
                             self.env.width,
                             self.env.num_actions).to(settings.device)
            self.q_target = Q(self.env.frame_stack_size * 3, self.env.height,
                              self.env.width,
                              self.env.num_actions).to(settings.device)
        else:
            self.q_train = Q(self.env.frame_stack_size, self.env.height,
                             self.env.width,
                             self.env.num_actions).to(settings.device)
            self.q_target = Q(self.env.frame_stack_size, self.env.height,
                              self.env.width,
                              self.env.num_actions).to(settings.device)

        self.optimizer = Adam(self.q_train.parameters(),
                              eps=1e-7,
                              lr=self.optim_lr,
                              weight_decay=self.optim_l2_reg_coeff)
        # self.mse_loss = nn.MSELoss()
        assert (self.q_train.state_dict().keys() ==
                self.q_target.state_dict().keys())
Example #26
    def __init__(self,
                 env,
                 save_dirs,
                 save_freq=10000,
                 gamma=0.99,
                 batch_size=32,
                 learning_rate=0.0001,
                 buffer_size=10000,
                 learn_start=10000,
                 target_network_update_freq=1000,
                 train_freq=4,
                 epsilon_min=0.01,
                 exploration_fraction=0.1,
                 tot_steps=int(1e7)):
        DDQN.__init__(self,
                      env=env,
                      save_dirs=save_dirs,
                      learning_rate=learning_rate)

        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.learn_start = learn_start
        self.target_network_update_freq = target_network_update_freq
        self.train_freq = train_freq
        self.epsilon_min = epsilon_min
        self.exploration_fraction = exploration_fraction
        self.tot_steps = tot_steps
        self.epsilon = 1.0
        self.exploration = LinearSchedule(schedule_timesteps=int(
            self.exploration_fraction * self.tot_steps),
                                          initial_p=self.epsilon,
                                          final_p=self.epsilon_min)

        self.save_freq = save_freq

        self.replay_buffer = ReplayBuffer(save_dirs=save_dirs,
                                          buffer_size=self.buffer_size,
                                          obs_shape=self.input_shape)

        self.exploration_factor_save_path = os.path.join(
            self.save_path, 'exploration-factor.npz')

        self.target_model_save_path = os.path.join(self.save_path,
                                                   'target-wts.h5')
        self.target_model = NeuralNet(input_shape=self.input_shape,
                                      num_actions=self.num_actions,
                                      learning_rate=learning_rate,
                                      blueprint=self.blueprint).model

        self.show_hyperparams()

        self.update_target()

        self.load()
Example #27
    def begin_play(self):
        self.actor = self.uobject.get_owner()
        self.replay_buffer = ReplayBuffer(max_size=50000)
        ue.log('Begin Play on TorchWalkerMinion class')

        #self.policy = TD3(lr, state_dim, action_dim, max_action)
        self.gen_target()

        self.last_state = []
        self.last_reward = 0
        self.last_action = None
        self.last_done = False
        self.frame = int(random.random() * 100)
        self.start_pos = self.uobject.get_actor_location()


        self.episode = 0

        self.ep_frame = 0
        self.ep_reward = 0
        self.total_frame = 0


        self.boredom = 0.8

        print("MASTER")
        print(master)

        actionlen = self.actor.get_action_dim()
        TEMP_OBS = self.actor.update_observation()[0]
        print("TEMP_OBS")
        print(TEMP_OBS)
        obslen = len(TEMP_OBS)
        print(obslen)
        master.init_network(obslen+1, actionlen)

        self.my_id = master.get_id()
        #self.actor.TextRender.call('SetText {}'.format(self.my_id))

        self.random_frames = 10

        self.bg_thread = None

        self.exploration_noise = random.random()*0.3
        self.first_frame = True



        self.policy = master.policy

        self.action_space_low = [-1 for x in range(master.action_dim)]
        self.action_space_high = [1 for x in range(master.action_dim)]


        self.obs_space_low = [-1 for x in range(master.state_dim)]
        self.obs_space_high = [1 for x in range(master.state_dim)]
Example #28
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize actor and critic networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
Example #29
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1, multiplies the state and action
                            space sizes for the critic. Used with MADDPG.
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)
Example #30
    def reset_agent(self):

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)

        self.step_count = 0
        self.total_step_count = 0
        self.train_count = 0
        self.episode_count = 0

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)