def __init__(self, nb_state, nb_action):
    self.nb_state = nb_state
    self.nb_action = nb_action

    self.actor = Actor(self.nb_state, self.nb_action)
    self.actor_target = Actor(self.nb_state, self.nb_action)
    self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

    self.critic = Critic(self.nb_state, self.nb_action)
    self.critic_target = Critic(self.nb_state, self.nb_action)
    self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer and exploration noise process
    self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_action, theta=OU_THETA, mu=OU_MU, sigma=OU_SIGMA)

    self.is_training = True
    self.epsilon = 1.0
    self.a_t = None
    self.s_t = None

    if USE_CUDA:
        self.cuda()
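Several of the constructors in this collection call hard_update() (and later soft_update()) on the target networks without showing the helpers. A minimal sketch of what such PyTorch helpers typically look like is given below; it is an assumption, not code from any of the projects above, and the originals may differ.

def hard_update(target, source):
    # Copy every parameter of `source` into `target` (used once, right after construction).
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source, applied after each update.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)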
def __init__(self, args, nb_states, nb_actions):
    USE_CUDA = torch.cuda.is_available()
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions
    self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1]
    self.gpu_used = self.gpu_ids[0] >= 0

    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }
    self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
    self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
    self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay)

    self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
    self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
    self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau_update = args.tau_update
    self.gamma = args.gamma

    # Linear decay rate of the exploration policy
    self.depsilon = 1.0 / args.epsilon
    # Initial exploration rate
    self.epsilon = 1.0

    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True
    self.continious_action_space = False
def initialize_memory(self, stocks):
    # Create n_memory replay buffers and pre-fill each with random-action transitions
    self.memory = []
    for i in range(self.n_memory):
        self.memory.append(SequentialMemory(self.memory_length))
    for t in range(len(stocks) - 1):
        for idx_memory in range(self.n_memory):
            action = np.random.normal(0, self.noise_scale, self.n_stock)
            action = self.norm_action(action)
            # Reward is the portfolio return implied by the next price move
            reward = np.sum((stocks[t + 1] - stocks[t]) * action)
            self.memory[idx_memory].append(stocks[t], action, reward)
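Note that this buffer is constructed with a single length argument and takes (state, action, reward) triples, unlike the keras-rl SequentialMemory used elsewhere in this collection (limit=/window_length= constructor, four-argument append), so it appears to be a project-specific class. A minimal ring-buffer sketch with that interface, purely as an assumption about what it might look like:

import random
from collections import deque

class SequentialMemory:
    def __init__(self, memory_length):
        # Drop the oldest transitions once the buffer is full
        self.buffer = deque(maxlen=memory_length)

    def append(self, state, action, reward):
        self.buffer.append((state, action, reward))

    def sample(self, batch_size):
        # Uniform random minibatch of stored transitions
        return random.sample(list(self.buffer), batch_size)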
def __init__(self, nb_states, nb_actions, args):
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions

    actor_net_cfg = {
        'hidden1': 32,
        'hidden2': 32,
        'hidden3': 32,
        'init_w': args.init_w
    }
    critic_net_cfg = {
        'hidden1': 64,
        'hidden2': 64,
        'hidden3': 64,
        'init_w': args.init_w
    }

    self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
    self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
    self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon
    self.epsilon = 1.0

    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True
    self.best_reward = -10
def __init__(self, env, args):  # (self, nb_states, nb_actions, args):
    if args.seed > 0:
        self.seed(args.seed)

    self.env = env
    self.nb_states = self.env.observation_space.shape[0]
    self.nb_actions = self.env.action_space.shape[0]

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }
    self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
    self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Load previously saved weights (if any) before syncing the targets
    self.load_weights(args.output)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon
    # self.epsilon = 1.0

    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True

    # if USE_CUDA: self.cuda()
def initialize_memory(self, stocks, scale=10):
    self.memory = []
    for i in range(self.n_memory):
        self.memory.append(SequentialMemory(self.memory_length))
    for t in range(len(stocks)):
        for idx_memory in range(self.n_memory):
            action = None
            reward = np.concatenate(
                (np.reshape(stocks[t], (self.n_stock, 1)), np.zeros((self.n_stock, 1))),
                axis=-1)
            self.memory[idx_memory].append(stocks[t], action, reward)
def __init__(self, in_channels, num_actions, config):
    super(DDPG, self).__init__()
    self.nb_states = in_channels
    self.nb_actions = num_actions

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': config['hidden1'],
        'hidden2': config['hidden2'],
        # 'hidden3': config['hidden3'],
        # 'hidden4': config['hidden4'],
        'init_w': config['init_w']
    }
    self.loss = nn.MSELoss()

    self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
    self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=config['plr'])

    self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
    self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=config['lr'])

    if isGPU:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    self.observation = config['observation']
    self.config = config

    # Create replay buffer
    if config['use_memory']:
        self.experience_replay = SequentialMemory(limit=config['memory_size'], window_length=1)
    else:
        self.experience_replay = deque(maxlen=config['memory_size'])

    self.random_process = OUProcess(size=self.nb_actions, theta=config['ou_theta'], mu=config['ou_mu'], sigma=config['ou_sigma'])

    self.batch_size = config['batch_size']
    self.tau = config['tau']
    self.discount = config['discount']
    self.depsilon = 1. / config['epsilon_decay']
    self.epsilon = 1.0
def __init__(self, env, policy, gamma, tau, epsilon, epsilon_decay, actor_lr, critic_lr,
             theta, sigma, mu, buffer_size):
    # self.num_states = num_states
    # self.num_actions = num_actions
    # self.is_training = False
    self.env = env
    self.gamma = gamma
    self.tau = tau
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.theta = theta
    self.sigma = sigma
    self.mu = mu
    self.buffer_size = buffer_size

    self.policy = policy
    self.actor = policy.actor
    self.critic = policy.critic
    self.actor_target = policy.actor_target
    self.critic_target = policy.critic_target

    self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.criterion = nn.MSELoss()

    # The actor/actor_target and critic/critic_target need to start with the same weights
    for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
        target_param.data.copy_(param.data)

    self.memory = SequentialMemory(limit=self.buffer_size, window_length=1)
    # self.replay = ExpcerienceReplay(BUFFER_SIZE, BATCH_SIZE)
    self.ou_noise = Ornstein_Uhlenbeck(theta=self.theta, sigma=self.sigma, mu=self.mu)

    if USE_CUDA:
        self.cuda()
def __init__(self, nb_states, nb_actions):
    self.nb_states = nb_states
    self.nb_actions = nb_actions

    # Create Actor and Critic Network
    self.actor = Actor(self.nb_states, self.nb_actions)
    self.actor_target = Actor(self.nb_states, self.nb_actions)
    self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

    self.critic = Critic(self.nb_states, self.nb_actions)
    self.critic_target = Critic(self.nb_states, self.nb_actions)
    self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=HISTORY_LEN)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=OU_THETA, mu=OU_MU, sigma=OU_SIGMA)

    # Hyper-parameters
    self.batch_size = BATCH_SIZE
    self.tau = TAU
    self.discount = GAMMA
    self.depsilon = 1.0 / DEPSILON
    self.epsilon = 1.0

    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True

    if USE_CUDA:
        self.cuda()
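A minimal sketch of how the pieces built in a constructor like the one above are typically used at interaction time: the actor proposes an action, Ornstein-Uhlenbeck noise scaled by the decaying epsilon is added during training, and transitions are written into the SequentialMemory. This is not taken from the original project; the to_tensor/to_numpy conversion helpers and the [-1, 1] action range are assumptions.

def select_action(self, s_t, decay_epsilon=True):
    # Greedy action from the deterministic policy
    action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    if self.is_training:
        # Exploration noise, annealed linearly via depsilon
        action += max(self.epsilon, 0) * self.random_process.sample()
    action = np.clip(action, -1., 1.)  # clip to the assumed action range
    if decay_epsilon:
        self.epsilon -= self.depsilon
    self.a_t = action
    return action

def observe(self, r_t, s_t1, done):
    # Store the latest transition and advance the state pointer
    if self.is_training:
        self.memory.append(self.s_t, self.a_t, r_t, done)
        self.s_t = s_t1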
STATE_SIZE = 2048

var_loss_coef1 = K.variable(0)
var_loss_coef2 = K.variable(0)
var_loss_coef3 = K.variable(0)

model_env = model_state = model_next_state = model_next_state_auto = model_reward = meanImage = None
if args.env_model is not None:
    model_env, model_state, model_next_state, model_next_state_auto, model_reward = load_model(
        args.env_model, args.env_weight, args.env_reward_weight, STATE_SIZE, ACTION_COUNT,
        AGENT_HISTORY_LENGTH, 1, var_loss_coef1, var_loss_coef2, var_loss_coef3)
    meanImage = np.load(args.env_mean_image)
    print(model_env.summary())

newGame()
done = False
replay_buffer = SequentialMemory(max_size=REPLAY_MEMORY_SIZE)
total_step_count = 0
# REPLAY_START_SIZE = 1000
# FINAL_EXPLORATION_FRAME = 50000
# REPLAY_START_SIZE = 5000
episode_reward = 0
epsilon = INITIAL_EXPLORATION

def running_mean(x, N):
    # Moving average of x over a window of N, via a cumulative-sum trick
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def weight_norms(model):
    ws = model.get_weights()
    for w in ws:
actor.add(Dense(8))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('sigmoid'))

action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + (11,), name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=10,
                  random_process=random_process, gamma=.995, target_model_update=1e-3)
agent.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])

agent.fit(env, nb_steps=10000, visualize=False, verbose=0, nb_max_episode_steps=95)
# agent.save_weights('weights/ddpg_{}_weights.h5f'.format("stormwater"), overwrite=True)
agent.test(env, nb_episodes=15, visualize=False, nb_max_episode_steps=95, plt="")
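The example above begins part-way through the actor definition. In keras-rl DDPG scripts the actor is typically a Sequential model whose first layer flattens the (window_length,) + observation_shape input, so something along the following lines is assumed to precede the snippet; the imports and the Flatten layer are reconstructed, not taken from the original source.

# from keras.models import Sequential, Model
# from keras.layers import Dense, Activation, Flatten, Input, Concatenate
# from keras.optimizers import Adam
# from rl.agents import DDPGAgent
# from rl.memory import SequentialMemory
# from rl.random import OrnsteinUhlenbeckProcess
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + (11,)))  # matches observation_input above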
# resetting the environment. We need to pass in `terminal=False` here since
# the *next* state, that is the state of the newly reset environment, is
# always non-terminal by convention.
forward(observation)
backward(step, 0., terminal=False)

episode_logs = {
    'episode_reward': episode_reward,
    'nb_episode_steps': episode_step,
    'nb_steps': step,
}
callback_list.on_episode_end(episode, episode_logs)

episode += 1
observation = None
episode_step = None
episode_reward = None

memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
checkpoint_weights_filename = os.path.join(root_dir, 'my_pacman_weights_weights_{step}.h5f')
callbacks = [MyCheckPoint(checkpoint_weights_filename, interval=100000, verbose=1)]
callbacks += [TrainEpisodeLogger()]
callbacks += [TrainIntervalLogger(interval=10000)]
trainable_model, target_model = compile(Adam(lr=.00025), metrics=['mae'])
fit(callbacks=callbacks, total_steps=10000000, verbose=1)
model.add(Convolution2D(32, 8, 8, subsample=(4, 4), input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE))
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.05, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
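The memory, processor, and annealed eps-greedy policy built above are normally wired into a DQNAgent roughly as follows. This sketch mirrors keras-rl's dqn_atari.py example rather than the original script, so the warm-up, target-update, clipping, and step counts here are assumptions.

# from rl.agents.dqn import DQNAgent
# from keras.optimizers import Adam
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000, log_interval=10000)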
def __init__(self, nb_states, nb_actions, args):
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions

    self.epistemic_actor = args.epistemic_actor    # true / false
    self.epistemic_critic = args.epistemic_critic  # true / false
    self.aleatoric_actor = args.aleatoric_actor    # true / false
    self.aleatoric_critic = args.aleatoric_critic  # true / false

    self.dropout_n_actor = args.dropout_n_actor
    self.dropout_n_critic = args.dropout_n_critic
    self.dropout_p_actor = args.dropout_p_actor
    self.dropout_p_critic = args.dropout_p_critic

    self.print_var_count = 0
    self.action_std = np.array([])
    self.save_dir = args.output
    self.episode = 0
    # self.save_file = open(self.save_dir + '/std.txt', "a")

    # Create Actor and Critic Network
    net_cfg_actor = {
        'dropout_n': args.dropout_n_actor,
        'dropout_p': args.dropout_p_actor,
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }
    net_cfg_critic = {
        'dropout_n': args.dropout_n_critic,
        'dropout_p': args.dropout_p_critic,
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }

    self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
    self.actor_target = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = UACritic(self.nb_states, self.nb_actions, **net_cfg_critic)
    self.critic_target = UACritic(self.nb_states, self.nb_actions, **net_cfg_critic)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon
    # self.epsilon = 1.0

    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True

    # if USE_CUDA: self.cuda()
def __init__(self, nb_states, nb_actions, now_date, now_time, args):
    print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")

    if args.seed > 0:
        self.seed(args.seed)

    self.total_training_step = 1
    self.episode = 0
    self.nb_states = nb_states
    self.nb_actions = nb_actions

    # Create Actor and Critic Network
    net_cfg = {
        'hidden1': args.hidden1,
        'hidden2': args.hidden2,
        'init_w': args.init_w
    }
    # self.criterion = nn.MSELoss()
    self.critic_case = 'stochastic'

    self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
    self.actor_target = UAActor(self.nb_states, self.nb_actions, True, **net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = UACritic(self.nb_states, self.nb_actions, False, **net_cfg)
    self.critic_target = UACritic(self.nb_states, self.nb_actions, True, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Make sure the target networks start with the same weights
    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon
    self.epsilon = 1.0

    self.s_t = None        # Most recent state
    self.s_t_noise = None  # Most recent state noise
    self.a_t_mean = None   # Most recent action mean
    self.a_t_var = None    # Most recent action variance
    self.is_training = True

    if torch.cuda.is_available():
        self.cuda()

    self.now_date = now_date
    self.now_time = now_time
    if not os.path.exists('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' + self.now_time + '/'):
        os.mkdir('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' + self.now_time + '/')