Example #1
File: train.py Project: renqibing/RL_DDPG
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA: self.cuda()
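
The hard_update calls above copy the online networks' weights into their targets so that both start identically; Example #8 below does the same copy by hand with a parameter loop. A minimal sketch of the helper pair these PyTorch snippets assume (the names match the calls on this page, but each project keeps its own version in a util module, and the soft_update/tau blending is inferred from the tau hyper-parameters stored in the later examples):

import torch  # target and source are torch.nn.Module instances

def hard_update(target, source):
    # Copy every parameter of the source network into the target network.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)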
Example #2
    def __init__(self, args, nb_states, nb_actions):
        USE_CUDA = torch.cuda.is_available()
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.gpu_ids = list(range(args.gpu_nums)) if USE_CUDA and args.gpu_nums > 0 else [-1]
        self.gpu_used = self.gpu_ids[0] >= 0

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **net_cfg).double()
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=args.p_lr,
                                weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions,
                             **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **net_cfg).double()
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=args.c_lr,
                                 weight_decay=args.weight_decay)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        self.continious_action_space = False
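
Example #2 initialises s_t and a_t but the excerpt ends before they are written into the replay buffer. A hedged sketch of the per-step bookkeeping these agents typically pair with such a constructor (the observe name and the four-argument memory.append call are assumptions based on the keras-rl style SequentialMemory used here):

    def observe(self, r_t, s_t1, done):
        # Record the transition that just happened, then advance the stored state.
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1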
Example #3
File: ddpg.py Project: zhuchiheng/DQN
    def initialize_memory(self, stocks):
        self.memory = []
        for i in range(self.n_memory):
            self.memory.append(SequentialMemory(self.memory_length))
        for t in range(len(stocks) - 1):
            for idx_memory in range(self.n_memory):
                action = np.random.normal(0, self.noise_scale, self.n_stock)
                action = self.norm_action(action)
                reward = np.sum((stocks[t + 1] - stocks[t]) * action)
                self.memory[idx_memory].append(stocks[t], action, reward)
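
Example #3 seeds each memory with random Gaussian portfolio actions; its norm_action helper is not part of the excerpt. A purely hypothetical sketch of such a normaliser, assuming the intent is to turn raw noise into weights whose absolute values sum to one:

import numpy as np

def norm_action(action, eps=1e-8):
    # Hypothetical: rescale the raw action vector so its absolute weights sum to 1.
    return action / (np.sum(np.abs(action)) + eps)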
Example #4
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }

        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10
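
Example #4 sets up an Ornstein-Uhlenbeck process, an epsilon starting at 1.0 and a depsilon decay step, but the excerpt stops before the method that uses them. A hedged sketch of the usual action-selection pattern in these PyTorch DDPG agents (the method name, tensor conversion and [-1, 1] clipping are assumptions; torch and numpy as np are assumed imported; only the noise scaling and epsilon decay follow from the fields initialised above):

    def select_action(self, s_t, decay_epsilon=True):
        # Greedy action from the actor, perturbed by OU noise scaled by epsilon.
        with torch.no_grad():
            state = torch.as_tensor(np.array([s_t]), dtype=torch.float32)
            action = self.actor(state).cpu().numpy().squeeze(0)
        if self.is_training:
            action += max(self.epsilon, 0.0) * self.random_process.sample()
        if decay_epsilon:
            self.epsilon -= self.depsilon
        action = np.clip(action, -1.0, 1.0)
        self.a_t = action
        return action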
Example #5
    def __init__(self, env, args):  #(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.env = env

        self.nb_states = self.env.observation_space.shape[0]
        self.nb_actions = self.env.action_space.shape[0]

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        self.load_weights(args.output)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA: self.cuda()
Example #6
File: dqn.py Project: zhuzhenping/DQN
    def initialize_memory(self, stocks, scale=10):
        self.memory = []
        for i in range(self.n_memory):
            self.memory.append(SequentialMemory(self.memory_length))
        for t in range(len(stocks)):
            for idx_memory in range(self.n_memory):
                action = None
                reward = np.concatenate(
                    (np.reshape(stocks[t], (self.n_stock, 1)),
                     np.zeros((self.n_stock, 1))),
                    axis=-1)
                self.memory[idx_memory].append(stocks[t], action, reward)
Example #7
    def __init__(self, in_channels, num_actions, config):
        super(DDPG, self).__init__()

        self.nb_states = in_channels
        self.nb_actions = num_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': config['hidden1'],
            'hidden2': config['hidden2'],
            # 'hidden3': config['hidden3'],
            # 'hidden4': config['hidden4'],
            'init_w': config['init_w']
        }

        self.loss = nn.MSELoss()
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=config['plr'])

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=config['lr'])

        if isGPU:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        self.observation = config['observation']
        self.config = config

        if config['use_memory']:
            self.experience_replay = SequentialMemory(limit=config['memory_size'], window_length=1)
        else:
            self.experience_replay = deque(maxlen=config['memory_size'])  # Create Buffer replay

        self.random_process = OUProcess(size=self.nb_actions, theta=config['ou_theta'], mu=config['ou_mu'],
                                        sigma=config['ou_sigma'])

        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.discount = config['discount']
        self.depsilon = 1. / config['epsilon_decay']

        self.epsilon = 1.0
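
Example #7 keeps either a SequentialMemory or a plain deque as its buffer, so the training code has to sample from both. A hedged sketch of how a batch might be drawn in each case (the Experience fields follow keras-rl's SequentialMemory.sample; treating the deque entries as (s, a, r, s2, done) tuples is an assumption about how this project fills it):

import random
import numpy as np

def sample_batch(experience_replay, batch_size, use_memory):
    if use_memory:
        # keras-rl style: sample() returns Experience namedtuples
        # (state0/state1 are windows; window_length=1 here, hence the [0]).
        experiences = experience_replay.sample(batch_size)
        states = np.array([e.state0[0] for e in experiences])
        actions = np.array([e.action for e in experiences])
        rewards = np.array([e.reward for e in experiences])
        next_states = np.array([e.state1[0] for e in experiences])
        terminals = np.array([e.terminal1 for e in experiences])
    else:
        # Plain deque assumed to hold (s, a, r, s2, done) tuples.
        batch = random.sample(experience_replay, batch_size)
        states, actions, rewards, next_states, terminals = map(np.array, zip(*batch))
    return states, actions, rewards, next_states, terminals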
Example #8
    def __init__(self, env, policy, gamma, tau, epsilon, epsilon_decay,
                 actor_lr, critic_lr, theta, sigma, mu, buffer_size):

        #self.num_states = num_states
        #self.num_actions = num_actions
        #self.is_training = False
        self.env = env

        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.theta = theta
        self.sigma = sigma
        self.mu = mu
        self.buffer_size = buffer_size

        self.policy = policy
        self.actor = policy.actor
        self.critic = policy.critic
        self.actor_target = policy.actor_target
        self.critic_target = policy.critic_target
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_lr)
        self.criterion = nn.MSELoss()

        # The actor/actor_target and critic/critic_target need to start with the same weights
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.memory = SequentialMemory(limit=self.buffer_size, window_length=1)
        #self.replay = ExpcerienceReplay(BUFFER_SIZE,BATCH_SIZE)

        self.ou_noise = Ornstein_Uhlenbeck(theta=self.theta,
                                           sigma=self.sigma,
                                           mu=self.mu)

        if USE_CUDA: self.cuda()
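
Example #8 draws its exploration noise from an Ornstein_Uhlenbeck class while the other examples use OrnsteinUhlenbeckProcess; neither implementation appears in the excerpts. A minimal sketch of the discretised OU process both names stand for, with a unit time step (the class name, size default and reset behaviour are assumptions):

import numpy as np

class OrnsteinUhlenbeckNoise:
    """x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), per component."""

    def __init__(self, size=1, theta=0.15, mu=0.0, sigma=0.2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = np.ones(self.size) * self.mu

    def sample(self):
        self.state += (self.theta * (self.mu - self.state)
                       + self.sigma * np.random.randn(self.size))
        return self.state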
Example #9
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA: self.cuda()
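
Example #9 reads its settings from module-level constants rather than an args object. A hedged sketch of the configuration block it assumes (the names come from the snippet itself; every value below is a placeholder, not the project's actual setting):

import torch

# Placeholder hyper-parameters matching the names used in Example #9.
ACTOR_LR = 1e-4
CRITIC_LR = 1e-3
MEMORY_SIZE = 100000
HISTORY_LEN = 1                 # window_length for SequentialMemory
OU_THETA, OU_MU, OU_SIGMA = 0.15, 0.0, 0.2
BATCH_SIZE = 64
TAU = 0.001                     # soft-update rate
GAMMA = 0.99                    # discount factor
DEPSILON = 50000                # steps over which exploration decays
USE_CUDA = torch.cuda.is_available()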
Example #10
STATE_SIZE = 2048
var_loss_coef1 = K.variable(0)
var_loss_coef2 = K.variable(0)
var_loss_coef3 = K.variable(0)

model_env = model_state = model_next_state = model_next_state_auto = model_reward = meanImage = None

if args.env_model is not None:
    model_env, model_state, model_next_state, model_next_state_auto, model_reward = load_model(
        args.env_model, args.env_weight, args.env_reward_weight, STATE_SIZE, ACTION_COUNT,
        AGENT_HISTORY_LENGTH, 1, var_loss_coef1, var_loss_coef2, var_loss_coef3)
    meanImage = np.load(args.env_mean_image)
    print(model_env.summary())

newGame()
done = False

replay_buffer = SequentialMemory(max_size=REPLAY_MEMORY_SIZE)
total_step_count = 0

#REPLAY_START_SIZE = 1000
#FINAL_EXPLORATION_FRAME = 50000
#REPLAY_START_SIZE = 5000
episode_reward = 0
epsilon = INITIAL_EXPLORATION

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def weight_norms(model):
    ws = model.get_weights()
    for w in ws:
Example #11
actor.add(Dense(8))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('sigmoid'))


action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + (11,), name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)

agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=10,
                  random_process=random_process, gamma=.995, target_model_update=1e-3)

agent.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
agent.fit(env, nb_steps=10000, visualize=False, verbose=0, nb_max_episode_steps=95)   
#agent.save_weights('weights/ddpg_{}_weights.h5f'.format("stormwater"), overwrite=True)
agent.test(env, nb_episodes=15, visualize=False, nb_max_episode_steps=95, plt="") 
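
In keras-rl the window_length given to SequentialMemory must match the leading dimension of the networks' observation input, which is why the critic above takes a (1,) + (11,) input while the memory is built with window_length=1. A hedged sketch of what changes if a longer observation history were wanted (the window size of 4 is an arbitrary example, not a value used in this project):

from keras.layers import Input
from rl.memory import SequentialMemory

WINDOW_LENGTH = 4  # arbitrary example history length

# The memory hands the agent WINDOW_LENGTH stacked observations per state...
memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)

# ...so the observation input must accept the stacked shape as well.
observation_input = Input(shape=(WINDOW_LENGTH,) + (11,), name='observation_input')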
Example #12
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            forward(observation)
            backward(step, 0., terminal=False)
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_episode_steps': episode_step,
                'nb_steps': step,
            }
            callback_list.on_episode_end(episode, episode_logs)
            episode += 1
            observation = None
            episode_step = None
            episode_reward = None


memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
checkpoint_weights_filename = os.path.join(root_dir, 'my_pacman_weights_weights_{step}.h5f')
callbacks = [MyCheckPoint(checkpoint_weights_filename, interval=100000, verbose=1)]
callbacks += [TrainEpisodeLogger()]
callbacks += [TrainIntervalLogger(interval=10000)]
trainable_model, target_model = compile(Adam(lr=.00025), metrics=['mae'])
fit(callbacks=callbacks, total_steps=10000000, verbose=1)






Example #13
model.add(Convolution2D(32, 8, 8, subsample=(4, 4), input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE))
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
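
Example #13 stops after choosing the policy. In the keras-rl Atari example this memory, processor and policy are wired into a DQNAgent roughly as follows (the warm-up, target-update and clipping values are the ones commonly used in that example, quoted from memory, so treat them as approximate):

from rl.agents.dqn import DQNAgent
from keras.optimizers import Adam

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000)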
Example #14
File: ddpg.py Project: mabingqi1/DRL
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.epistemic_actor = args.epistemic_actor  # true / false
        self.epistemic_critic = args.epistemic_critic  # true / false

        self.aleatoric_actor = args.aleatoric_actor  # true / false
        self.aleatoric_critic = args.aleatoric_critic  # true / false

        self.dropout_n_actor = args.dropout_n_actor
        self.dropout_n_critic = args.dropout_n_critic

        self.dropout_p_actor = args.dropout_p_actor
        self.dropout_p_critic = args.dropout_p_critic

        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0

        # self.save_file = open(self.save_dir + '/std.txt', "a")

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_actor,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        net_cfg_critic = {
            'dropout_n': args.dropout_n_critic,
            'dropout_p': args.dropout_p_critic,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = UAActor(self.nb_states, self.nb_actions,
                                    **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions,
                               **net_cfg_critic)
        self.critic_target = UACritic(self.nb_states, self.nb_actions,
                                      **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()
Example #15
    def __init__(self, nb_states, nb_actions, now_date, now_time, args):
        print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")
        if args.seed > 0:
            self.seed(args.seed)

        self.total_training_step = 1
        self.episode = 0
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        # self.criterion = nn.MSELoss()
        self.critic_case = 'stochastic'
        self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, True,
                                    **net_cfg)

        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, False,
                               **net_cfg)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, True,
                                      **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.s_t_noise = None  # Most recent state
        self.a_t_mean = None  # Most recent action
        self.a_t_var = None
        self.is_training = True

        if torch.cuda.is_available():
            self.cuda()

        self.now_date = now_date
        self.now_time = now_time

        model_dir = ('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                     self.now_time + '/')
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
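
None of the constructors above show how the replay buffer is eventually consumed. A hedged sketch of the DDPG update step these agents typically implement, using the soft_update helper sketched after Example #1 (the sample_and_split call and the two-argument critic signature are assumptions about the project-specific SequentialMemory and Critic; torch, numpy as np and torch.nn.functional as F are assumed imported):

    def update_policy(self):
        # Sample a batch of transitions from the replay buffer.
        states, actions, rewards, next_states, terminals = \
            self.memory.sample_and_split(self.batch_size)
        states, actions, rewards, next_states, terminals = [
            torch.as_tensor(np.asarray(x), dtype=torch.float32)
            for x in (states, actions, rewards, next_states, terminals)
        ]

        # Critic update: regress Q(s, a) toward the bootstrapped target.
        with torch.no_grad():
            next_q = self.critic_target(next_states, self.actor_target(next_states))
            target_q = rewards + self.discount * (1.0 - terminals) * next_q
        self.critic_optim.zero_grad()
        critic_loss = F.mse_loss(self.critic(states, actions), target_q)
        critic_loss.backward()
        self.critic_optim.step()

        # Actor update: follow the critic's gradient through the actor's actions.
        self.actor_optim.zero_grad()
        actor_loss = -self.critic(states, self.actor(states)).mean()
        actor_loss.backward()
        self.actor_optim.step()

        # Nudge the target networks toward the online networks.
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)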