def create_player(load_weights=True, user_model=False):
    env = create_env()
    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env)
    _, n_channels, screen_height, screen_width = init_screen.shape  # 3, 40, 60

    if user_model:
        policy_net = DQNUser(screen_height, screen_width, n_actions, KERNEL_SIZE, N_LAYERS).to(device)
        policy_net.eval()
        target_net = DQNUser(screen_height, screen_width, n_actions, KERNEL_SIZE, N_LAYERS).to(device)
        target_net.eval()
    else:
        policy_net = DQN(screen_height, screen_width, n_actions).to(device)
        policy_net.eval()
        target_net = DQN(screen_height, screen_width, n_actions).to(device)
        target_net.eval()

    if load_weights:
        model_dir = "models"
        model_file_name = "mean100_659.pth"
        policy_net.load_state_dict(
            torch.load(f"{model_dir}/{model_file_name}", map_location='cpu'))
        target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9999999)
    memory = ReplayMemory(REPLAY_MEM)
    fake_memory = ReplayMemory(REPLAY_MEM)

    player = Player(env, policy_net, target_net, optimizer, scheduler, memory, fake_memory)
    return player
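# create_env() and get_screen() are project helpers that are not included in this
# snippet. For context, a hedged sketch of a get_screen-style helper is shown
# below; the render call, the resize target, and the name get_screen_sketch are
# assumptions, not the original implementation.
import numpy as np
import torch
import torchvision.transforms as T


def get_screen_sketch(env):
    # Render the gym environment to an RGB array and convert HWC -> CHW.
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255.0
    screen = torch.from_numpy(screen)
    # Downscale and add a batch dimension -> shape [1, C, H, W].
    resize = T.Compose([T.ToPILImage(), T.Resize(40), T.ToTensor()])
    return resize(screen).unsqueeze(0)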
def train(self):
    # feature dimension
    state_dim = self.env.observation_space.shape[0]
    # number of actions
    n_actions = self.env.action_space.n

    self.q_net = DQN(state_dim, n_actions).to(self.args.device)
    self.target_net = DQN(state_dim, n_actions).to(self.args.device)
    self.target_net.load_state_dict(self.q_net.state_dict())
    self.target_net.train()
    self.q_net.train()

    # self.optimizer = optim.RMSprop(self.q_net.parameters())
    self.optimizer = optim.Adam(self.q_net.parameters(), self.args.LR)
    self.memory = ReplayMemory(self.args.memory_cap)

    self.steps_done = 0
    self.eps = self.args.EPS_START
    self.episode_durations = []

    for i_episode in range(self.args.num_episodes):
        self.state = self.env.reset()
        self.state = torch.Tensor(self.state).to(self.args.device).unsqueeze(0)
        for t in count():
            done = self.__step(t)
            if done or t >= self.args.END_EPSISODE:
                self.episode_durations.append(t + 1)
                if len(self.episode_durations) > 100:
                    ave = np.mean(self.episode_durations[-100:])
                else:
                    ave = np.mean(self.episode_durations)
                if i_episode % 10 == 0:
                    print("[Episode {:>5}] steps: {:>5} ave: {:>5}".format(i_episode, t, ave))
                break

    plt.figure()
    plt.clf()
    durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
    means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
    plt.plot(means.numpy())
    plt.title('training')
    plt.xlabel('episode')
    plt.ylabel('duration')
    plt.savefig('res.png')
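# train() above calls self.__step(t), which is not included in this snippet. The
# sketch below is an assumption about what such a step method typically does in a
# loop like this (epsilon-greedy action, env.step, store the transition, optimize);
# the name __step_sketch and the self.memory.push(...) signature are hypothetical.
def __step_sketch(self, t):
    # Epsilon-greedy action selection from the online network.
    if random.random() < self.eps:
        action = random.randrange(self.env.action_space.n)
    else:
        with torch.no_grad():
            action = self.q_net(self.state).argmax(dim=1).item()

    next_state, reward, done, _ = self.env.step(action)
    next_state = torch.Tensor(next_state).to(self.args.device).unsqueeze(0)

    # Store the transition and advance the current state.
    self.memory.push(self.state, action, reward, next_state, done)
    self.state = next_state

    # One optimization step on a sampled minibatch would go here.
    return done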
def main():
    env = Environment()
    gamma = 0.99
    period = 100
    learning_rate = 1e-6
    min_memory_size = 1000
    max_memory_size = 10000
    batch_size = 32
    num_episodes = 100
    layers = [env.observation_space.n, 444, 222, env.action_space.n]

    memory = ReplayMemory(max_memory_size)
    dqn = DQN(layers, learning_rate)
    init_memory(env, memory, min_memory_size)

    iters = 0
    for n_ep in range(num_episodes):
        eps = compute_eps(n_ep, 10)
        reward, iters = train_one_episode(env, dqn, memory, gamma, batch_size, eps, period, iters)
        print(n_ep, reward)

    make_submission(env, dqn)
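# main() above relies on compute_eps and init_memory, which are defined elsewhere.
# The sketches below are assumptions about what they might look like (a linear
# epsilon anneal and a random-policy warm-up); the memory.push signature and the
# default epsilon bounds are hypothetical.
def compute_eps(n_ep, eps_decay_episodes, eps_start=1.0, eps_end=0.05):
    # Linearly anneal epsilon from eps_start to eps_end over eps_decay_episodes episodes.
    frac = min(n_ep / eps_decay_episodes, 1.0)
    return eps_start + frac * (eps_end - eps_start)


def init_memory(env, memory, min_memory_size):
    # Fill the replay memory with transitions collected by a random policy.
    state = env.reset()
    while len(memory) < min_memory_size:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state, done)
        state = env.reset() if done else next_state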
TARGET_UPDATE = 10

init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

n_actions = env.action_space.n

policyNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet.load_state_dict(policyNet.state_dict())  # Use the parameters of policyNet to evaluate targetNet
targetNet.eval()

optimizer = optim.RMSprop(policyNet.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    # print('select_action received state of dim: ', state.size())
    if sample > eps_threshold:
        with torch.no_grad():  # Not updating, using the policyNet to simply predict the next action to take
            # t.max(1) will return largest column value of each row.
            # The second element of the max result is the index of the max entry,
            # so we pick the action with the larger expected reward.
            return policyNet(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
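# EPS_START, EPS_END and EPS_DECAY are defined outside this snippet. Assuming the
# usual tutorial values (EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200), the
# exponential schedule above decays roughly as:
#   steps_done =    0  ->  eps_threshold ~ 0.900
#   steps_done =  200  ->  eps_threshold ~ 0.363   (0.05 + 0.85 / e)
#   steps_done = 1000  ->  eps_threshold ~ 0.056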
        torch.save(self.target.state_dict(), target_PATH)

        policy_PATH = f"policy_episode_{self.episodes}_{self.steps}"
        target_PATH = f"target_episode_{self.episodes}_{self.steps}"
        torch.save(self.policy.state_dict(), policy_PATH)
        torch.save(self.target.state_dict(), target_PATH)
        env.close()


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using Device {device}")

    target = DQN(device=device).to(device)
    policy = DQN(device=device).to(device)
    model = torch.load("policy_episode_14300_8617296", map_location=torch.device("cpu"))
    policy.load_state_dict(model)
    target.load_state_dict(model)

    mem = ReplayMemory(TrainPongV0.MEMORY_SIZE)
    trainer = TrainPongV0(target, policy, mem, device)
    try:
        trainer.train(50000)
    finally:
        np.save("rewards", trainer.total_rewards, allow_pickle=True)
class Snake(object):
    actions = ['FORWARD', 'LEFT', 'RIGHT']
    orientations = ['UP', 'DOWN', 'LEFT', 'RIGHT']

    def __init__(
        self,
        length,
        box_dimensions,
        device,
        BATCH_SIZE=128,
        GAMMA=0.999,
        EPS_START=0.9,
        EPS_END=0.05,
        EPS_DECAY=200,
        TARGET_UPDATE=10,
    ):
        self.length = length
        self.box_dimensions = box_dimensions
        self.width = box_dimensions[0]
        self.height = box_dimensions[1]
        self.epsilon = 0.01
        self.orientation = 'LEFT'
        self.device = device

        self.policy_net = DQN(self.height, self.width, len(self.actions)).to(device)
        self.target_net = DQN(self.height, self.width, len(self.actions)).to(device)
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.cumulative_reward = 0.0
        self.episode = 1

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self._reset(setup=True)

    def _reset(self, setup=False):
        self.i_pos = self.box_dimensions / 2
        self.body_position = [
            np.array([self.i_pos[0] - x, self.i_pos[1]], dtype=int)
            for x in range(self.length)
        ]
        self.cumulative_reward = 0.0
        if not setup:
            self.episode += 1
            print('On episode {}, step {}'.format(self.episode, self.steps_done))

    def act(self, state):
        a = self.select_action(state)
        return self.actions[a.item()]

    def _convert_action_to_tensor(self, action):
        return torch.tensor(self.actions.index(action)).view(1, -1)

    def _convert_move_to_point(self, move):
        if move == 'FORWARD':
            return self._handle_forward(move)
        elif move == 'LEFT':
            return self._handle_left(move)
        elif move == 'RIGHT':
            return self._handle_right(move)
        else:
            raise ValueError('Invalid move')

    def is_colliding(self, position):
        curr_body = list(self.body_position)
        curr_body.pop()
        for _, elt in enumerate(curr_body):
            if np.array_equal(elt, position):
                return True
        return False

    def select_action(self, state):
        sample = np.random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            m.exp(-1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                state_tensor = torch.tensor(state, device=self.device).double()
                state_shape = list(state_tensor.size())
                state_tensor = state_tensor.view(1, 1, state_shape[0], state_shape[1])
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state_tensor).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(len(self.actions))]],
                                device=self.device, dtype=torch.long)

    def _handle_forward(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            return np.array([x_pos, y_pos + 1])
        elif self.orientation == 'DOWN':
            return np.array([x_pos, y_pos - 1])
        elif self.orientation == 'LEFT':
            return np.array([x_pos + 1, y_pos])
        elif self.orientation == 'RIGHT':
            return np.array([x_pos - 1, y_pos])
        else:
            raise ValueError('Invalid orientation')

    def _handle_left(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            self.orientation = 'LEFT'
            return np.array([x_pos + 1, y_pos])
        elif self.orientation == 'DOWN':
            self.orientation = 'RIGHT'
            return np.array([x_pos - 1, y_pos])
        elif self.orientation == 'LEFT':
            self.orientation = 'DOWN'
            return np.array([x_pos, y_pos - 1])
        elif self.orientation == 'RIGHT':
            self.orientation = 'UP'
            return np.array([x_pos, y_pos + 1])
        else:
            raise ValueError('Invalid orientation')

    def _handle_right(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            self.orientation = 'RIGHT'
            return np.array([x_pos - 1, y_pos])
        elif self.orientation == 'DOWN':
            self.orientation = 'LEFT'
            return np.array([x_pos + 1, y_pos])
        elif self.orientation == 'LEFT':
            self.orientation = 'UP'
            return np.array([x_pos, y_pos + 1])
        elif self.orientation == 'RIGHT':
            self.orientation = 'DOWN'
            return np.array([x_pos, y_pos - 1])
        else:
            raise ValueError('Invalid orientation')

    def process_reward(self, reward):
        self.cumulative_reward += reward

    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat(
            [torch.from_numpy(s) for s in batch.next_state if s is not None])
        state_batch = torch.cat(
            [torch.from_numpy(s) for s in batch.state if s is not None])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(
            state_batch.view(self.BATCH_SIZE, 1, 10, 10)).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        # Reshape with -1 so this also works when the batch contains terminal states
        # (there may be fewer than BATCH_SIZE non-final next states).
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.view(-1, 1, 10, 10)).float().max(1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(
            state_action_values,
            expected_state_action_values.double().unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
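# The Snake class assumes a Transition namedtuple and a ReplayMemory defined
# elsewhere. A minimal sketch compatible with how they are used above
# (Transition fields state/action/next_state/reward, memory.sample(batch_size),
# len(memory)) is given here; the class name ReplayMemorySketch and the push()
# method are assumptions, not the original implementation.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemorySketch(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store a single Transition, discarding the oldest one when full.
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)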
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


if __name__ == '__main__':
    # Create and wrap the environment
    env = gym.make('game-stock-exchange-continuous-v0')
    env = DummyVecEnv([lambda: env])

    action_dim = 2
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_shape[0], obs_shape[1], action_dim)

    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000
    episode = 0
    while episode < max_episode:
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1
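# run_episode() is defined elsewhere in this script. A hedged sketch of the usual
# structure of such a function in this style of DQN training loop follows; the
# agent.sample/agent.learn/rpm.append/rpm.sample calls, the BATCH_SIZE constant,
# and the name run_episode_sketch are assumptions about the surrounding code, not
# the original implementation.
def run_episode_sketch(env, agent, rpm):
    obs = env.reset()
    total_reward = 0
    while True:
        # Epsilon-greedy action from the agent.
        action = agent.sample(obs)
        next_obs, reward, done, _ = env.step(action)

        # Store the transition in the replay memory.
        rpm.append((obs, action, reward, next_obs, done))

        # Once the warm-up is done, learn from a sampled minibatch.
        if len(rpm) > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_done)

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward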
class AgentTrainer():
    def __init__(self):
        self.BATCH_SIZE = 128
        self.GAMMA = 0.99
        self.EPS_START = 1.0
        self.EPS_END = 0.05
        self.EPS_DECAY = 0.000005
        self.TARGET_UPDATE = 5
        self.pretrain_length = self.BATCH_SIZE
        # self.state_size = [55, 3]
        self.action_size = 3
        self.hot_actions = np.array(np.identity(self.action_size).tolist())
        # self.action_size = len(self.hot_actions)
        self.learning_rate = 0.0005
        # self.total_episodes = 12
        self.max_steps = 1000
        self.env = Environment()
        self.memory_maxsize = 10000
        self.DQNetwork = DQNetwork(learning_rate=self.learning_rate, name='DQNetwork')
        self.TargetNetwork = DQNetwork(learning_rate=self.learning_rate, name='TargetNetwork')
        self.memory = ReplayMemory(max_size=self.memory_maxsize)
        self.saver = tf.train.Saver()
        # self.TargetUpdate = update_target_graph()

    def select_action(self, sess, decay_step, state, actions):
        ## EPSILON GREEDY STRATEGY: choose action a from state s using epsilon greedy.
        ## First we randomize a number
        exp_exp_tradeoff = np.random.rand()

        explore_probability = self.EPS_END + (self.EPS_START - self.EPS_END) * np.exp(-self.EPS_DECAY * decay_step)

        if (explore_probability > exp_exp_tradeoff):
            # Make a random action (exploration)
            choice = random.randint(1, len(self.hot_actions)) - 1
            action = self.hot_actions[choice]
            # print('action_taken is random', action)
        else:
            # Get action from Q-network (exploitation)
            # Estimate the Q values for this state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={self.DQNetwork.inputs_: state.reshape((1,) + state.shape)})

            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = self.hot_actions[choice]

        return action, explore_probability

    # This function helps us to copy one set of variables to another
    def update_target_graph(self):
        # Get the parameters of our DQNetwork
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
        # Get the parameters of our TargetNetwork
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
        op_holder = []
        # Assign DQNetwork parameters to the TargetNetwork
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def train(self, num_episodes, sess):
        # Instantiate memory
        # memory = Memory(max_size=memory_size)
        for i in range(self.pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()

            # Get the next_state, the rewards, done by taking a random action
            choice = random.randint(1, len(self.hot_actions)) - 1
            action = self.hot_actions[choice]
            next_state, reward, done = self.env.step(np.argmax(action))

            # If the episode is finished (we're dead 3x)
            if done:
                # We finished the episode
                next_state = np.zeros(state.shape)

                # Add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # Start a new episode
                state = self.env.reset()
            else:
                # Add experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # print("adding to memory")
                sys.stdout.flush()

                # Our new state is now the next_state
                state = next_state

        decay_step = 0
        rewards_list = []
        total_steps = 0

        for episode in range(num_episodes):
            # print('episode', episode)
            # Set step to 0
            step = 0
            total_reward = 0

            # Initialize the rewards of the episode
            episode_rewards = []

            # Make a new episode and observe the first state
            state = self.env.reset()
            done = False
            # cv2.imshow(state)
            # cv2.waitKey(100)

            while not done:
                step += 1
                total_steps += 1
                # Increase decay_step
                decay_step += 1

                # Predict the action to take and take it
                action, explore_probability = self.select_action(sess, decay_step, state, self.hot_actions)

                # Perform the action and get the next_state, reward, and done information
                next_state, reward, done = self.env.step(np.argmax(action))

                # Add the reward to total reward
                episode_rewards.append(reward)

                # If the game is finished
                if done:
                    # The episode ends so there is no next state
                    next_state = np.zeros(state.shape, dtype=int)
                    steps_taken = step

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)
                    rewards_list.append((episode, total_reward))

                    # Store transition <st, at, rt+1, st+1> in memory D
                    self.memory.add((state, action, reward, next_state, done))
                else:
                    # Stack the frame of the next_state
                    # next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Add experience to memory
                    self.memory.add((state, action, reward, next_state, done))
                    steps_taken = step

                    # st+1 is now our current state
                    state = next_state

                ### LEARNING PART
                # Obtain a random mini-batch from memory
                batch = self.memory.sample(self.BATCH_SIZE)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for next_state from the online network
                Qs_next_state = sess.run(self.DQNetwork.output,
                                         feed_dict={self.DQNetwork.inputs_: next_states_mb})

                # Calculate Q_target for all actions at that state
                q_target_next_state = sess.run(self.TargetNetwork.output,
                                               feed_dict={self.TargetNetwork.inputs_: next_states_mb})

                # Set Q_target = r if the episode ends at s+1,
                # otherwise set Q_target = r + gamma * maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    # If we are in a terminal state, the target only equals the reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + self.GAMMA * np.max(q_target_next_state[i])
                        # print(target)
                        # print(Qs_next_state[i])
                        target_Qs_batch.append(target)

                targets_mb = np.array([each for each in target_Qs_batch])
                # print(targets_mb)

                loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                   feed_dict={self.DQNetwork.inputs_: states_mb,
                                              self.DQNetwork.target_Q: targets_mb,
                                              self.DQNetwork.actions_: actions_mb})

            if episode % self.TARGET_UPDATE == 0:
                # Update the parameters of our TargetNetwork with the DQNetwork weights
                # update_target = self.TargetUpdate()
                sess.run(self.update_target_graph())
                print("Target Model updated")

            # Write TF Summaries
            # summary = sess.run(write_op, feed_dict={self.DQNetwork.inputs_: states_mb,
            #                                         self.DQNetwork.target_Q: targets_mb,
            #                                         self.DQNetwork.actions_: actions_mb})
            # writer.add_summary(summary, episode)
            # writer.flush()

            print('Total steps: {}'.format(total_steps),
                  'Episode: {}'.format(episode),
                  'Step: {}'.format(steps_taken),
                  'Total reward: {}'.format(np.sum(episode_rewards)),
                  'Explore P: {:.4f}'.format(explore_probability),
                  'Training Loss {:.4f}'.format(loss))

            # Save model every 100 episodes
            if episode % 100 == 0:
                save_path = self.saver.save(sess, "./models/model.ckpt")
                print("Model Saved")
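# A minimal sketch (an assumption, not part of the original snippet) of how
# AgentTrainer might be driven with a TF1 session:
if __name__ == "__main__":
    trainer = AgentTrainer()
    with tf.Session() as sess:
        # Initialize all graph variables, then run the training loop.
        sess.run(tf.global_variables_initializer())
        trainer.train(num_episodes=500, sess=sess)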
class DQN():
    def __init__(self, conf, device):
        self.conf = conf
        self.state_dim = conf['state_dim']
        self.action_dim = conf['action_dim']
        self.device = device

        self.q = DQNNetwork(self.state_dim, self.action_dim).to(self.device)
        self.q_target = DQNNetwork(self.state_dim, self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q.state_dict())
        self.q_target.eval()

        self.memory = ReplayMemory(self.conf)
        self.optimizer = optim.Adam(self.q.parameters(), lr=lr_dqn)
        self.loss = HuberLoss()
        self.loss = self.loss.to(self.device)

        self.currIteration = 0

    def update(self):
        for i in range(1):
            if self.memory.length() < self.conf['batch_size']:
                return
            transitions = self.memory.sample_batch(self.conf['batch_size'])
            one_batch = Transition(*zip(*transitions))

            action_batch = torch.cat(one_batch.action).view(-1, 1)  # [batch_size, 1]
            reward_batch = torch.cat(one_batch.reward).view(-1, 1)  # [batch_size, 1]
            state_batch = torch.cat(one_batch.state).view(-1, self.conf['state_dim'])
            next_state_batch = torch.cat(one_batch.next_state).view(-1, self.conf['state_dim'])
            # dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

            # compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken
            current_q = self.q(state_batch).gather(1, action_batch)

            # compute V(s_{t+1}) for all next states and all actions,
            # and we then take max_a { V(s_{t+1}) }
            next_q = self.q_target(next_state_batch).max(1)[0].view(-1, 1)

            # compute target q by: r + gamma * max_a { V(s_{t+1}) }
            target_q = reward_batch + (self.conf['gamma'] * next_q)
            # print("current_q:%s, target_q:%s" % (current_q[0].item(), target_q[0].item()))

            # optimizer step
            loss = self.loss(current_q, target_q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            return loss.cpu().item()

    ### epsilon-greedy algorithm ###
    def select_action(self, state_ts):
        '''
        Input 'state_ts' is a tensor of shape [1, 2*num_plant, 1] (already unsqueezed).
        Output 'action' is a long tensor of shape [1].
        '''
        sample = random.random()
        eps_threshold = self.conf['epsilon_end'] + (
            self.conf['epsilon_start'] - self.conf['epsilon_end']) * math.exp(
                -1. * self.currIteration / self.conf['epsilon_decay'])
        self.currIteration += 1
        # if (self.currIteration % 1000) == 0:
        #     print("currIteration:%s, eps_threshold:%s" % (self.currIteration, eps_threshold))
        if sample > eps_threshold:
            with torch.no_grad():
                # argmax_a Q(s): .argmax() returns the index of the best action,
                # and .view(1) reshapes that scalar index into a 1-element tensor
                action = self.q.forward(state_ts).argmax().view(1)
                return action
        else:
            action = torch.tensor([random.randint(0, self.conf['action_dim'] - 1)],
                                  device=self.device, dtype=torch.long)  # [1]
            return action
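# HuberLoss is imported from elsewhere in this project. A minimal sketch of a
# module with the behavior the update() step above expects (an assumption; it
# simply wraps PyTorch's SmoothL1Loss, which is the Huber loss with delta=1):
import torch.nn as nn


class HuberLossSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss = nn.SmoothL1Loss()

    def forward(self, current_q, target_q):
        # Huber loss between predicted Q(s, a) and the bootstrapped target.
        return self.loss(current_q, target_q)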
class MultipendulumSim():
    def __init__(self, env, conf):
        self.env = env
        self.conf = conf
        self.set_cuda()
        self.agent = DDPG(conf, self.device)
        self.memory = ReplayMemory(conf)

    def train(self):
        self.epi_rewards = []
        self.epi_losses = []
        for epi in range(self.conf['num_episode']):  # episodes
            print("--- episode %s ---" % (epi))
            epi_reward = 0.0
            state = self.env.reset()  # [2*num_plant, 1]
            state_ts = to_tensor(state).unsqueeze(0)  # [1, 2*num_plant, 1]
            # unsqueeze(0) on 'state' is necessary for the replay memory
            dataPlot = dataPlotter_v2(self.conf)
            t = P.t_start
            while t < P.t_end:  # one episode (simulation)
                t_next_plot = t + P.t_plot
                while t < t_next_plot:  # data plot period
                    if round(t, 3) * 1000 % 10 == 0:  # every 10 ms, schedule update
                        action = self.agent.select_action(state_ts)  # action type: tensor [1X1]
                        next_state, reward, done, info = self.env.step(action.item(), t)
                        # shape of next_state: [(2*num_plant) X 1]
                        epi_reward += reward
                        # self.env.step(0, t)  # test for env.step() function
                        if done:
                            next_state_ts = None
                            break
                        else:
                            next_state_ts = to_tensor(next_state).unsqueeze(0)  # [1, 2*num_plant, 1]
                        reward_ts = to_tensor(np.asarray(reward).reshape(-1))
                        # its size should be [1] for the replay buffer

                        # memory push
                        self.memory.push_transition(state_ts, action, next_state_ts, reward_ts)
                        state_ts = next_state_ts

                        # model optimization step
                        currLoss = self.agent.optimization_model(self.memory)
                    else:  # every 1 ms
                        self.env.update_plant_state(t)  # plant status update
                    t = t + P.Ts
                # self.update_dataPlot(dataPlot, t)  # update data plot
                if next_state_ts is None:  # episode terminates
                    dataPlot.close()
                    break

            # episode done
            self.epi_rewards.append(epi_reward)
            self.epi_losses.append(currLoss)

            # The target network has its weights kept frozen most of the time
            if epi % self.conf['target_update'] == 0:
                self.agent.scheduler_target.load_state_dict(self.agent.scheduler.state_dict())

        # Save state_dict
        torch.save(self.agent.scheduler.state_dict(), MODEL_PATH)
        self.save_log()
        self.load_log()

    def save_log(self):
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(self.epi_rewards)
        combined_stats['rollout/return_history'] = self.epi_rewards
        combined_stats['train/loss'] = self.epi_losses
        with open(LOG_PATH + LOG_FILE, 'wb') as f:
            pickle.dump(combined_stats, f)
        # combined_stats['train/loss_scheduler'] =
        # combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        # combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        # combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        # combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        # combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        # combined_stats['total/duration'] = duration
        # combined_stats['total/steps_per_second'] = float(t) / float(duration)
        # combined_stats['total/episodes'] = episodes
        # combined_stats['rollout/episodes'] = epoch_episodes
        # combined_stats['rollout/actions_std'] = np.std(epoch_actions)

    def load_log(self):
        with open(LOG_PATH + LOG_FILE, 'rb') as f:
            data = pickle.load(f)
        print("data:", data)

    def set_cuda(self):
        self.is_cuda = torch.cuda.is_available()
        print("torch version: ", torch.__version__)
        print("is_cuda: ", self.is_cuda)
        if self.is_cuda:
            print(torch.cuda.get_device_name(0))
            self.device = torch.device("cuda:0")
            print("Program will run on *****GPU-CUDA***** ")
        else:
            self.device = torch.device("cpu")
            print("Program will run on *****CPU***** ")

    def update_dataPlot(self, dataPlot, t):
        r_buff, x_buff, u_buff = self.env.get_current_plant_status()
        for i in range(self.env.num_plant):
            dataPlot.update(i, t, r_buff[i], x_buff[i], u_buff[i])
        dataPlot.plot()
        plt.pause(0.0001)
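# A hedged example of how MultipendulumSim might be instantiated and run. The conf
# keys and the MultipendulumEnv constructor are assumptions inferred from the code
# above (this class only reads 'num_episode' and 'target_update' directly; the
# remaining entries are consumed by DDPG and ReplayMemory).
if __name__ == "__main__":
    conf = {
        'num_episode': 100,   # number of training episodes
        'target_update': 10,  # episodes between target-network syncs
        'batch_size': 128,
        'gamma': 0.99,
    }
    env = MultipendulumEnv(conf)  # hypothetical environment constructor
    sim = MultipendulumSim(env, conf)
    sim.train()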