def create_model(loaded):
    '''
    Create an AutoEncoder from data that has previously been saved.

    Parameters:
        loaded   A checkpoint dictionary that has been loaded from a file

    Returns:
        Newly created AutoEncoder
    '''
    old_args = loaded['args_dict']
    enl, dnl = AutoEncoder.get_non_linearity(old_args['nonlinearity'])
    product = AutoEncoder(encoder_sizes=old_args['encoder'],
                          encoding_dimension=old_args['dimension'],
                          encoder_non_linearity=enl,
                          decoder_non_linearity=dnl,
                          decoder_sizes=old_args['decoder'])
    product.load_state_dict(loaded['model_state_dict'])
    return product
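# create_model expects a checkpoint holding both the constructor arguments
# and the weights. A minimal companion sketch of the matching save side;
# the function name save_model and the path 'autoencoder.pt' are
# illustrative assumptions, not taken from this repo.
def save_model(model, args_dict, path='autoencoder.pt'):
    # Persist the constructor arguments alongside the weights so that
    # create_model can rebuild the network without the original script.
    torch.save({'args_dict': args_dict,
                'model_state_dict': model.state_dict()},
               path)

# Round trip: restored = create_model(torch.load('autoencoder.pt'))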
from copy import deepcopy

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

# AutoEncoder and normalize_vec are defined elsewhere in this repo.

class DDQNAgent:

    def __init__(self, env, network, buffer, epsilon=0.05, batch_size=32):
        # Pre-trained autoencoder used to encode raw observations into a
        # 25-dimensional latent code before they reach the Q-network.
        self.ae = AutoEncoder(25)
        self.ae.load_state_dict(torch.load('lunar_models/code25.pt',
                                           map_location=torch.device('cpu')))
        self.env = env
        self.network = network
        self.target_network = deepcopy(network)
        self.buffer = buffer
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.window = 100
        self.reward_threshold = 195  # Avg reward before CartPole is "solved"
        self.initialize()

    def take_step(self, mode='train'):
        # Encode the current state with the autoencoder before acting.
        norm_s0 = normalize_vec(self.s_0)
        s0_tensor = torch.from_numpy(norm_s0).float()
        ae_out, ae_code = self.ae(Variable(s0_tensor.to('cpu')), 100, 100, 50)
        new_s = ae_code.detach().to('cpu').numpy()

        if mode == 'explore':
            action = self.env.action_space.sample()
        else:
            action = self.network.get_action(new_s, epsilon=self.epsilon)
        self.step_count += 1

        s_1, r, done, _ = self.env.step(action)
        self.rewards += r

        # Encode the successor state the same way before storing the transition.
        norm_s1 = normalize_vec(s_1)
        ae_out, ae_code = self.ae(
            Variable(torch.from_numpy(norm_s1).float().to('cpu')), 100, 100, 50)
        new_s1 = ae_code.detach().to('cpu').numpy()

        self.buffer.append(new_s, action, r, done, new_s1)
        self.s_0 = s_1.copy()

        if done:
            self.s_0 = self.env.reset()
        return done

    # Implement the DDQN training algorithm
    def train(self, gamma=0.99, max_episodes=10000,
              batch_size=32,
              network_update_frequency=4,
              network_sync_frequency=2000):
        self.gamma = gamma

        # Populate the replay buffer with random transitions before training.
        while self.buffer.burn_in_capacity() < 1:
            self.take_step(mode='explore')

        ep = 0
        training = True
        while training:
            self.s_0 = self.env.reset()
            self.rewards = 0
            done = False
            while not done:
                if ep % 50 == 0:  # Render every 50th episode
                    self.env.render()
                done = self.take_step(mode='train')

                # Update network
                if self.step_count % network_update_frequency == 0:
                    self.update()
                # Sync networks
                if self.step_count % network_sync_frequency == 0:
                    self.target_network.load_state_dict(
                        self.network.state_dict())
                    self.sync_eps.append(ep)

                if done:
                    ep += 1
                    self.training_rewards.append(self.rewards)
                    self.training_loss.append(np.mean(self.update_loss))
                    self.update_loss = []
                    mean_rewards = np.mean(
                        self.training_rewards[-self.window:])
                    self.mean_training_rewards.append(mean_rewards)

                    print("\rEpisode {:d} Mean Rewards {:.2f}\t\t".format(
                        ep, mean_rewards), end="")

                    if ep >= max_episodes:
                        training = False
                        print('\nEpisode limit reached.')
                        break
                    if mean_rewards >= self.reward_threshold:
                        training = False
                        print('\nEnvironment solved in {} episodes!'.format(ep))
                        break

    def calculate_loss(self, batch):
        states, actions, rewards, dones, next_states = [i for i in batch]
        rewards_t = torch.FloatTensor(rewards).to(
            device=self.network.device).reshape(-1, 1)
        actions_t = torch.LongTensor(np.array(actions)).reshape(-1, 1).to(
            device=self.network.device)
        dones_t = torch.BoolTensor(dones).to(device=self.network.device)

        qvals = torch.gather(self.network.get_qvals(states), 1, actions_t)

        #################################################################
        # DDQN update: the online network chooses the next action, the
        # target network evaluates it.
        next_actions = torch.max(self.network.get_qvals(next_states),
                                 dim=-1)[1]
        next_actions_t = next_actions.reshape(-1, 1).to(
            device=self.network.device)
        target_qvals = self.target_network.get_qvals(next_states)
        qvals_next = torch.gather(target_qvals, 1, next_actions_t).detach()
        #################################################################

        qvals_next[dones_t] = 0  # Zero out terminal states
        expected_qvals = self.gamma * qvals_next + rewards_t
        loss = nn.MSELoss()(qvals, expected_qvals)
        return loss

    def update(self):
        self.network.optimizer.zero_grad()
        batch = self.buffer.sample_batch(batch_size=self.batch_size)
        loss = self.calculate_loss(batch)
        loss.backward()
        self.network.optimizer.step()
        if self.network.device == 'cuda':
            self.update_loss.append(loss.detach().cpu().numpy())
        else:
            self.update_loss.append(loss.detach().numpy())

    def initialize(self):
        self.training_rewards = []
        self.training_loss = []
        self.update_loss = []
        self.mean_training_rewards = []
        self.sync_eps = []
        self.rewards = 0
        self.step_count = 0
        self.s_0 = self.env.reset()
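# DDQNAgent only relies on three buffer methods: append, sample_batch, and
# burn_in_capacity. A minimal replay-buffer sketch satisfying that interface;
# the class name, capacity, and burn_in size are assumptions, not taken
# from this repo.
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'done', 'next_state'])

class ExperienceReplayBuffer:
    def __init__(self, capacity=50000, burn_in=10000):
        self.burn_in = burn_in
        self.memory = deque(maxlen=capacity)

    def append(self, state, action, reward, done, next_state):
        self.memory.append(Experience(state, action, reward, done, next_state))

    def sample_batch(self, batch_size=32):
        # Return column-wise tuples (states, actions, rewards, dones,
        # next_states), matching the unpacking in DDQNAgent.calculate_loss.
        return zip(*random.sample(self.memory, batch_size))

    def burn_in_capacity(self):
        # Fraction of the burn-in target collected so far; train() keeps
        # calling take_step(mode='explore') until this reaches 1.
        return len(self.memory) / self.burn_in

# Wiring it together (environment name and network construction are
# assumptions):
#   env = gym.make('LunarLander-v2')
#   agent = DDQNAgent(env, network, ExperienceReplayBuffer())
#   agent.train()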
print(DEVICE)
train = get_dataset()

if False:  # Flip to True to inspect the data before training
    print("Showing random images from dataset")
    showRandomImaged(train)

model = AutoEncoder().cuda() if torch.cuda.is_available() else AutoEncoder()

if __DEBUG__:
    print(model)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)

if LOAD:
    model.load_state_dict(torch.load(PATH))

for epoch in range(EPOCHS):
    for i, (images, _) in enumerate(train):
        images = images.to(DEVICE)
        out = model(images)
        loss = criterion(out, images)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ## LOG
        print(f"epoch {epoch}/{EPOCHS}\nLoss : {loss.item()}")
        if __DEBUG__:
            if i % 10 == 0:
                pass  # periodic debug hook (every 10 batches)
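# The loop above never writes the trained weights back to disk. A one-line
# sketch that reuses the same PATH the LOAD branch reads from; treating PATH
# as the intended checkpoint location is an assumption.
torch.save(model.state_dict(), PATH)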