def __init__(self, env, hidden_layer=[64, 64]):
    self.env = env
    #self.env.env.disableViewer = False
    self.num_inputs = env.observation_space.shape[0]
    self.num_outputs = env.action_space.shape[0]
    self.hidden_layer = hidden_layer
    self.params = Params()
    self.Net = ActorCriticNet
    self.model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
    self.model.share_memory()
    self.shared_obs_stats = Shared_obs_stats(self.num_inputs)
    self.memory = ReplayMemory(10000000)
    self.value_memory = ReplayMemory(10000000)
    self.test_mean = []
    self.test_std = []
    self.noisy_test_mean = []
    self.noisy_test_std = []
    self.fig = plt.figure()
    #self.fig2 = plt.figure()
    self.lr = self.params.lr
    plt.show(block=False)
    self.test_list = []
    self.noisy_test_list = []
    self.queue = mp.Queue()
    self.value_queue = mp.Queue()
    self.mpdone = [mp.Event(), mp.Event(), mp.Event(), mp.Event()]
    self.process = []
    self.traffic_light = TrafficLight()
    self.counter = Counter()
    self.best_trajectory = ReplayMemory(5000)
    self.best_score_queue = mp.Queue()
    self.best_score = mp.Value("f", 0)
    self.max_reward = mp.Value("f", 1)
    self.expert_trajectory = ReplayMemory(1e7)
    self.validation_trajectory = ReplayMemory(6000 * 9)
    self.best_validation = 1.0
    self.current_best_validation = 1.0
    self.return_obs_stats = Shared_obs_stats(1)
    self.gpu_model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
    self.base_controller = None
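# `Shared_obs_stats` is defined elsewhere in this repo. For reference, a minimal
# sketch of the contract assumed above (a running mean/std over observations,
# updated by observes() and applied by normalize(), held in shared tensors so
# rollout workers and the learner agree); this is illustrative, not the repo's
# exact implementation:
import torch
from torch.autograd import Variable

class SharedObsStatsSketch(object):
    def __init__(self, num_inputs):
        # shared memory so all worker processes update the same statistics
        self.n = torch.zeros(num_inputs).share_memory_()
        self.mean = torch.zeros(num_inputs).share_memory_()
        self.mean_diff = torch.zeros(num_inputs).share_memory_()
        self.var = torch.ones(num_inputs).share_memory_()

    def observes(self, obs):
        # Welford-style running update with one observation of shape (1, n)
        x = obs.data.squeeze()
        self.n += 1.0
        last_mean = self.mean.clone()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var[:] = torch.clamp(self.mean_diff / self.n, min=1e-2)

    def normalize(self, obs):
        return (obs - Variable(self.mean)) / Variable(self.var.sqrt())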
def main():
    # start = time.time()
    config = load_config()
    num_agents = config['num_agents']
    state_dim = config['state_dim']
    state_length = config['state_length']
    action_dim = config['action_dim']
    exploration_param = config['exploration_param']
    lr = config['learning_rate']
    betas = config['betas']
    gamma = config['discount_factor']
    K_epochs = config['ppo_epoch']
    ppo_clip = config['ppo_clip']

    torch.manual_seed(123)
    traffic_light = TrafficLight()
    counter = Counter()

    shared_model = PPO(state_dim, state_length, action_dim, exploration_param,
                       lr, betas, gamma, K_epochs, ppo_clip)
    shared_model.policy.share_memory()
    batch_buffer = shared_batch_buffer()
    # optimizer = optim.Adam(shared_model.policy.parameters(), lr=lr)

    processes = []
    p = mp.Process(target=chief, args=(config, traffic_light, counter, shared_model, batch_buffer))
    p.start()
    processes.append(p)
    for rank in range(num_agents):
        p = mp.Process(target=train, args=(rank, config, traffic_light, counter, shared_model, batch_buffer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
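# `TrafficLight` and `Counter` are the DPPO-style synchronization helpers used
# throughout this repo (imported from its utils; not defined in this file).
# A minimal sketch of the assumed contract: each agent pushes a batch,
# increments the counter, then spins until the chief flips the light; the chief
# waits for all agents, applies the gradient update, and calls switch() to
# release them. Illustrative only, not the repo's exact classes:
import torch.multiprocessing as mp

class TrafficLightSketch(object):
    def __init__(self):
        self.val = mp.Value("b", False)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def switch(self):
        with self.lock:
            self.val.value = not self.val.value

class CounterSketch(object):
    def __init__(self):
        self.val = mp.Value("i", 0)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def increment(self):
        with self.lock:
            self.val.value += 1

    def reset(self):
        with self.lock:
            self.val.value = 0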
parser.add_argument('--test', action='store_true', help='test')
parser.add_argument('--feature', type=int, default=153, help='number of features')
parser.add_argument('--force', action='store_true', help='force two legs together')
parser.add_argument('--start-epoch', type=int, default=0, help='start epoch')

if __name__ == '__main__':
    args = parser.parse_args()
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.manual_seed(args.seed)
    num_inputs = args.feature
    num_actions = 18

    traffic_light = TrafficLight()
    counter = Counter()
    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = optim.Adam(ac_net.parameters(), lr=args.lr)
    shared_grad_buffers = Shared_grad_buffers(ac_net)
    shared_obs_stats = Shared_obs_stats(num_inputs)

    if args.resume:
        print("=> loading checkpoint ")
        checkpoint = torch.load('../../7.87.t7')
        #checkpoint = torch.load('../../best.t7')
        args.start_epoch = checkpoint['epoch']
        #best_prec1 = checkpoint['best_prec1']
        ac_net.load_state_dict(checkpoint['state_dict'])
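# Note: the resume branch above expects a checkpoint saved as a dict with
# 'epoch' and 'state_dict' keys. A matching save call would look like this
# (sketch; the target path is illustrative, not fixed by this file):
#
#   torch.save({'epoch': epoch, 'state_dict': ac_net.state_dict()}, 'checkpoint.t7')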
class RL(object):
    # PPO learner with mirror-symmetry and expert-imitation losses for the
    # Cassie environments; rollouts run in worker processes synchronized
    # through TrafficLight/Counter.
    def __init__(self, env, hidden_layer=[64, 64]):
        self.env = env
        #self.env.env.disableViewer = False
        self.num_inputs = env.observation_space.shape[0]
        self.num_outputs = env.action_space.shape[0]
        self.hidden_layer = hidden_layer
        self.params = Params()
        self.model = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.model.share_memory()
        self.shared_obs_stats = Shared_obs_stats(self.num_inputs)
        self.best_model = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.memory = ReplayMemory(self.params.num_steps * 10000)
        self.test_mean = []
        self.test_std = []
        self.noisy_test_mean = []
        self.noisy_test_std = []
        self.fig = plt.figure()
        #self.fig2 = plt.figure()
        self.lr = self.params.lr
        plt.show(block=False)
        self.test_list = []
        self.noisy_test_list = []
        self.queue = mp.Queue()
        self.mpdone = [mp.Event(), mp.Event(), mp.Event(), mp.Event()]
        self.process = []
        self.traffic_light = TrafficLight()
        self.counter = Counter()
        self.best_trajectory = ReplayMemory(300)
        self.best_score_queue = mp.Queue()
        self.best_score = mp.Value("f", 0)
        self.expert_trajectory = ReplayMemory(600000)
        self.validation_trajectory = ReplayMemory(6000 * 9)
        self.best_validation = 1.0
        self.current_best_validation = 1.0
        self.noise = mp.Value("f", -0.5)

    def normalize_data(self, num_iter=50000, file='shared_obs_stats.pkl'):
        # Roll out the current policy to accumulate observation statistics,
        # then pickle them for later runs.
        state = self.env.reset()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        for i in range(num_iter):
            self.shared_obs_stats.observes(state)
            state = self.shared_obs_stats.normalize(state)
            mu, log_std, v = model_old(state)
            eps = torch.randn(mu.size())
            action = (mu + log_std.exp() * Variable(eps))
            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = self.env.step(env_action)
            if done:
                state = self.env.reset()
            state = Variable(torch.Tensor(state).unsqueeze(0))
        with open(file, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def run_test(self, num_test=1):
        # Deterministic evaluation: act with the policy mean.
        state = self.env.reset()  #_for_test()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        ave_test_reward = 0
        total_rewards = []
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu, log_std, v = self.model(state)
                action = mu.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                #print(state)
                #print("done", done, "state", state)
                if done:
                    state = self.env.reset()  #_for_test()
                    #print(self.env.position)
                    #print(self.env.time)
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.test_mean.append(reward_mean)
        self.test_std.append(reward_std)
        self.test_list.append((reward_mean, reward_std))
        #print(self.model.state_dict())

    def run_test_with_noise(self, num_test=10):
        # Evaluation with fixed exploration noise (std 0.1) on the mean action.
        state = self.env.reset()  #_for_test()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        ave_test_reward = 0
        total_rewards = []
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu, log_std, v = self.model(state)
                eps = torch.randn(mu.size())
                action = (mu + 0.1 * Variable(eps))
                action = action.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    state = self.env.reset()  #_for_test()
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.noisy_test_mean.append(reward_mean)
        self.noisy_test_std.append(reward_std)
        self.noisy_test_list.append((reward_mean, reward_std))

    def plot_statistics(self):
        ax = self.fig.add_subplot(121)
        ax2 = self.fig.add_subplot(122)
        low = []
        high = []
        index = []
        noisy_low = []
        noisy_high = []
        for i in range(len(self.test_mean)):
            low.append(self.test_mean[i] - self.test_std[i])
            high.append(self.test_mean[i] + self.test_std[i])
            noisy_low.append(self.noisy_test_mean[i] - self.noisy_test_std[i])
            noisy_high.append(self.noisy_test_mean[i] + self.noisy_test_std[i])
            index.append(i)
        plt.xlabel('iterations')
        plt.ylabel('average rewards')
        ax.plot(self.test_mean, 'b')
        ax2.plot(self.noisy_test_mean, 'g')
        ax.fill_between(index, low, high, color='cyan')
        ax2.fill_between(index, noisy_low, noisy_high, color='r')
        #ax.plot(map(sub, test_mean, test_std))
        self.fig.canvas.draw()

    def collect_samples(self, num_samples, start_state=None, noise=-2.0, env_index=0, random_seed=1):
        # Worker loop: roll out num_samples steps, push the batch to the queue,
        # then spin on the traffic light until the learner finishes its update.
        # Note: the shared self.noise value overrides the noise argument here.
        random.seed(random_seed)
        torch.manual_seed(random_seed + 1)
        np.random.seed(random_seed + 2)
        if start_state is None:
            start_state = self.env.reset()
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        values = []
        q_values = []
        real_rewards = []
        self.model.set_noise(self.noise.value)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        model_old.set_noise(self.noise.value)
        state = start_state
        state = Variable(torch.Tensor(state).unsqueeze(0))
        total_reward = 0
        #q_value = Variable(torch.zeros(1, 1))
        while True:
            self.model.set_noise(self.noise.value)
            model_old.set_noise(self.noise.value)
            signal_init = self.traffic_light.get()
            score = 0
            while samples < num_samples and not done:
                state = self.shared_obs_stats.normalize(state)
                states.append(state.data.numpy())
                mu, log_std, v = model_old(state)
                eps = torch.randn(mu.size())
                #print(log_std.exp())
                action = (mu + log_std.exp() * Variable(eps))
                actions.append(action.data.numpy())
                values.append(v.data.numpy())
                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(env_action)
                score += reward
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                real_rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_state = self.shared_obs_stats.normalize(state)
                next_states.append(next_state.data.numpy())
                samples += 1
            state = self.shared_obs_stats.normalize(state)
            _, _, v = model_old(state)
            # Bootstrapped discounted returns: R_t = r_t + gamma * R_{t+1},
            # seeded with V(s_T) when the rollout was truncated, 0 if it terminated.
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = Variable(R)
            for i in reversed(range(len(real_rewards))):
                R = self.params.gamma * R + Variable(torch.from_numpy(real_rewards[i]))
                q_values.insert(0, R.data.numpy())
            self.queue.put([states, actions, next_states, rewards, q_values])
            self.counter.increment()
            self.env.reset()
            while self.traffic_light.get() == signal_init:
                pass
            start_state = self.env.reset()
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            values = []
            q_values = []
            real_rewards = []
            model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
            model_old.load_state_dict(self.model.state_dict())
            model_old.set_noise(self.noise.value)

    def collect_expert_samples(self, num_samples, filename, noise=-2.0, speed=0, y_speed=0, validation=False):
        # Roll out a pretrained expert policy and store near-perfect episodes
        # (score >= 299) into the expert / validation trajectory buffers.
        expert_env = cassieRLEnvMirrorWithTransition()
        start_state = expert_env.reset_by_speed(speed, y_speed)
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        values = []
        q_values = []
        self.model.set_noise(self.noise.value)
        model_expert = ActorCriticNet(85, 10, [256, 256])
        model_expert.load_state_dict(torch.load(filename))
        model_expert.set_noise(self.noise.value)
        with open('torch_model/cassie3dMirror2kHz_shared_obs_stats.pkl', 'rb') as input:
            expert_shared_obs_stats = pickle.load(input)
        residual_model = ActorCriticNet(85, 10, [256, 256])
        residual_model.load_state_dict(torch.load("torch_model/StablePelvisNov14_v2.pt"))
        state = start_state
        virtual_state = np.concatenate([np.copy(state[0:46]), np.zeros(39)])
        state = Variable(torch.Tensor(state).unsqueeze(0))
        virtual_state = Variable(torch.Tensor(virtual_state).unsqueeze(0))
        total_reward = 0
        total_sample = 0
        #q_value = Variable(torch.zeros(1, 1))
        if validation:
            max_sample = 300
        else:
            max_sample = 3000
        while total_sample < max_sample:
            model_expert.set_noise(self.noise.value)
            score = 0
            while samples < num_samples and not done:
                state = expert_shared_obs_stats.normalize(state)
                virtual_state = expert_shared_obs_stats.normalize(virtual_state)
                states.append(state.data.numpy())
                mu, log_std, v = model_expert(state)
                mu_residual, _, _ = residual_model(state)
                #print(log_std.exp())
                action = (mu + mu_residual * 0)  # residual contribution disabled
                pos_index = [7, 8, 9, 14, 20, 21, 22, 23, 28, 34]
                vel_index = [6, 7, 8, 12, 18, 19, 20, 21, 25, 31]
                ref_pos, ref_vel = expert_env.get_kin_next_state()
                saved_action = action.data.numpy() + ref_pos[pos_index]
                actions.append(action.data.numpy())
                #actions.append(saved_action)
                values.append(v.data.numpy())
                eps = torch.randn(mu.size())
                weight = 0.1  # identical in both branches of the original code
                mu = (action + np.exp(-2) * Variable(eps))
                env_action = mu.data.squeeze().numpy()
                state, reward, done, _ = expert_env.step(env_action)
                reward = 1  # replace the env reward with a survival bonus
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                #q_value = self.gamma * q_value + Variable(reward * torch.ones(1))
                virtual_state = np.concatenate([np.copy(state[0:46]), np.zeros(39)])
                virtual_state = Variable(torch.Tensor(virtual_state).unsqueeze(0))
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_state = expert_shared_obs_stats.normalize(state)
                next_states.append(next_state.data.numpy())
                samples += 1
                #total_sample += 1
                score += reward
            print("expert score", score)
            state = expert_shared_obs_stats.normalize(state)
            #print(state)
            _, _, v = model_expert(state)
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = torch.ones(1, 1) * 100  # overrides the bootstrap above with a fixed terminal value
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = self.params.gamma * R + Variable(torch.from_numpy(rewards[i]))
                q_values.insert(0, R.data.numpy())
            if not validation and score >= 299:
                self.expert_trajectory.push([states, actions, next_states, rewards, q_values])
                total_sample += 300
            elif score >= 299:
                self.validation_trajectory.push([states, actions, next_states, rewards, q_values])
            expert_env.reset_by_speed(speed, y_speed)
            start_state = expert_env.reset_by_speed(speed, y_speed)
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            values = []
            q_values = []

    def update_critic(self, batch_size, num_epoch):
        self.model.train()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr * 10)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        for k in range(num_epoch):
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.memory.sample(batch_size)
            batch_states = Variable(torch.Tensor(batch_states))
            batch_q_values = Variable(torch.Tensor(batch_q_values))
            batch_next_states = Variable(torch.Tensor(batch_next_states))
            _, _, v_pred_next = model_old(batch_next_states)
            _, _, v_pred = self.model(batch_states)
            loss_value = (v_pred - batch_q_values)**2
            #loss_value = (v_pred_next * self.params.gamma + batch_rewards - v_pred)**2
            loss_value = 0.5 * torch.mean(loss_value)
            optimizer.zero_grad()
            loss_value.backward(retain_graph=True)
            optimizer.step()
            #print(loss_value)

    def update_actor(self, batch_size, num_epoch, supervised=False):
        # PPO clipped-surrogate update, optionally with an expert-imitation
        # term, plus a mirror-symmetry loss.
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        model_old.set_noise(self.noise.value)
        self.model.train()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        for k in range(num_epoch):
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.memory.sample(batch_size)
            #mirror
            batch_mirror_states = np.copy(batch_states)
            #batch_mirror_actions = np.copy(batch_actions)
            batch_states = Variable(torch.Tensor(batch_states))
            batch_q_values = Variable(torch.Tensor(batch_q_values))
            batch_actions = Variable(torch.Tensor(batch_actions))
            mu_old, log_std_old, v_pred_old = model_old(batch_states)
            #mu_old_next, log_std_old_next, v_pred_old_next = model_old(batch_next_states)
            mu, log_std, v_pred = self.model(batch_states)
            batch_advantages = batch_q_values - v_pred_old
            # normal() returns log-densities, so the importance ratio is
            # exp(log pi(a|s) - log pi_old(a|s)).
            probs_old = normal(batch_actions, mu_old, log_std_old)
            probs = normal(batch_actions, mu, log_std)
            ratio = (probs - probs_old).exp()
            ratio = ratio.unsqueeze(1)
            #print(model_old.noise)
            #print(ratio)
            surr1 = ratio * batch_advantages
            surr2 = ratio.clamp(1 - self.params.clip, 1 + self.params.clip) * batch_advantages
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            #expert loss
            if supervised is True:
                if k % 1000 == 999:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(len(self.expert_trajectory.memory))
                else:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(batch_size)
                batch_expert_states = Variable(torch.Tensor(batch_expert_states))
                batch_expert_actions = Variable(torch.Tensor(batch_expert_actions))
                mu_expert, _, _ = self.model(batch_expert_states)
                mu_expert_old, _, _ = model_old(batch_expert_states)
                loss_expert1 = torch.mean((batch_expert_actions - mu_expert)**2)
                clip_expert_action = torch.max(torch.min(mu_expert, mu_expert_old + 0.1), mu_expert_old - 0.1)
                loss_expert2 = torch.mean((clip_expert_action - batch_expert_actions)**2)
                loss_expert = loss_expert1  #torch.min(loss_expert1, loss_expert2)
            else:
                loss_expert = 0
            #mirror loss
            (
                negation_obs_indices,
                right_obs_indices,
                left_obs_indices,
                negation_action_indices,
                right_action_indices,
                left_action_indices,
            ) = self.env.get_mirror_indices()
            batch_mirror_states[:, negation_obs_indices] *= -1
            rl = np.concatenate((right_obs_indices, left_obs_indices))
            lr = np.concatenate((left_obs_indices, right_obs_indices))
            batch_mirror_states[:, rl] = batch_mirror_states[:, lr]
            #with torch.no_grad():
            batch_mirror_actions, _, _ = self.model(batch_states)
            batch_mirror_actions_clone = batch_mirror_actions.clone()
            batch_mirror_actions_clone[:, negation_action_indices] = batch_mirror_actions[:, negation_action_indices] * -1
            rl = np.concatenate((right_action_indices, left_action_indices))
            lr = np.concatenate((left_action_indices, right_action_indices))
            batch_mirror_actions_clone[:, rl] = batch_mirror_actions[:, lr]
            #batch_mirror_actions_v2[:,]
            #print(vars(batch_mirror_actions))
            batch_mirror_states = Variable(torch.Tensor(batch_mirror_states))
            #batch_mirror_actions = Variable(torch.Tensor(batch_mirror_actions))
            mirror_mu, _, _ = self.model(batch_mirror_states)
            mirror_loss = torch.mean((mirror_mu - batch_mirror_actions_clone)**2)
            total_loss = 1.0 * loss_clip + self.weight * loss_expert + mirror_loss
            #print(k, loss_expert)
            '''self.validation()
            if k % 1000 == 999:
                #self.run_test(num_test=2)
                #self.run_test_with_noise(num_test=2)
                #self.plot_statistics()
                self.save_model("expert_model/SupervisedModel16X16Jan11.pt")
                if (self.current_best_validation - self.best_validation) > -1e-5:
                    break
                if self.best_validation > self.current_best_validation:
                    self.best_validation = self.current_best_validation
                self.current_best_validation = 1.0
                print(k, loss_expert)'''
            #print(loss_clip)
            optimizer.zero_grad()
            total_loss.backward(retain_graph=True)
            #print(torch.nn.utils.clip_grad_norm(self.model.parameters(),1))
            optimizer.step()
        if self.lr > 1e-4:
            self.lr *= 0.99
        if self.weight > 10:
            self.weight *= 0.99
        if self.weight < 10:
            self.weight = 10.0

    def validation(self):
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.validation_trajectory.sample(300)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        batch_states = Variable(torch.Tensor(batch_states))
        batch_q_values = Variable(torch.Tensor(batch_q_values))
        batch_actions = Variable(torch.Tensor(batch_actions))
        mu_old, log_std_old, v_pred_old = model_old(batch_states)
        loss = torch.mean((batch_actions - mu_old)**2)
        if loss.data < self.current_best_validation:
            self.current_best_validation = loss.data
        print("validation error", self.current_best_validation)

    def clear_memory(self):
        self.memory.clear()

    def save_model(self, filename):
        torch.save(self.model.state_dict(), filename)

    def save_shared_obs_stas(self, filename):
        with open(filename, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def save_statistics(self, filename):
        stats = [self.noisy_test_mean, self.noisy_test_std]  # renamed to avoid shadowing the statistics module
        with open(filename, 'wb') as output:
            pickle.dump(stats, output, pickle.HIGHEST_PROTOCOL)

    def collect_samples_multithread(self):
        #queue = Queue.Queue()
        self.lr = 1e-4
        self.weight = 10
        num_threads = 100
        seeds = [np.random.randint(0, 4294967296) for _ in range(num_threads)]
        ts = [
            mp.Process(target=self.collect_samples, args=(600,),
                       kwargs={'noise': -0.5, 'random_seed': seed})
            for seed in seeds
        ]
        for t in ts:
            t.start()
        #print("started")
        self.model.set_noise(self.noise.value)
        while True:
            #if len(self.noisy_test_mean) % 100 == 1:
            #    self.save_statistics("stats/MirrorJuly17Iter%d_v2.stat"%(len(self.noisy_test_mean)))
            self.save_model("torch_model/StepperSep13.pt")
            #print(self.traffic_light.val.value)
            #if len(self.test_mean) % 100 == 1 and self.test_mean[len(self.test_mean)-1] > 300:
            #    self.save_model("torch_model/multiskill/v4_cassie3dMirrorIter%d.pt"%(len(self.test_mean),))
            # Gather one batch from every worker, then flip the traffic light
            # so the workers start their next rollout while we update.
            while len(self.memory.memory) < 60000:
                #print(len(self.memory.memory))
                if self.counter.get() == num_threads:
                    for i in range(num_threads):
                        self.memory.push(self.queue.get())
                    self.counter.increment()
                if len(self.memory.memory) < 60000 and self.counter.get() == num_threads + 1:
                    self.counter.reset()
                    self.traffic_light.switch()
            self.update_critic(128, 1280)
            self.update_actor(128, 1280, supervised=False)
            self.clear_memory()
            #self.run_test(num_test=2)
            self.run_test_with_noise(num_test=2)
            #self.validation()
            self.plot_statistics()
            if self.noise.value > -1.5:
                self.noise.value *= 1.001  # anneal exploration noise toward -1.5 (log-std units)
            print(self.noise.value)
            self.model.set_noise(self.noise.value)
            self.traffic_light.switch()
            self.counter.reset()

    def add_env(self, env):
        self.env_list.append(env)  # assumes self.env_list is created by the caller
class RL(object):
    # GPU variant of the learner: workers roll out the shared CPU model, and
    # updates run on self.gpu_model, which is synced back each iteration.
    def __init__(self, env, hidden_layer=[64, 64]):
        self.env = env
        #self.env.env.disableViewer = False
        self.num_inputs = env.observation_space.shape[0]
        self.num_outputs = env.action_space.shape[0]
        self.hidden_layer = hidden_layer
        self.params = Params()
        self.Net = ActorCriticNet
        self.model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.model.share_memory()
        self.shared_obs_stats = Shared_obs_stats(self.num_inputs)
        self.memory = ReplayMemory(10000000)
        self.value_memory = ReplayMemory(10000000)
        self.test_mean = []
        self.test_std = []
        self.noisy_test_mean = []
        self.noisy_test_std = []
        self.fig = plt.figure()
        #self.fig2 = plt.figure()
        self.lr = self.params.lr
        plt.show(block=False)
        self.test_list = []
        self.noisy_test_list = []
        self.queue = mp.Queue()
        self.value_queue = mp.Queue()
        self.mpdone = [mp.Event(), mp.Event(), mp.Event(), mp.Event()]
        self.process = []
        self.traffic_light = TrafficLight()
        self.counter = Counter()
        self.best_trajectory = ReplayMemory(5000)
        self.best_score_queue = mp.Queue()
        self.best_score = mp.Value("f", 0)
        self.max_reward = mp.Value("f", 1)
        self.expert_trajectory = ReplayMemory(1e7)
        self.validation_trajectory = ReplayMemory(6000 * 9)
        self.best_validation = 1.0
        self.current_best_validation = 1.0
        self.return_obs_stats = Shared_obs_stats(1)
        self.gpu_model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.base_controller = None

    def normalize_data(self, num_iter=1000, file='shared_obs_stats.pkl'):
        # Accumulate observation statistics by stepping the env with zero actions.
        state = self.env.reset()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        #model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        #model_old.load_state_dict(self.model.state_dict())
        for i in range(num_iter):
            print(i)
            self.shared_obs_stats.observes(state)
            state = self.shared_obs_stats.normalize(state)  #.to(device)
            #mu = self.model.sample_actions(state)
            #action = mu  #(mu + log_std.exp()*Variable(eps))
            #env_action = action.cpu().data.squeeze().numpy()
            env_action = np.random.randn(self.num_outputs)
            state, reward, done, _ = self.env.step(env_action * 0)
            if done:
                state = self.env.reset()
            state = Variable(torch.Tensor(state).unsqueeze(0))
        with open(file, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def run_test(self, num_test=1):
        state = self.env.reset()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        ave_test_reward = 0
        total_rewards = []
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu = self.model.sample_best_actions(state)
                action = mu.cpu().data.squeeze().numpy()
                if self.base_controller is not None:
                    # self.base_dim is assumed to be set by the caller when a
                    # base controller is used
                    base_action = self.base_controller.sample_best_actions(state[:, 0:self.base_dim])
                    action += base_action.cpu().data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                #print(state)
                #print("done", done, "state", state)
                if done:
                    state = self.env.reset()
                    #print(self.env.position)
                    #print(self.env.time)
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.test_mean.append(reward_mean)
        self.test_std.append(reward_std)
        self.test_list.append((reward_mean, reward_std))
        #print(self.model.state_dict())

    def run_test_with_noise(self, num_test=10):
        state = self.env.reset()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        ave_test_reward = 0
        total_rewards = []
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu = self.model.sample_actions(state)
                #eps = torch.randn(mu.size())
                action = mu.cpu().data.squeeze().numpy()
                if self.base_controller is not None:
                    base_action = self.base_controller.sample_best_actions(state[:, 0:self.base_dim])
                    action += base_action.cpu().data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    state = self.env.reset()
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.noisy_test_mean.append(reward_mean)
        self.noisy_test_std.append(reward_std)
        self.noisy_test_list.append((reward_mean, reward_std))

    def plot_statistics(self):
        ax = self.fig.add_subplot(121)
        ax2 = self.fig.add_subplot(122)
        low = []
        high = []
        index = []
        noisy_low = []
        noisy_high = []
        for i in range(len(self.test_mean)):
            low.append(self.test_mean[i] - self.test_std[i])
            high.append(self.test_mean[i] + self.test_std[i])
            noisy_low.append(self.noisy_test_mean[i] - self.noisy_test_std[i])
            noisy_high.append(self.noisy_test_mean[i] + self.noisy_test_std[i])
            index.append(i)
        plt.xlabel('iterations')
        plt.ylabel('average rewards')
        ax.plot(self.test_mean, 'b')
        ax2.plot(self.noisy_test_mean, 'g')
        ax.fill_between(index, low, high, color='cyan')
        ax2.fill_between(index, noisy_low, noisy_high, color='r')
        #ax.plot(map(sub, test_mean, test_std))
        self.fig.canvas.draw()

    def collect_samples(self, num_samples, start_state=None, noise=-2.0, env_index=0, random_seed=1):
        # Worker loop: sample actions with exploration noise, record log-probs
        # for the PPO ratio, push finished batches, then wait on the traffic light.
        random.seed(random_seed)
        torch.manual_seed(random_seed + 1)
        np.random.seed(random_seed + 2)
        self.env.seed(random_seed + 3)  # the original seeded a module-level `env`
        #print(noise)
        if start_state is None:
            start_state = self.env.reset()
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        values = []
        q_values = []
        real_rewards = []
        log_probs = []
        noise = self.base_noise * self.explore_noise.value
        self.model.set_noise(noise)
        state = start_state
        state = Variable(torch.Tensor(state).unsqueeze(0))
        total_reward = 0
        #q_value = Variable(torch.zeros(1, 1))
        while True:
            noise = self.base_noise * self.explore_noise.value
            self.model.set_noise(noise)
            #print("local", self.model.p_fcs[1].bias.data[0])
            #self.model.load_state_dict(torch.load(self.model_name))
            signal_init = self.traffic_light.get()
            score = 0
            while samples < num_samples and not done:
                #self.shared_obs_stats.observes(state)
                states.append(state.cpu().data.numpy())
                #print("samples", samples)
                state = self.shared_obs_stats.normalize(state)
                action = self.model.sample_actions(state)
                log_prob = self.model.calculate_prob(state, action)
                actions.append(np.copy(action.cpu().data.numpy()))
                log_probs.append(log_prob.data.numpy())
                env_action = action.data.squeeze().numpy()
                if self.base_controller is not None:
                    base_action = self.base_controller.sample_best_actions(state[:, 0:self.base_dim])
                    env_action += base_action.cpu().data.squeeze().numpy()
                state, reward, done, _ = self.env.step(env_action)
                score += reward
                # track the largest observed reward (capped at 50); used to
                # rescale value targets in the updates
                if reward > self.max_reward.value:
                    self.max_reward.value = reward
                if self.max_reward.value > 50:
                    self.max_reward.value = 50
                #print(self.max_reward.value)
                #reward *= 0.3
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                real_rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_states.append(state.cpu().data.numpy())
                next_state = self.shared_obs_stats.normalize(state)
                samples += 1
            state = self.shared_obs_stats.normalize(state)
            v = self.model.get_value(state) * self.max_reward.value  # / self.return_obs_stats.std) + self.return_obs_stats.mean
            # if self.base_controller is not None:
            #     v += self.base_controller.get_value(state)*self.max_reward.value
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = Variable(R)
            for i in reversed(range(len(real_rewards))):
                reward = Variable(torch.from_numpy(real_rewards[i]).unsqueeze(0))
                R = self.params.gamma * R + reward  #self.return_obs_stats.normalize(reward)
                q_values.insert(0, R.cpu().data.numpy())
                #self.return_obs_stats.observes(R)
            #mirror
            # mirror_states = np.array(states)
            # mirror_actions = np.array(actions)
            # (
            #     negation_obs_indices,
            #     right_obs_indices,
            #     left_obs_indices,
            #     negation_action_indices,
            #     right_action_indices,
            #     left_action_indices,
            # ) = self.env.get_mirror_indices()
            # mirror_states[:, :, negation_obs_indices] *= -1
            # rl = np.concatenate((right_obs_indices, left_obs_indices))
            # lr = np.concatenate((left_obs_indices, right_obs_indices))
            # mirror_states[:, :, rl] = mirror_states[:, :, lr]
            # #mirror_actions = self.model.sample_best_actions(batch_states)
            # mirror_actions[:, :, negation_action_indices] = mirror_actions[:, :, negation_action_indices] * -1
            # rl = np.concatenate((right_action_indices, left_action_indices))
            # lr = np.concatenate((left_action_indices, right_action_indices))
            # mirror_actions[:, :, rl] = mirror_actions[:, :, lr]
            # mirror_states = list(mirror_states)
            # mirror_actions = list(mirror_actions)
            # #self.queue.put([mirror_states, mirror_actions, np.copy(next_states), np.copy(rewards), np.copy(q_values), np.copy(log_probs)])
            # value_states = states + mirror_states
            # value_actions = actions + mirror_actions
            # value_next_states = next_states + next_states
            # value_rewards = rewards + rewards
            # value_q_values = q_values + q_values
            # value_log_probs = log_probs + log_probs
            self.queue.put([states, actions, next_states, rewards, q_values, log_probs])
            #self.value_queue.put([value_states, value_actions, value_next_states, value_rewards, value_q_values, value_log_probs])
            self.counter.increment()
            self.env.reset()
            #print(self.model.noise)
            #print(score)
            #if score > self.best_score.value:
            #    self.best_score_queue.put([states, actions, next_states, rewards, q_values])
            #    self.best_score.value = score
            #    print("best score", self.best_score.value)
            #self.max_reward.value = self.best_score.value / samples * 99
            while self.traffic_light.get() == signal_init:
                pass
            start_state = self.env.reset()
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            values = []
            q_values = []
            real_rewards = []
            log_probs = []
            #print("child", self.model.noise)
            #if self.model.noise[0] > -2:
            #    self.model.noise *= 1.001

    def collect_expert_samples(self, num_samples, filename, noise=-2.0, validation=False, difficulty=[0, 0]):
        # Roll out a pretrained stepper policy and keep full-length episodes
        # in the expert / validation buffers.
        import gym
        expert_env = gym.make("mocca_envs:Walker3DStepperEnv-v0")
        expert_env.set_difficulty(difficulty)
        start_state = expert_env.reset()
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        q_values = []
        model_expert = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_expert.load_state_dict(torch.load(filename))
        policy_noise = noise * np.ones(self.num_outputs)
        model_expert.set_noise(policy_noise)
        state = start_state
        state = Variable(torch.Tensor(state).unsqueeze(0))
        total_reward = 0
        total_sample = 0
        #q_value = Variable(torch.zeros(1, 1))
        if validation:
            max_sample = 300
        else:
            max_sample = 50000
        while total_sample < max_sample:
            score = 0
            while samples < num_samples and not done:
                state = self.shared_obs_stats.normalize(state)
                states.append(state.data.numpy())
                mu = model_expert.sample_best_actions(state)
                actions.append(mu.data.numpy())
                eps = torch.randn(mu.size())
                weight = 0.1  # identical in both branches of the original; unused here
                env_action = model_expert.sample_actions(state)
                env_action = env_action.data.squeeze().numpy()
                state, reward, done, _ = expert_env.step(env_action)
                reward = 1  # replace the env reward with a survival bonus
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_state = self.shared_obs_stats.normalize(state)
                next_states.append(next_state.data.numpy())
                samples += 1
                #total_sample += 1
                score += reward
            print("expert score", score)
            state = self.shared_obs_stats.normalize(state)
            #print(state)
            v = model_expert.get_value(state)
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = torch.ones(1, 1) * 100  # overrides the bootstrap above with a fixed terminal value
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = self.params.gamma * R + Variable(torch.from_numpy(rewards[i]))
                q_values.insert(0, R.data.numpy())
            if not validation and score >= num_samples:
                self.expert_trajectory.push([states, actions, next_states, rewards, q_values])
                total_sample += num_samples
            elif score >= num_samples:
                self.validation_trajectory.push([states, actions, next_states, rewards, q_values])
            start_state = expert_env.reset()
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            q_values = []

    def normalize(self):
        for i in range(len(self.memory.memory)):
            batch_states, _, _, _, _ = self.memory.sample_one_at_a_time()
            batch_states = Variable(torch.Tensor(batch_states))
            self.shared_obs_stats.observes(batch_states)

    def update_critic(self, batch_size, num_epoch):
        self.gpu_model.train()
        optimizer = optim.Adam(self.gpu_model.parameters(), lr=10 * self.lr)
        #optimizer = RAdam(self.model.parameters(), lr=self.lr*10)
        for k in range(num_epoch):
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values, _ = self.memory.sample(batch_size)
            batch_states = self.shared_obs_stats.normalize(Variable(torch.Tensor(batch_states))).to(device)
            batch_q_values = Variable(torch.Tensor(batch_q_values)).to(device) / self.max_reward.value
            v_pred = self.gpu_model.get_value(batch_states)
            # if self.base_controller is not None:
            #     v_pred = self.base_controller.get_value(batch_states) + v_pred
            loss_value = (v_pred - batch_q_values)**2
            loss_value = 0.5 * torch.mean(loss_value)
            optimizer.zero_grad()
            loss_value.backward(retain_graph=True)
            optimizer.step()
            #print(loss_value)

    def update_actor(self, batch_size, num_epoch, supervised=False):
        # PPO update; the ratio uses log-probs recorded at rollout time, so no
        # old-policy forward pass is needed.
        model_old = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer).to(device)
        model_old.load_state_dict(self.gpu_model.state_dict())
        model_old.set_noise(self.model.noise)
        self.gpu_model.train()
        optimizer = optim.Adam(self.gpu_model.parameters(), lr=self.lr)
        #optimizer = RAdam(self.model.parameters(), lr=self.lr)
        for k in range(num_epoch):
            batch_states, batch_actions, _, _, batch_q_values, batch_log_probs = self.memory.sample(batch_size)
            #mirror
            batch_mirror_states = np.copy(batch_states)
            batch_states = self.shared_obs_stats.normalize(Variable(torch.Tensor(batch_states))).to(device)
            batch_q_values = Variable(torch.Tensor(batch_q_values)).to(device) / self.max_reward.value
            #batch_q_values = self.return_obs_stats.normalize(Variable(torch.Tensor(batch_q_values)))
            batch_actions = Variable(torch.Tensor(batch_actions)).to(device)
            v_pred_old = model_old.get_value(batch_states)
            # if self.base_controller is not None:
            #     v_pred_old += self.base_controller.get_value(batch_states)
            batch_advantages = (batch_q_values - v_pred_old)
            #probs_old = model_old.calculate_prob_gpu(batch_states, batch_actions)
            #probs = self.model.calculate_prob_gpu(batch_states, batch_actions)
            #mu_old = model_old.get_mean_actions(batch_states)[0]
            #mu = self.model.get_mean_actions(batch_states)[0]
            # log_std_old = model_old.get_log_stds(mu_old)
            # log_std = self.model.get_log_stds(mu)
            # probs_old = normal(batch_actions, mu_old, log_std_old)
            # probs = normal(batch_actions, mu, log_std)
            # ratio = (probs.exp()/probs_old.exp())
            # print("ratio1", ratio.mean())
            probs = self.gpu_model.calculate_prob_gpu(batch_states, batch_actions)
            probs_old = Variable(torch.Tensor(batch_log_probs)).to(device)  #model_old.calculate_prob_gpu(batch_states, batch_actions)
            ratio = (probs - probs_old).exp()
            ratio = ratio.unsqueeze(1)
            #print("ratio", ratio)
            #print(probs, probs_old)
            surr1 = ratio * batch_advantages
            surr2 = ratio.clamp(1 - self.params.clip, 1 + self.params.clip) * batch_advantages
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            #expert loss
            if supervised is True:
                if k % 1000 == 999:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(len(self.expert_trajectory.memory))
                else:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(min(batch_size, len(self.expert_trajectory.memory)))
                batch_expert_states = Variable(torch.Tensor(batch_expert_states)).to(device)
                batch_expert_actions = Variable(torch.Tensor(batch_expert_actions)).to(device)
                mu_expert = self.gpu_model.sample_best_actions(batch_expert_states)
                loss_expert = torch.mean((batch_expert_actions - mu_expert)**2)
                print(loss_expert)
            else:
                loss_expert = 0
            #mirror loss
            # (
            #     negation_obs_indices,
            #     right_obs_indices,
            #     left_obs_indices,
            #     negation_action_indices,
            #     right_action_indices,
            #     left_action_indices,
            # ) = self.env.get_mirror_indices()
            # batch_mirror_states[:, negation_obs_indices] *= -1
            # rl = np.concatenate((right_obs_indices, left_obs_indices))
            # lr = np.concatenate((left_obs_indices, right_obs_indices))
            # batch_mirror_states[:, rl] = batch_mirror_states[:, lr]
            # #with torch.no_grad():
            # batch_mirror_actions = self.gpu_model.sample_best_actions(batch_states)
            # if self.base_controller is not None:
            #     batch_mirror_actions = self.base_controller.sample_best_actions(batch_states) + batch_mirror_actions
            # batch_mirror_actions_clone = batch_mirror_actions.clone()
            # batch_mirror_actions_clone[:, negation_action_indices] = batch_mirror_actions[:, negation_action_indices] * -1
            # rl = np.concatenate((right_action_indices, left_action_indices))
            # lr = np.concatenate((left_action_indices, right_action_indices))
            # batch_mirror_actions_clone[:, rl] = batch_mirror_actions[:, lr]
            # batch_mirror_states = Variable(torch.Tensor(batch_mirror_states)).to(device)
            # mirror_mu = self.gpu_model.sample_best_actions(batch_mirror_states)
            # if self.base_controller is not None:
            #     mirror_mu = self.base_controller.sample_best_actions(batch_mirror_states) + mirror_mu
            # mirror_loss = torch.mean((mirror_mu - batch_mirror_actions_clone)**2)
            loss_w = 0  #torch.mean(batch_w**2)
            entropy_loss = -self.gpu_model.log_std.mean()  # computed but unused
            if supervised:
                total_loss = loss_expert
            else:
                total_loss = loss_clip  #+ mirror_loss
            #print(total_loss)
            #print("mirror_loss", mirror_loss)
            #print(k, loss_w)
            optimizer.zero_grad()
            total_loss.backward(retain_graph=True)
            #print(torch.nn.utils.clip_grad_norm(self.model.parameters(),1))
            optimizer.step()
        #print(self.shared_obs_stats.mean.data)
        if self.lr > 1e-4:
            self.lr *= 0.99
        else:
            self.lr = 1e-4
        if self.weight > 10:
            self.weight *= 0.99
        if self.weight < 10:
            self.weight = 10.0

    def validation(self):
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.validation_trajectory.sample(300)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        batch_states = Variable(torch.Tensor(batch_states))
        batch_q_values = Variable(torch.Tensor(batch_q_values))
        batch_actions = Variable(torch.Tensor(batch_actions))
        mu_old, log_std_old, v_pred_old = model_old(batch_states)
        loss = torch.mean((batch_actions - mu_old)**2)
        if loss.data < self.current_best_validation:
            self.current_best_validation = loss.data
        print("validation error", self.current_best_validation)

    def clear_memory(self):
        self.memory.clear()
        self.value_memory.clear()

    def save_model(self, filename):
        torch.save(self.model.state_dict(), filename)

    def save_shared_obs_stas(self, filename):
        with open(filename, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def save_statistics(self, filename):
        stats = [self.time_passed, self.num_samples, self.test_mean, self.test_std,
                 self.noisy_test_mean, self.noisy_test_std]  # renamed to avoid shadowing the statistics module
        with open(filename, 'wb') as output:
            pickle.dump(stats, output, pickle.HIGHEST_PROTOCOL)

    def collect_samples_multithread(self):
        #queue = Queue.Queue()
        import time
        self.start = time.time()
        self.lr = 1e-4
        self.weight = 10
        num_threads = 20
        self.num_samples = 0
        self.time_passed = 0
        score_counter = 0
        total_thread = 0
        max_samples = 40000
        seeds = [i * 100 for i in range(num_threads)]
        self.explore_noise = mp.Value("f", -2.0)
        #self.base_noise = np.array([2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])
        self.base_noise = np.ones(self.num_outputs)
        noise = self.base_noise * self.explore_noise.value
        #noise[[0, 1, 5, 6]] = -3
        ts = [
            mp.Process(target=self.collect_samples, args=(500,),
                       kwargs={'noise': noise, 'random_seed': seed})
            for seed in seeds
        ]
        for t in ts:
            t.start()
        #print("started")
        self.model.set_noise(noise)
        self.gpu_model.set_noise(noise)
        # Stop once the noisy test reward stays above 3500 for 100 consecutive
        # iterations.
        while score_counter < 100:
            #if len(self.noisy_test_mean) % 100 == 1:
            #    self.save_statistics("stats/Humanoid_ppo_seed1_Iter%d.stat"%(len(self.noisy_test_mean)))
            #print(self.traffic_light.val.value)
            #if len(self.test_mean) % 100 == 1 and self.test_mean[len(self.test_mean)-1] > 300:
            #    self.save_model("torch_model/multiskill/v4_cassie3dMirrorIter%d.pt"%(len(self.test_mean),))
            # while len(self.memory.memory) < 50000:
            #     if self.counter.get() == num_threads:
            #         for i in range(num_threads):
            #             self.memory.push(self.queue.get())
            #         self.counter.increment()
            #     if len(self.memory.memory) < 50000 and self.counter.get() == num_threads + 1:
            #         self.counter.reset()
            #         self.traffic_light.switch()
            self.save_model(self.model_name)  # self.model_name is assumed to be set by the caller
            while len(self.memory.memory) < max_samples:
                #print(self.counter.get())
                if self.counter.get() == num_threads:
                    for i in range(num_threads):
                        #if random.randint(0, 1) == 0:
                        self.memory.push(self.queue.get())
                        #self.value_memory.push(self.value_queue.get())
                        # else:
                        #     self.memory.push_half(self.queue.get())
                    total_thread += num_threads
                    self.counter.increment()
                if self.counter.get() == num_threads + 1 and len(self.memory.memory) < max_samples:
                    self.traffic_light.switch()
                    self.counter.reset()
            self.num_samples += len(self.memory.memory)
            #while not self.best_score_queue.empty():
            #    self.best_trajectory.push_half(self.best_score_queue.get())
            #self.normalize()
            #self.model.to(device)
            self.gpu_model.load_state_dict(self.model.state_dict())
            self.gpu_model.to(device)
            self.gpu_model.set_noise(self.model.noise)
            if self.base_controller is not None:
                self.base_controller.to(device)
            self.update_critic(min(128 * 8, len(self.memory.memory)), (len(self.memory.memory) // 3000 + 1) * 8)
            self.update_actor(min(128 * 8, len(self.memory.memory)), (len(self.memory.memory) // 3000 + 1) * 8, supervised=False)
            #self.update_critic(128, 2560)
            #self.update_actor(128, 2560, supervised=False)
            self.gpu_model.to("cpu")
            if self.base_controller is not None:
                self.base_controller.to("cpu")
            self.model.load_state_dict(self.gpu_model.state_dict())
            self.clear_memory()
            self.run_test(num_test=2)
            self.run_test_with_noise(num_test=2)
            print(self.num_samples, self.noisy_test_mean[-1])
            if self.noisy_test_mean[-1] > 3500:
                score_counter += 1
            else:
                score_counter = 0
            if self.explore_noise.value > -1.5:
                print("main", self.model.noise)
                self.explore_noise.value *= 1.001
                self.model.noise = self.base_noise * self.explore_noise.value
            print(self.max_reward.value)
            self.plot_statistics()
            self.time_passed = time.time() - self.start
            total_thread = 0
            #print("main", self.model.p_fcs[1].bias.data[0])
            self.traffic_light.switch()
            self.counter.reset()

    def add_env(self, env):
        self.env_list.append(env)  # assumes self.env_list is created by the caller
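# The return computation inside collect_samples above is a bootstrapped
# Monte-Carlo backup: R_t = r_t + gamma * R_{t+1}, seeded with V(s_T) when the
# rollout was truncated mid-episode and with 0 when it terminated. The same
# logic as a standalone sketch:

def discounted_returns(rewards, gamma, bootstrap_value=0.0):
    # rewards: list of floats for one rollout segment
    # bootstrap_value: V(s_T) if the segment was cut off, else 0.0
    returns = []
    R = bootstrap_value
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

# e.g. discounted_returns([1.0, 1.0, 1.0], 0.99, bootstrap_value=10.0)
# yields [12.67..., 11.79..., 10.9] — each entry is the target for V(s_t).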
class RL(object):
    # Variant used for the CoRL demo on cassieRLEnvMirror; the caller is
    # expected to set policy_weight, critic_update_rate, actor_update_rate,
    # and supervised before calling collect_samples_multithread.
    def __init__(self, env, hidden_layer=[64, 64]):
        self.env = env
        #self.env.env.disableViewer = False
        self.num_inputs = env.observation_space.shape[0]
        self.num_outputs = env.action_space.shape[0]
        self.hidden_layer = hidden_layer
        self.params = Params()
        self.model = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.model.share_memory()
        self.shared_obs_stats = Shared_obs_stats(self.num_inputs)
        self.best_model = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        self.memory = ReplayMemory(self.params.num_steps * 10000)
        self.test_mean = []
        self.test_std = []
        self.noisy_test_mean = []
        self.noisy_test_std = []
        self.fig = plt.figure()
        #self.fig2 = plt.figure()
        self.lr = self.params.lr
        plt.show(block=False)
        self.test_list = []
        self.noisy_test_list = []
        self.queue = mp.Queue()
        self.mpdone = [mp.Event(), mp.Event(), mp.Event(), mp.Event()]
        self.process = []
        self.traffic_light = TrafficLight()
        self.counter = Counter()
        self.best_trajectory = ReplayMemory(300)
        self.best_score_queue = mp.Queue()
        self.best_score = mp.Value("f", 0)
        self.expert_trajectory = ReplayMemory(600000)
        self.validation_trajectory = ReplayMemory(6000 * 9)
        self.best_validation = 1.0
        self.current_best_validation = 1.0

    def normalize_data(self, num_iter=50000, file='shared_obs_stats.pkl'):
        state = self.env.reset_for_normalization()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        for i in range(num_iter):
            self.shared_obs_stats.observes(state)
            state = self.shared_obs_stats.normalize(state)
            mu, log_std, v = model_old(state)
            eps = torch.randn(mu.size())
            action = (mu + log_std.exp() * Variable(eps))
            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = self.env.step(env_action)
            if done:
                state = self.env.reset()
            state = Variable(torch.Tensor(state).unsqueeze(0))
        with open(file, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def run_test(self, num_test=1):
        state = self.env.reset_for_test()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        ave_test_reward = 0
        total_rewards = []
        '''self.fig2.clear()
        circle1 = plt.Circle((0, 0), 0.5, edgecolor='r', facecolor='none')
        circle2 = plt.Circle((0, 0), 0.01, edgecolor='r', facecolor='none')
        plt.axis('equal')'''
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu, log_std, v = self.model(state)
                action = mu.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                #print(state)
                #print("done", done, "state", state)
                if done:
                    state = self.env.reset_for_test()
                    #print(self.env.position)
                    #print(self.env.time)
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.test_mean.append(reward_mean)
        self.test_std.append(reward_std)
        self.test_list.append((reward_mean, reward_std))
        #print(self.model.state_dict())

    def run_test_with_noise(self, num_test=10):
        state = self.env.reset_for_test()
        state = Variable(torch.Tensor(state).unsqueeze(0))
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        ave_test_reward = 0
        total_rewards = []
        '''self.fig2.clear()
        circle1 = plt.Circle((0, 0), 0.5, edgecolor='r', facecolor='none')
        circle2 = plt.Circle((0, 0), 0.01, edgecolor='r', facecolor='none')
        plt.axis('equal')'''
        for i in range(num_test):
            total_reward = 0
            while True:
                state = self.shared_obs_stats.normalize(state)
                mu, log_std, v = self.model(state)
                eps = torch.randn(mu.size())
                action = (mu + 0.1 * Variable(eps))
                action = action.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    state = self.env.reset_for_test()
                    state = Variable(torch.Tensor(state).unsqueeze(0))
                    ave_test_reward += total_reward / num_test
                    total_rewards.append(total_reward)
                    break
                state = Variable(torch.Tensor(state).unsqueeze(0))
        #print("avg test reward is", ave_test_reward)
        reward_mean = statistics.mean(total_rewards)
        reward_std = statistics.stdev(total_rewards)
        self.noisy_test_mean.append(reward_mean)
        self.noisy_test_std.append(reward_std)
        self.noisy_test_list.append((reward_mean, reward_std))

    def plot_statistics(self):
        ax = self.fig.add_subplot(121)
        ax2 = self.fig.add_subplot(122)
        low = []
        high = []
        index = []
        noisy_low = []
        noisy_high = []
        for i in range(len(self.test_mean)):
            low.append(self.test_mean[i] - self.test_std[i])
            high.append(self.test_mean[i] + self.test_std[i])
            noisy_low.append(self.noisy_test_mean[i] - self.noisy_test_std[i])
            noisy_high.append(self.noisy_test_mean[i] + self.noisy_test_std[i])
            index.append(i)
        #ax.set_xlim([0,1000])
        #ax.set_ylim([0,300])
        plt.xlabel('iterations')
        plt.ylabel('average rewards')
        ax.plot(self.test_mean, 'b')
        ax2.plot(self.noisy_test_mean, 'g')
        ax.fill_between(index, low, high, color='cyan')
        ax2.fill_between(index, noisy_low, noisy_high, color='r')
        #ax.plot(map(sub, test_mean, test_std))
        self.fig.canvas.draw()
        #plt.draw()
        #plt.errorbar(self.test_mean)

    def collect_samples(self, num_samples, start_state=None, noise=-2.0, env_index=0, random_seed=1):
        random.seed(random_seed)
        torch.manual_seed(random_seed + 1)
        np.random.seed(random_seed + 2)
        if start_state is None:
            start_state = self.env.reset()
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        values = []
        q_values = []
        self.model.set_noise(noise)
        #print("soemthing 1")
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        #print("something 2")
        model_old.set_noise(noise)
        state = start_state
        state = Variable(torch.Tensor(state).unsqueeze(0))
        total_reward = 0
        #q_value = Variable(torch.zeros(1, 1))
        while True:
            # hardcoded: overrides the noise argument each iteration
            self.model.set_noise(-2.0)
            model_old.set_noise(-2.0)
            signal_init = self.traffic_light.get()
            score = 0
            while samples < num_samples and not done:
                state = self.shared_obs_stats.normalize(state)
                #print(state)
                states.append(state.data.numpy())
                mu, log_std, v = model_old(state)
                eps = torch.randn(mu.size())
                #print(log_std.exp())
                action = (mu + log_std.exp() * Variable(eps))
                actions.append(action.data.numpy())
                values.append(v.data.numpy())
                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = self.env.step(env_action)
                score += reward
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                #q_value = self.gamma * q_value + Variable(reward * torch.ones(1))
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_state = self.shared_obs_stats.normalize(state)
                next_states.append(next_state.data.numpy())
                samples += 1
            state = self.shared_obs_stats.normalize(state)
            #print(state)
            _, _, v = model_old(state)
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = self.params.gamma * R + Variable(torch.from_numpy(rewards[i]))
                q_values.insert(0, R.data.numpy())
            #self.memory.push([states, actions, next_states, rewards, q_values])
            #return [states, actions, next_states, rewards, q_values]
            self.queue.put([states, actions, next_states, rewards, q_values])
            #print(score)
            '''if score >= self.best_score.value:
                self.best_score.value = score
                print("best score", self.best_score.value)
                self.best_score_queue.put([states, actions, next_states, rewards, q_values])'''
            self.counter.increment()
            self.env.reset()
            while self.traffic_light.get() == signal_init:
                pass
            start_state = self.env.reset()
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            values = []
            q_values = []
            model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
            model_old.load_state_dict(self.model.state_dict())
            model_old.set_noise(-2.0)

    def collect_expert_samples(self, num_samples, filename, noise=-2.0, speed=0, y_speed=0, validation=False):
        expert_env = cassieRLEnvMirror()
        start_state = expert_env.reset_by_speed(speed, y_speed)
        samples = 0
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        values = []
        q_values = []
        self.model.set_noise(noise)
        model_expert = ActorCriticNet(85, 10, [256, 256])
        model_expert.load_state_dict(torch.load(filename))
        model_expert.set_noise(noise)
        with open('torch_model/cassie3dMirror2kHz_shared_obs_stats.pkl', 'rb') as input:
            expert_shared_obs_stats = pickle.load(input)
        state = start_state
        virtual_state = np.concatenate([np.copy(state[0:46]), np.zeros(39)])
        state = Variable(torch.Tensor(state).unsqueeze(0))
        virtual_state = Variable(torch.Tensor(virtual_state).unsqueeze(0))
        total_reward = 0
        total_sample = 0
        #q_value = Variable(torch.zeros(1, 1))
        if validation:
            max_sample = 300
        else:
            max_sample = 600
        while total_sample < max_sample:
            model_expert.set_noise(-2.0)
            score = 0
            while samples < num_samples and not done:
                state = expert_shared_obs_stats.normalize(state)
                virtual_state = expert_shared_obs_stats.normalize(virtual_state)
                states.append(state.data.numpy())
                mu, log_std, v = model_expert(state)
                #print(log_std.exp())
                action = mu
                actions.append(action.data.numpy())
                values.append(v.data.numpy())
                eps = torch.randn(mu.size())
                weight = 0.1  # identical in both branches of the original code
                mu = (action + weight * Variable(eps))
                env_action = mu.data.squeeze().numpy()
                state, reward, done, _ = expert_env.step(env_action)
                reward = 1  # replace the env reward with a survival bonus
                rewards.append(Variable(reward * torch.ones(1)).data.numpy())
                #q_value = self.gamma * q_value + Variable(reward * torch.ones(1))
                virtual_state = np.concatenate([np.copy(state[0:46]), np.zeros(39)])
                virtual_state = Variable(torch.Tensor(virtual_state).unsqueeze(0))
                state = Variable(torch.Tensor(state).unsqueeze(0))
                next_state = expert_shared_obs_stats.normalize(state)
                next_states.append(next_state.data.numpy())
                samples += 1
                #total_sample += 1
                score += reward
            print("expert score", score)
            state = expert_shared_obs_stats.normalize(state)
            #print(state)
            _, _, v = model_expert(state)
            if done:
                R = torch.zeros(1, 1)
            else:
                R = v.data
            R = torch.ones(1, 1) * 100  # overrides the bootstrap above with a fixed terminal value
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = self.params.gamma * R + Variable(torch.from_numpy(rewards[i]))
                q_values.insert(0, R.data.numpy())
            if not validation and score >= 299:
                self.expert_trajectory.push([states, actions, next_states, rewards, q_values])
                total_sample += 300
            elif score >= 299:
                self.validation_trajectory.push([states, actions, next_states, rewards, q_values])
            expert_env.reset_by_speed(speed, y_speed)
            start_state = expert_env.reset_by_speed(speed, y_speed)
            state = start_state
            state = Variable(torch.Tensor(state).unsqueeze(0))
            total_reward = 0
            samples = 0
            done = False
            states = []
            next_states = []
            actions = []
            rewards = []
            values = []
            q_values = []

    def update_critic(self, batch_size, num_epoch):
        self.model.train()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr * 10)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        for k in range(num_epoch):
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.memory.sample(batch_size)
            batch_states = Variable(torch.Tensor(batch_states))
            batch_q_values = Variable(torch.Tensor(batch_q_values))
            batch_next_states = Variable(torch.Tensor(batch_next_states))
            _, _, v_pred_next = model_old(batch_next_states)
            _, _, v_pred = self.model(batch_states)
            loss_value = (v_pred - batch_q_values)**2
            #loss_value = (v_pred_next * self.params.gamma + batch_rewards - v_pred)**2
            loss_value = 0.5 * torch.mean(loss_value)
            optimizer.zero_grad()
            loss_value.backward(retain_graph=True)
            optimizer.step()
            #av_value_loss = loss_value.data[0]
            #model_old.load_state_dict(model.state_dict())
            #print("value loss ", av_value_loss)

    def update_actor(self, batch_size, num_epoch, supervised=False):
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        model_old.set_noise(self.model.noise)
        self.model.train()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        for k in range(num_epoch):
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.memory.sample(batch_size)
            batch_states = Variable(torch.Tensor(batch_states))
            batch_q_values = Variable(torch.Tensor(batch_q_values))
            batch_actions = Variable(torch.Tensor(batch_actions))
            mu_old, log_std_old, v_pred_old = model_old(batch_states)
            #mu_old_next, log_std_old_next, v_pred_old_next = model_old(batch_next_states)
            mu, log_std, v_pred = self.model(batch_states)
            batch_advantages = batch_q_values - v_pred_old
            # normal() returns log-densities, so this is
            # exp(log pi(a|s) - log pi_old(a|s))
            probs_old = normal(batch_actions, mu_old, log_std_old)
            probs = normal(batch_actions, mu, log_std)
            ratio = (probs - probs_old).exp()
            ratio = ratio.unsqueeze(1)
            #print(model_old.noise)
            #print(ratio)
            surr1 = ratio * batch_advantages
            surr2 = ratio.clamp(1 - self.params.clip, 1 + self.params.clip) * batch_advantages
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            #expert loss
            if supervised is True:
                if k % 1000 == 999:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(len(self.expert_trajectory.memory))
                else:
                    batch_expert_states, batch_expert_actions, _, _, _ = self.expert_trajectory.sample(batch_size)
                batch_expert_states = Variable(torch.Tensor(batch_expert_states))
                batch_expert_actions = Variable(torch.Tensor(batch_expert_actions))
                mu_expert, _, _ = self.model(batch_expert_states)
                mu_expert_old, _, _ = model_old(batch_expert_states)
                loss_expert1 = torch.mean((batch_expert_actions - mu_expert)**2)
                clip_expert_action = torch.max(torch.min(mu_expert, mu_expert_old + 0.1), mu_expert_old - 0.1)
                loss_expert2 = torch.mean((clip_expert_action - batch_expert_actions)**2)
                loss_expert = loss_expert1  #torch.min(loss_expert1, loss_expert2)
            else:
                loss_expert = 0
            total_loss = self.policy_weight * loss_clip + self.weight * loss_expert
            print(k, loss_expert)
            optimizer.zero_grad()
            total_loss.backward(retain_graph=True)
            #print(torch.nn.utils.clip_grad_norm(self.model.parameters(),1))
            optimizer.step()
        if self.lr > 1e-4:
            self.lr *= 0.99

    def validation(self):
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_q_values = self.validation_trajectory.sample(300)
        model_old = ActorCriticNet(self.num_inputs, self.num_outputs, self.hidden_layer)
        model_old.load_state_dict(self.model.state_dict())
        batch_states = Variable(torch.Tensor(batch_states))
        batch_q_values = Variable(torch.Tensor(batch_q_values))
        batch_actions = Variable(torch.Tensor(batch_actions))
        mu_old, log_std_old, v_pred_old = model_old(batch_states)
        loss = torch.mean((batch_actions - mu_old)**2)
        if loss.data < self.current_best_validation:
            self.current_best_validation = loss.data
        print("validation error", self.current_best_validation)

    def clear_memory(self):
        self.memory.clear()

    def save_model(self, filename):
        torch.save(self.model.state_dict(), filename)

    def save_shared_obs_stas(self, filename):
        with open(filename, 'wb') as output:
            pickle.dump(self.shared_obs_stats, output, pickle.HIGHEST_PROTOCOL)

    def save_statistics(self, filename):
        stats = [self.noisy_test_mean, self.noisy_test_std]  # renamed to avoid shadowing the statistics module
        with open(filename, 'wb') as output:
            pickle.dump(stats, output, pickle.HIGHEST_PROTOCOL)

    def collect_samples_multithread(self):
        #queue = Queue.Queue()
        self.lr = 1e-4
        self.weight = 10
        num_threads = 10
        seeds = [np.random.randint(0, 4294967296) for _ in range(num_threads)]
        ts = [
            mp.Process(target=self.collect_samples, args=(300,),
                       kwargs={'noise': -2.0, 'random_seed': seed})
            for seed in seeds
        ]
        for t in ts:
            t.start()
        #print("started")
        self.model.set_noise(-2.0)
        while True:
            self.save_model("torch_model/corl_demo.pt")
            while len(self.memory.memory) < 3000:
                #print(len(self.memory.memory))
                if self.counter.get() == num_threads:
                    for i in range(num_threads):
                        self.memory.push(self.queue.get())
                    self.counter.increment()
                if len(self.memory.memory) < 3000 and self.counter.get() == num_threads + 1:
                    self.counter.reset()
                    self.traffic_light.switch()
            self.update_critic(128, self.critic_update_rate)
            self.update_actor(128, self.actor_update_rate, supervised=self.supervised)
            self.clear_memory()
            self.run_test(num_test=2)
            self.run_test_with_noise(num_test=2)
            #self.validation()
            self.plot_statistics()
            self.traffic_light.switch()
            self.counter.reset()

    def add_env(self, env):
        self.env_list.append(env)  # assumes self.env_list is created by the caller
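# The update_actor methods above assume a `normal(...)` helper that returns
# *log*-densities: the ratio is formed as (probs - probs_old).exp(). A sketch
# of a diagonal-Gaussian log-density consistent with that usage (an assumption
# about the helper's definition, not the repo's exact code):

import math

def normal(x, mu, log_std):
    # summed log-density of x under N(mu, exp(log_std)^2), one value per batch row
    std = log_std.exp()
    var = std.pow(2)
    log_density = -(x - mu).pow(2) / (2 * var) - 0.5 * math.log(2 * math.pi) - log_std
    return log_density.sum(dim=1)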