def generate_B(self, coff, gamma, tau, hidden_size, num_inputs, actor_size,
               num_episodes=60000, exploration_end=150, batch_size=512,
               updates_per_step=5000):
    self.env = QuantizationEnv(self.C, self.b, self.x, self.hd, coff)
    self.agent = DDPG(gamma, tau, hidden_size, self.env.action_bin,
                      num_inputs, actor_size)

    rewards = []
    total_numsteps = 0
    updates = 0
    max_trail = 10000
    best_bb = 10000

    # Search for the best quantization scheme over num_episodes episodes.
    for i_episode in range(num_episodes):
        state = torch.Tensor([self.env.reset()])

        if self.ou_noise:
            self.ounoise.scale = (self.noise_scale - self.final_noise_scale) \
                * max(0, exploration_end - i_episode) / exploration_end \
                + self.final_noise_scale
            self.ounoise.reset()

        if self.param_noise:
            self.agent.perturb_actor_parameters(self.param_noise)

        episode_reward = 0
        continuous_neg = 0
        continuous_pos = 0
        temp_trail = 0
        control_bit = 0
        next_state = self.env.compute_Cbx(self.b)
        next_state = torch.Tensor([next_state])

        while True:
            # Wrap the bit index back into the range [0, 15].
            if control_bit > 15:
                control_bit = control_bit % 16

            state = next_state
            action = self.agent.select_action(state, self.ounoise, self.param_noise)
            next_state, reward, done, bb = self.env.step(
                action, control_bit, self.actor_size)
            # print(control_bit, next_state[0], reward, done, bb)
            control_bit = control_bit + 1
            total_numsteps += 1
            episode_reward += reward

            # bb is the c_v value; keep the action that achieves the best one so far.
            if best_bb > bb:
                best_bb = bb
                self.new_b = action

            # Terminate early after more than 10 consecutive positive or
            # negative rewards, or once the episode exceeds max_trail steps.
            if reward > 0:
                continuous_pos += 1
                continuous_neg = 0
                if continuous_pos > 10:
                    done = True
            if reward < 0:
                continuous_neg += 1
                continuous_pos = 0
                if continuous_neg > 10:
                    done = True
            if temp_trail > max_trail:
                done = True

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward = torch.Tensor([reward])

            self.memory.push(state, action, mask, next_state, reward)
            # state = next_state  (state is advanced at the top of the loop)
            temp_trail += 1

            # Not entered until the replay memory holds more than batch_size transitions.
            if len(self.memory) > batch_size:
                for _ in range(updates_per_step):
                    transitions = self.memory.sample(1)
                    batch = Transition(*zip(*transitions))
                    # value_loss comes from the critic (right-hand) network,
                    # policy_loss from the actor (left-hand) network.
                    value_loss, policy_loss = self.agent.update_parameters(batch)
                    print("epoch:", i_episode, "updates:", updates,
                          "value_loss:", value_loss, "policy_loss:", policy_loss)
                    updates += 1

            if done:
                break

        # Update param_noise based on the distance metric, computed over the
        # last batch_size transitions in the replay memory.
        if self.param_noise:
            episode_transitions = self.memory.memory[self.memory.position - batch_size:self.memory.position]
            states = torch.cat([transition[0] for transition in episode_transitions], 0)
            unperturbed_actions = self.agent.select_action(states, None, None)
            perturbed_actions = torch.cat([transition[1] for transition in episode_transitions], 0)
            ddpg_dist = ddpg_distance_metric(perturbed_actions.numpy(),
                                             unperturbed_actions.numpy())
            self.param_noise.adapt(ddpg_dist)

        rewards.append(episode_reward)
        continuous_neg = 0
        continuous_pos = 0
        temp_trail = 0

        # Greedy evaluation episode (no exploration noise) every 10 episodes.
        if i_episode % 10 == 0 and i_episode != 0:
            state = torch.Tensor([self.env.reset()])
            episode_reward = 0
            control_bit = 0
            while True:
                action = self.agent.select_action(state)
                next_state, reward, done, bb = self.env.step(
                    action.numpy()[0], control_bit)
                control_bit = (control_bit + 1) % 16  # advance the bit index, as in training
                episode_reward += reward
                if best_bb > bb:
                    best_bb = bb
                    self.new_b = action
                if reward > 0:
                    continuous_pos += 1
                    continuous_neg = 0
                    if continuous_pos > 10:
                        done = True
                if reward < 0:
                    continuous_neg += 1
                    continuous_pos = 0
                    if continuous_neg > 10:
                        done = True
                if temp_trail > max_trail:
                    done = True
                next_state = torch.Tensor([next_state])
                state = next_state
                temp_trail += 1
                if done:
                    break
            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                  .format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    return self.new_b
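generate_B pushes transitions through self.memory and later reads self.memory.memory and self.memory.position directly, so it assumes a Transition namedtuple and a ring-buffer replay memory. Below is a minimal sketch consistent with those accesses; the field order is taken from the push() calls above, and the exact implementation in the original repository may differ.

import random
from collections import namedtuple

# Field order matches memory.push(state, action, mask, next_state, reward) above.
Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []    # read directly by the param-noise adaptation code
        self.position = 0   # next write index into the ring buffer

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)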
# Excerpt: tail of the per-episode training loop in main(). The enclosing
# `for i_episode ...` / `while True:` structure is not shown; `t` is the
# number of steps taken in the current episode.
                writer.add_scalar('loss/policy', policy_loss, updates)
                updates += 1

        if done:
            break

    writer.add_scalar('reward/train', episode_reward, i_episode)

    # Update param_noise based on distance metric
    if args.param_noise:
        episode_transitions = memory.memory[memory.position - t:memory.position]
        states = torch.cat([transition[0] for transition in episode_transitions], 0)
        unperturbed_actions = agent.select_action(states, None, None)
        perturbed_actions = torch.cat([transition[1] for transition in episode_transitions], 0)
        ddpg_dist = ddpg_distance_metric(perturbed_actions.numpy(),
                                         unperturbed_actions.numpy())
        param_noise.adapt(ddpg_dist)

    rewards.append(episode_reward)

    # Greedy evaluation rollout (no exploration noise) every 10 episodes.
    if i_episode % 10 == 0:
        state = torch.Tensor([env.reset()])
        episode_reward = 0
        while True:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action.numpy()[0])
            episode_reward += reward
            next_state = torch.Tensor([next_state])
            state = next_state
            if done:
                break
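The adaptation step above reduces two batches of actions to a single scalar via ddpg_distance_metric, then compares it against the desired action-space stddev. A minimal sketch of that helper, following the RMS-distance definition used by the OpenAI baselines parameter-noise code (assumed here; the helper itself is not part of this excerpt):

import numpy as np

def ddpg_distance_metric(actions1, actions2):
    # Root-mean-square difference between perturbed and unperturbed actions;
    # the result has the same units as an action-space stddev.
    diff = actions1 - actions2
    return np.sqrt(np.mean(np.square(diff)))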
def main():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    vehicle_memory = ReplayMemory(1000000)
    attacker_memory = ReplayMemory(1000000)

    vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    total_numsteps = 0
    updates = 0

    for i_episode in range(args.num_episodes):
        state = torch.Tensor([[env.reset()]])  # 4-dimensional velocity observation

        if args.ou_noise:
            vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) \
                * max(0, args.exploration_end - i_episode) / args.exploration_end \
                + args.final_noise_scale
            vehicle_ounoise.reset()
            attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) \
                * max(0, args.exploration_end - i_episode) / args.exploration_end \
                + args.final_noise_scale
            attacker_ounoise.reset()

        episode_reward = 0
        t = 0  # steps taken in this episode, used by the param-noise slice below
        while True:
            action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
            action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)
            next_state, reward, done = env.step(action_vehicle.numpy()[0],
                                                action_attacker.numpy()[0])
            total_numsteps += 1
            t += 1
            episode_reward += reward

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            # Zero-sum split: the vehicle gets the negated reward, the attacker
            # gets the attack cost env.RC plus the reward.
            reward_vehicle = torch.Tensor([-reward])
            reward_attacker = torch.Tensor([env.RC + reward])

            vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle)
            attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker)

            state = next_state

            if len(vehicle_memory) > args.batch_size:
                for _ in range(args.updates_per_step):
                    transitions_vehicle = vehicle_memory.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = attacker_memory.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            if done:
                break

        # writer.add_scalar('reward/train', episode_reward, i_episode)

        # Update param_noise based on distance metric, over this episode's transitions.
        if args.param_noise:
            episode_transitions_vehicle = vehicle_memory.memory[vehicle_memory.position - t:vehicle_memory.position]
            states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0)
            unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None)
            perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0)
            ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(),
                                                     unperturbed_actions_vehicle.numpy())
            param_noise_vehicle.adapt(ddpg_dist_vehicle)

            episode_transitions_attacker = attacker_memory.memory[attacker_memory.position - t:attacker_memory.position]
            states_attacker = torch.cat([transition[0] for transition in episode_transitions_attacker], 0)
            unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None)
            perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0)
            ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(),
                                                      unperturbed_actions_attacker.numpy())
            param_noise_attacker.adapt(ddpg_dist_attacker)

        rewards.append(episode_reward)

        # Evaluation rollout every 10 episodes.
        if i_episode % 10 == 0:
            state = torch.Tensor([[env.reset()]])
            episode_reward = 0
            while True:
                action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)
                next_state, reward, done = env.step(action_vehicle.numpy()[0],
                                                    action_attacker.numpy()[0])
                episode_reward += reward
                next_state = torch.Tensor([[next_state]])
                state = next_state
                if done:
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    env.close()
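Both training loops construct AdaptiveParamNoiseSpec with initial_stddev, desired_action_stddev, and adaptation_coefficient, and call adapt(distance) once per episode. A minimal sketch of that class, assuming the multiplicative update rule from Plappert et al.'s parameter-space-noise scheme; the attribute name current_stddev is illustrative, not confirmed by this excerpt.

class AdaptiveParamNoiseSpec(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev  # stddev used to perturb actor weights

    def adapt(self, distance):
        # Shrink the perturbation when the induced action distance overshoots
        # the target stddev; grow it otherwise.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient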