def collect_data():
    # Roll out the PPO mixing policy and record (x1, x2, clipped control action) tuples.
    env = Osillator()
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), False)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
                control_action = ca1 * action[0] + ca2 * action[1]
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                np.clip(control_action.cpu().data.numpy()[0], -1, 1)
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
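# model_1 and model_2, the two pre-trained low-level controllers blended above, are loaded
# elsewhere in the repository. The class below is only a minimal sketch of what such a
# controller network could look like; the name ControlNet, the hidden width, and the tanh
# output layer are assumptions for illustration, not the repository's actual architecture.
import torch
import torch.nn as nn


class ControlNet(nn.Module):
    """Hypothetical 2-D state -> 1-D control action network (sketch)."""

    def __init__(self, state_dim=2, action_dim=1, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, action_dim), nn.Tanh(),  # keep the action roughly in [-1, 1]
        )

    def forward(self, x):
        return self.net(x)


# Hypothetical loading of the two controllers (checkpoint paths are made up):
# model_1 = ControlNet().to(device); model_1.load_state_dict(torch.load('controller_1.pth'))
# model_2 = ControlNet().to(device); model_2.load_state_dict(torch.load('controller_2.pth'))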
def train_weight_adapter_DDPG(EP_NUM=2000):
    # Train a DDPG agent whose 2-D action softly mixes the two controllers.
    mkdir('./adapter_soft')
    env = Osillator()
    scores_deque = deque(maxlen=100)
    scores = []
    for ep in range(EP_NUM):
        state = env.reset()
        agent.reset()
        score = 0
        for t in range(200):
            action = agent.act(state)
            ca1 = model_1(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
            ca2 = model_2(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
            control_action = action[0] * ca1 + action[1] * ca2
            next_state, _, done = env.step(control_action, smoothness=0.5)
            # reward: survival bonus minus weighted fuel cost minus distance from the origin
            reward = 5
            reward -= weight * abs(control_action) * 20
            reward -= 1 / weight * (abs(next_state[0]) + abs(next_state[1]))
            if done and t < 95:
                reward -= 100
            agent.step(state, action, reward, next_state, done, t)
            score += reward
            state = next_state
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        score_average = np.mean(scores_deque)
        if ep % 1 == 0:  # print every episode
            print('\rEpisode {}, Average Score: {:.2f}, Current Score: {:.2f}, Max: {:.2f}, '
                  'Min: {:.2f}, Epsilon: {:.2f}, Memory: {:.1f}'.format(
                      ep, score_average, scores[-1], np.max(scores), np.min(scores),
                      agent.epsilon, len(agent.memory)), end="\n")
        if ep > 0 and ep % 100 == 0:
            torch.save(agent.actor_local.state_dict(),
                       './adapter_soft/adapter_' + str(ep) + '_' + str(weight) + '.pth')
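# Weight_adapter is defined elsewhere in the repository; it is loaded below with the state
# dict of the DDPG actor saved above, so its architecture must match that actor. The class
# below is only a minimal sketch under that assumption: a small MLP mapping the 2-D state to
# two mixing weights. The hidden width and the tanh output are guesses for illustration.
import torch
import torch.nn as nn


class Weight_adapter(nn.Module):
    """Hypothetical state -> two mixing weights network (sketch)."""

    def __init__(self, state_dim=2, num_controllers=2, hidden=32):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, num_controllers), nn.Tanh(),  # one bounded weight per controller
        )

    def forward(self, x):
        return self.body(x)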
def collect_data(adapter_name, INDI_NAME):
    # Roll out a trained Weight_adapter (soft mixing) and record (x1, x2, control action) tuples.
    assert EXP1 == True
    env = Osillator()
    model = Weight_adapter(2, 2).to(device)
    model.load_state_dict(torch.load(adapter_name))
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model(state).cpu().data.numpy()
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
                control_action = ca1 * action[0] + ca2 * action[1]
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                control_action.cpu().data.numpy()[0]
            ])
            state = next_state
            if done:
                break
        print(ep_loss, t)
    return np.array(data_set)
def train():
    env = Osillator()
    state_dim = 2
    action_dim = 2

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = gene_u(s, a, model_1, model_2)
            s_, _, done = env.step(u)
            # reward: survival bonus minus weighted fuel cost minus distance from the origin
            r = 10
            r -= WEIGHT * abs(np.clip(u, -1, 1)) * 20
            r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t < 95:
                r -= 100
            # useful for pendulum since the nets are very small; normalization makes it easier to learn
            ppo.store_transition(s, a, r)
            s = s_
            ep_r += r
            # update PPO once a full batch of transitions has been collected
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
            ep + 1, EP_MAX, ep_r, time.time() - t0))
        # exponentially smoothed episode reward for plotting
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
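# gene_u is called above and in the evaluation block at the end of this file, but its
# definition lives elsewhere in the repository. Based on how the other training loops blend
# the two distilled controllers, a plausible sketch is the weighted sum below; it assumes the
# module-level torch import and the device used throughout this file, and the real helper may
# differ (for example in clipping or device handling).
def gene_u(state, action, model_1, model_2):
    """Hypothetical helper: blend the two controllers using the PPO action as weights."""
    s = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        ca1 = model_1(s).cpu().data.numpy()[0]
        ca2 = model_2(s).cpu().data.numpy()[0]
    return action[0] * ca1 + action[1] * ca2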
def train_switcher_DDQN():
    # Train a double DQN that switches between the two controllers (hard selection).
    mkdir('./adapter_ab')
    env = Osillator()
    model = DQN(2, 2).to(device)
    target_model = DQN(2, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)
    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()[0]
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()[0]
                else:
                    assert False
                    control_action = 0
            next_state, _, done = env.step(control_action)
            # reward: survival bonus minus weighted fuel cost, with a penalty for early failure
            reward = 2
            reward -= weight * abs(control_action) * 20
            if done and t < 190:
                reward -= 100
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state, done)
            fuel_list.append(abs(control_action) * 20)
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size, optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:', np.mean(ep_reward),
              'fuel cost:', sum(fuel_list[-t - 1:]), 'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(model.state_dict(),
                       './adapter_ab/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
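# The DQN switcher class, the exploration schedule epsilon_by_frame, and the target-network
# update used above are defined elsewhere in the repository. The sketches below show one
# common way to write them; the architecture, hidden width, and decay constants are
# assumptions for illustration, not the repository's exact implementation.
import math
import random
import torch
import torch.nn as nn


class DQN(nn.Module):
    """Hypothetical Q-network over the 2-D state, one Q-value per low-level controller."""

    def __init__(self, state_dim=2, num_actions=2, hidden=64):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, num_actions),
        )

    def forward(self, x):
        return self.layers(x)

    def act(self, state, epsilon):
        # epsilon-greedy choice between the two controllers
        if random.random() > epsilon:
            with torch.no_grad():
                return int(self.forward(state).argmax().item())
        return random.randrange(self.layers[-1].out_features)


epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 500  # assumed schedule


def epsilon_by_frame(frame_idx):
    """Exponentially decay exploration from epsilon_start towards epsilon_final."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)


def update_target(model, target_model):
    """Hard copy of the online network's weights into the target network."""
    target_model.load_state_dict(model.state_dict())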
def distill(adapter_name, INDI_NAME):
    # Distill the DQN switcher plus the two controllers into the single network `Individual`
    # by regressing its output onto the switched control action along on-policy trajectories.
    optimizer = torch.optim.SGD(Individual.parameters(), lr=0.001, momentum=0.9)
    loss_func = torch.nn.MSELoss()
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    model.load_state_dict(torch.load(adapter_name))
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model.act(state, epsilon=0)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state)
                elif action == 1:
                    control_action = model_2(state)
            control_action.requires_grad = False
            prediction = Individual(state)
            loss = loss_func(prediction, control_action)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ep_loss += loss.item()
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            state = next_state
            if done:
                break
        print(ep_loss)
    torch.save(Individual.state_dict(), INDI_NAME)
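# Individual, the distilled controller trained above, is instantiated elsewhere in the
# repository. A hypothetical instantiation and call, reusing the ControlNet sketch earlier in
# this file; both checkpoint paths below are made up for illustration only.
Individual = ControlNet(state_dim=2, action_dim=1).to(device)
distill('./adapter_ab/ddqn_2000_0.5.pth', './individual_distilled.pth')  # hypothetical paths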
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    # Evaluate the composition strategy selected by `mode` over EP_NUM episodes; record episode
    # rewards, fuel cost, the first-episode trajectory, and the safe/unsafe initial states.
    print(mode)
    env = Osillator()
    EP_NUM = 500
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
            elif mode == 'ppo':
                action = ppo.choose_action(state.cpu().data.numpy(), True)
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
                if ep == 0:
                    print(state, control_action)
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                if ATTACK:
                    delta, original = fgsm(Individual, state)
                    # delta = torch.from_numpy(np.random.uniform(low=-SCALE, high=SCALE, size=state.shape)).float().to(device)
                    control_action = Individual(state + delta).cpu().data.numpy()[0]
                else:
                    control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
    safe = np.array(safe)
    unsafe = np.array(unsafe)
    np.save('./plot/' + mode + '_safe.npy', safe)
    np.save('./plot/' + mode + '_unsafe.npy', unsafe)
    return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
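# fgsm is used in the 'individual' branch above when ATTACK is enabled, but it is defined
# elsewhere. A minimal FGSM-style sketch, assuming the attack nudges the state in the
# direction that most increases the magnitude of the distilled controller's output; the step
# size reuses the SCALE global referenced in the commented-out random perturbation, and the
# repository's actual attack objective may differ.
def fgsm(model, state, scale=SCALE):
    """Hypothetical fast gradient sign perturbation of the input state (sketch)."""
    state = state.clone().detach().requires_grad_(True)
    original = model(state)
    # surrogate objective: magnitude of the predicted control action
    original.abs().sum().backward()
    delta = scale * state.grad.sign()
    return delta.detach(), original.detach()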
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    print(mode)
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    if mode == 'switch':
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            # flag = where_inv(state.cpu().numpy())
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
                if ep == 0:
                    print(t, state, action, control_action * 20)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
        else:
            print(ep, state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    return ep_reward, np.array(fuel_list), state_list
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    print(mode)
    env = Osillator()
    EP_NUM = 1
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            # attack happens here
            # state += np.random.uniform(low=-0.35, high=0.35, size=state.shape)
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
            elif mode == 'weight':
                action = model(state).cpu().data.numpy()
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                # delta, original = fgsm(Individual, state)
                # if ep == 0:
                #     print(delta, original)
                # control_action = Individual(state+delta).cpu().data.numpy()[0]
                control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    # safe = np.array(safe)
    # unsafe = np.array(unsafe)
    # plt.figure()
    # plt.scatter(safe[:, 0], safe[:, 1], c='green')
    # plt.scatter(unsafe[:, 0], unsafe[:, 1], c='red')
    # plt.savefig('./safe_sample_plot/' + mode + '.png')
    return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
# thread = threading.Thread(target=train)
# thread.daemon = True
# thread.start()
# if PLOT_RESULT:
#     drawer = Drawer()
#     drawer.plot()
#     drawer.save()
# thread.join()

train()
assert False  # stop here; remove this line to run the PPO evaluation below

# test
env = Osillator()
state_dim = 2
action_dim = 2
ppo = PPO(state_dim, action_dim, method=METHOD)
ppo.load_model()
mean_epoch_reward = 0
for _ in range(TEST_EP):
    state = env.reset()
    for i in range(EP_LEN):
        if RENDER:
            env.render()
        action = ppo.choose_action(state, True)
        u = gene_u(state, action, model_1, model_2)
        next_state, reward, done = env.step(u)
        mean_epoch_reward += reward
        state = next_state
        if done:
            break
print(mean_epoch_reward / TEST_EP)
env.close()